diff --git a/packages/kokkos/.clang-format b/packages/kokkos/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..db5f94fa2ebb6f3d343ff9ce86507229c2b990f9 --- /dev/null +++ b/packages/kokkos/.clang-format @@ -0,0 +1,6 @@ +#Official Tool: clang-format version 8.0.0 +BasedOnStyle: google +SortIncludes: false +AlignConsecutiveAssignments: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortIfStatementsOnASingleLine: true diff --git a/packages/kokkos/.clang-format-ignore b/packages/kokkos/.clang-format-ignore new file mode 100644 index 0000000000000000000000000000000000000000..b163a2bfeaf4e865925fb96f46260fd3ec2cfcb9 --- /dev/null +++ b/packages/kokkos/.clang-format-ignore @@ -0,0 +1,2 @@ +core/unit_test/config/results/* +tpls/gtest/gtest/* diff --git a/packages/kokkos/.clang-tidy b/packages/kokkos/.clang-tidy new file mode 100644 index 0000000000000000000000000000000000000000..207a105c5bdf60b807db528d612aac89e6bb88b6 --- /dev/null +++ b/packages/kokkos/.clang-tidy @@ -0,0 +1,3 @@ +Checks: '-*,kokkos-*,modernize-use-using,modernize-use-nullptr' +FormatStyle: file +HeaderFilterRegex: '.*/*.hpp' diff --git a/packages/kokkos/.codecov.yml b/packages/kokkos/.codecov.yml new file mode 100644 index 0000000000000000000000000000000000000000..097b0264a272ece51c38932b6f2486f75234f040 --- /dev/null +++ b/packages/kokkos/.codecov.yml @@ -0,0 +1,11 @@ +coverage: + precision: 1 + round: down + range: "70...100" +ignore: + - tpls/ + - algorithms/unit_tests + - core/perf_test/ + - core/unit_test/ + - containers/performance_tests + - containers/unit_tests diff --git a/packages/kokkos/.github/workflows/cancelling.yml b/packages/kokkos/.github/workflows/cancelling.yml new file mode 100644 index 0000000000000000000000000000000000000000..fa30adf956e1c272c1b8d29d131f225b1ff94919 --- /dev/null +++ b/packages/kokkos/.github/workflows/cancelling.yml @@ -0,0 +1,20 @@ +name: cancel-builds-on-update +on: + workflow_run: + workflows: ['github-Linux', 'github-OSX'] 
+ types: ['requested'] + +jobs: + cancel-duplicate-workflow-runs: + name: "Cancel duplicate workflow runs" + runs-on: ubuntu-latest + steps: + - uses: potiuk/cancel-workflow-runs@master + name: "Cancel duplicate workflow runs" + with: + cancelMode: duplicates + cancelFutureDuplicates: true + token: ${{ secrets.GITHUB_TOKEN }} + sourceRunId: ${{ github.event.workflow_run.id }} + notifyPRCancel: true + skipEventTypes: '["push", "schedule"]' diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml new file mode 100644 index 0000000000000000000000000000000000000000..0e5f523ccf77014b18a034659b450f7036901747 --- /dev/null +++ b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml @@ -0,0 +1,72 @@ +name: github-Linux +on: [push, pull_request] + +jobs: + CI: + continue-on-error: true + strategy: + matrix: + distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest'] + cxx: ['g++', 'clang++'] + cmake_build_type: ['Release', 'Debug'] + openmp: ['ON'] + include: + - distro: 'fedora:intel' + cxx: 'icpc' + cmake_build_type: 'Release' + openmp: 'ON' + - distro: 'fedora:intel' + cxx: 'icpc' + cmake_build_type: 'Debug' + openmp: 'ON' + - distro: 'fedora:intel-oneapi' + cxx: 'icpx' + cmake_build_type: 'Release' + openmp: 'ON' + - distro: 'fedora:intel-oneapi' + cxx: 'icpx' + cmake_build_type: 'Debug' + openmp: 'ON' + runs-on: ubuntu-latest + container: ghcr.io/kokkos/ci-containers/${{ matrix.distro }} + steps: + - name: Checkout code + uses: actions/checkout@v2.2.0 + - uses: actions/cache@v2 + with: + path: ~/.ccache + key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${github.ref}-${{ github.sha }} + restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{github.ref}} + - name: Get trial license + if: ${{ matrix.cxx == 'icpc' }} + run: | + mkdir ~/Licenses 
+ curl https://dynamicinstaller.intel.com/api/v2/license > ~/Licenses/intel.lic + - name: maybe_disable_death_tests + if: ${{ matrix.distro == 'fedora:rawhide' }} + run: echo "GTEST_FILTER=-*DeathTest*" >> $GITHUB_ENV + - name: build-and-test + run: | + ccache -z + cmake \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DKokkos_ENABLE_HWLOC=ON \ + -DKokkos_ENABLE_OPENMP=${{ matrix.openmp }} \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_EXAMPLES=ON \ + -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ + -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ + -DBUILD_NAME=${{ matrix.distro }}-${{ matrix.cxx }} \ + -DBUILD_JOBS=2 -DBINARY_DIR=builddir -DSITE=GitHub-Linux \ + -P cmake/KokkosCI.cmake + ccache -s + - name: Test DESTDIR Install + run: DESTDIR=${PWD}/install cmake --build builddir --target install && rm -rf ${PWD}/install/usr && rmdir ${PWD}/install + - name: Install + run: sudo cmake --build builddir --target install + - name: Test install + working-directory: example/build_cmake_installed + run: | + cmake -B builddir -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} + cmake --build builddir + cmake --build builddir --target test diff --git a/packages/kokkos/.github/workflows/osx.yml b/packages/kokkos/.github/workflows/osx.yml new file mode 100644 index 0000000000000000000000000000000000000000..855b557c829a609f34b82c7e5f307eef60cf0ede --- /dev/null +++ b/packages/kokkos/.github/workflows/osx.yml @@ -0,0 +1,35 @@ +name: github-OSX + +on: [push, pull_request] + +jobs: + osxci: + name: osx-ci + runs-on: [macos-latest] + + strategy: + matrix: + include: + - backend: "SERIAL" + cmake_build_type: "RelWithDebInfo" + - backend: "PTHREAD" + cmake_build_type: "RelWithDebInfo" + - backend: "SERIAL" + cmake_build_type: "Debug" + - backend: "SERIAL" + cmake_build_type: "Release" + + steps: + - uses: actions/checkout@v2 + - name: build-and-test + run: + cmake + -DKokkos_ENABLE_${{ matrix.backend }}=On + -DCMAKE_CXX_FLAGS="-Werror" + -DCMAKE_CXX_STANDARD=14 + -DKokkos_ENABLE_COMPILER_WARNINGS=ON + 
-DKokkos_ENABLE_TESTS=On + -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} + -DBUILD_NAME=macOS-${{ matrix.backend }} + -DTARGET=install -DBUILD_JOBS=2 -DSITE=GitHub-OSX + -P cmake/KokkosCI.cmake diff --git a/packages/kokkos/.gitignore b/packages/kokkos/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..eb2257762bdbc1a0536bb04ef935d94387a5578d --- /dev/null +++ b/packages/kokkos/.gitignore @@ -0,0 +1,23 @@ +# Standard ignores +*~ +*.pyc +\#*# +.#* +.*.swp +.cproject +.project +testing/ +.settings/ +/.vs +/out/build +/CMakeSettings.json +/out/mytest + +# build directories in source tree +/build* + +# IDE-specific files/folders +## VSCode +/.vscode +## QtCreator +/CMakeLists.txt.user* diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo new file mode 100644 index 0000000000000000000000000000000000000000..6dd4101e5bdf1210d26ef2ff0a34f557416c532b --- /dev/null +++ b/packages/kokkos/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. 
See https://github.com/git-commands/git-subrepo#readme +; +[subrepo] + remote = git@github.com:kokkos/kokkos.git + branch = master + commit = 4b97a22ff7be7635116930bb97173058d6079202 + parent = f2fc77ba9037b2a2032ab980fb445175441f6d1f + method = merge + cmdver = 0.4.3 diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins new file mode 100644 index 0000000000000000000000000000000000000000..001171d648e7cfb2236d17439720562707faaab4 --- /dev/null +++ b/packages/kokkos/.jenkins @@ -0,0 +1,362 @@ +pipeline { + agent none + + environment { + CCACHE_DIR = '/tmp/ccache' + CCACHE_MAXSIZE = '10G' + CCACHE_CPP2 = 'true' + BUILD_JOBS = 8 + SITE = 'Jenkins' + } + stages { + stage('Clang-Format') { + agent { + dockerfile { + filename 'Dockerfile.clang' + dir 'scripts/docker' + label 'nvidia-docker || docker' + args '-v /tmp/ccache.kokkos:/tmp/ccache' + } + } + steps { + sh './scripts/docker/check_format_cpp.sh' + } + } + + stage('Build') { + parallel { + stage('SYCL-OneAPI') { + agent { + dockerfile { + filename 'Dockerfile.sycl' + dir 'scripts/docker' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unknown-cuda-version -Wno-gnu-zero-variadic-macro-arguments" \ + -DKokkos_ARCH_VOLTA70=ON \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_EXAMPLES=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_SYCL=ON \ + -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' + } + post { + always { + sh 'ccache --show-stats' + } + } + } + stage('HIP-ROCm-3.8-C++14') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:3.8' + label 'rocm-docker && vega' + args '-v 
/tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + environment { + OMP_NUM_THREADS = 8 + OMP_PLACES = 'threads' + OMP_PROC_BIND = 'spread' + LC_ALL = 'C' + } + steps { + sh 'ccache --zero-stats' + sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig' + sh '''rm -rf build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \ + -DCMAKE_CXX_STANDARD=14 \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkos_ARCH_VEGA906=ON \ + -DKokkos_ENABLE_OPENMP=ON \ + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' + } + post { + always { + sh 'ccache --show-stats' + } + } + } + stage('HIP-ROCm-3.8-C++17') { + agent { + dockerfile { + filename 'Dockerfile.hipcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:3.8' + label 'rocm-docker && vega' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + environment { + LC_ALL = 'C' + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkos_ARCH_VEGA906=ON \ + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' + } + post { + always { + sh 'ccache --show-stats' + } + } + } + stage('OPENMPTARGET-Clang') { + agent { + dockerfile { + filename 'Dockerfile.openmptarget' + dir 'scripts/docker' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env 
NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_FLAGS="-Wno-unknown-cuda-version -Werror -Wno-undefined-internal -Wno-pass-failed" \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_TUNING=ON \ + -DKokkos_ENABLE_OPENMPTARGET=ON \ + -DKokkos_ARCH_VOLTA70=ON \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' + } + post { + always { + sh 'ccache --show-stats' + } + } + } + stage('CUDA-10.1-Clang-Tidy') { + agent { + dockerfile { + filename 'Dockerfile.kokkosllvmproject' + dir 'scripts/docker' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*" \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=14 \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=ON \ + -DKokkos_ENABLE_TUNING=ON \ + -DKokkos_ARCH_VOLTA70=ON \ + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake''' + } + post { + always { + sh 'ccache --show-stats' + } + } + } + stage('CUDA-9.2-NVCC') { + agent { + dockerfile { + filename 'Dockerfile.nvcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=nvidia/cuda:9.2-devel' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && mkdir -p build && cd build && \ + ../gnu_generate_makefile.bash \ + --with-options=compiler_warnings \ + --cxxflags="-Werror" \ + 
--cxxstandard=c++14 \ + --with-cuda \ + --with-cuda-options=enable_lambda \ + --arch=Volta70 \ + .. && \ + make test -j8''' + } + post { + always { + sh 'ccache --show-stats' + } + } + } + stage('CUDA-11.0-NVCC-C++17-RDC') { + agent { + dockerfile { + filename 'Dockerfile.nvcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0-devel --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' + } + } + environment { + OMP_NUM_THREADS = 8 + OMP_PLACES = 'threads' + OMP_PROC_BIND = 'spread' + NVCC_WRAPPER_DEFAULT_COMPILER = 'g++-8' + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf install && mkdir -p install && \ + rm -rf build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=g++-8 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ + -DKokkos_ENABLE_CUDA_UVM=ON \ + -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ + -DCMAKE_INSTALL_PREFIX=${PWD}/install \ + -DBUILD_NAME=${STAGE_NAME} \ + -DTARGET=install \ + -P cmake/KokkosCI.cmake && \ + rm -rf build-tests && mkdir -p build-tests && cd build-tests && \ + export CMAKE_PREFIX_PATH=${PWD}/../install && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_INSTALL_TESTING=ON \ + .. && \ + make -j8 && ctest --verbose && \ + cd ../example/build_cmake_installed && \ + rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_CXX_COMPILER=g++-8 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=17 \ + .. && \ + make -j8 && ctest --verbose && \ + cd ../.. 
&& \ + cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \ + cmake --build build_cmake_installed_different_compiler/build --target all && \ + cmake --build build_cmake_installed_different_compiler/build --target test''' + } + post { + always { + sh 'ccache --show-stats' + } + } + } + stage('CUDA-10.1-NVCC-DEBUG') { + agent { + dockerfile { + filename 'Dockerfile.nvcc' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=nvidia/cuda:10.1-devel' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' + } + } + steps { + sh 'ccache --zero-stats' + sh '''rm -rf build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DCMAKE_CXX_STANDARD=14 \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEBUG=ON \ + -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=ON \ + -DKokkos_ENABLE_LIBDL=OFF \ + -DBUILD_NAME=${STAGE_NAME} \ + -DTARGET=install \ + -P cmake/KokkosCI.cmake && \ + cd example/build_cmake_in_tree && \ + rm -rf build && mkdir -p build && cd build && \ + cmake -DCMAKE_CXX_STANDARD=14 .. 
&& make -j8 && ctest --verbose''' + } + post { + always { + sh 'ccache --show-stats' + } + } + } + stage('GCC-5.3.0') { + agent { + dockerfile { + filename 'Dockerfile.gcc' + dir 'scripts/docker' + label 'docker' + } + } + environment { + OMP_NUM_THREADS = 8 + OMP_PROC_BIND = 'true' + } + steps { + sh '''rm -rf build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=14 \ + -DCMAKE_CXX_FLAGS=-Werror \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_TESTS=ON \ + -DKokkos_ENABLE_OPENMP=ON \ + -DKokkos_ENABLE_LIBDL=OFF \ + -DBUILD_NAME=${STAGE_NAME} \ + -P cmake/KokkosCI.cmake && \ + gcc -I$PWD/core/src core/unit_test/tools/TestCInterface.c''' + } + } + } + } + } +} diff --git a/packages/kokkos/.travis.yml b/packages/kokkos/.travis.yml new file mode 100644 index 0000000000000000000000000000000000000000..04ef01c1602cf87aae3e39225037d65f49651f62 --- /dev/null +++ b/packages/kokkos/.travis.yml @@ -0,0 +1,109 @@ +sudo: false + +language: cpp + +os: + - linux + +compiler: + - gcc + - clang + +cache: + - ccache + +stages: + - canary + - test + +jobs: + include: + - stage: canary + env: CMAKE_BUILD_TYPE=Release BACKEND="OPENMP" + os: linux + +branches: + only: + - master + - develop + - /^release/ + +env: + - +# - BACKEND="OPENMP" + - BACKEND="PTHREAD" + - CMAKE_BUILD_TYPE=Debug COVERAGE=yes GTEST_FILTER="-*DeathTest*" + - CMAKE_BUILD_TYPE=Debug BACKEND="OPENMP" COVERAGE=yes GTEST_FILTER="-*DeathTest*" +# - CMAKE_BUILD_TYPE=Debug BACKEND="PTHREAD" COVERAGE=yes + - CMAKE_BUILD_TYPE=Release + - CMAKE_BUILD_TYPE=Release BACKEND="OPENMP" +# - CMAKE_BUILD_TYPE=Release BACKEND="PTHREAD" + +matrix: + exclude: + - os: linux + compiler: gcc + env: CMAKE_BUILD_TYPE=Release BACKEND="OPENMP" + +# Install newer CMake. 
The distribution comes with CMake 3.12.4 but we require at least 3.16 +install: + - CMAKE_VERSION=3.17.1 + - CMAKE_DIR=/opt/cmake + - CMAKE_KEY=2D2CEF1034921684 && + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && + #gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && + #gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && + #grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && + mkdir -p ${CMAKE_DIR} && + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && + rm cmake* + - PATH=${CMAKE_DIR}/bin:$PATH + - cd ${TRAVIS_BUILD_DIR} + +before_script: + - ccache -z + - if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; export BUILD_NAME_SUFFIX="-Coverage"; fi + - if [[ ! ${CMAKE_BUILD_TYPE} ]]; then export CXXFLAGS="${CXXFLAGS} -O2"; fi + +script: + - export OMP_NUM_THREADS=2 + - export OMP_PLACES=threads + - export OMP_PROC_BIND=spread + - export BUILD_JOBS=2 + # LD_LIBRARY_PATH workaround to find clang's libomp: https://github.com/travis-ci/travis-ci/issues/8613 + - if [[ ${CC} = clang ]]; then export LD_LIBRARY_PATH=/usr/local/clang/lib${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH; fi + # enable ccache for clang on linux and add CCACHE_CPP2 to avoid 'Argument unused during compilation -I...' 
warning + - if [[ ${TRAVIS_OS_NAME} = linux && ${CC} = clang ]]; then + ln -s /usr/bin/ccache $HOME/bin/clang++; + export CCACHE_CPP2=yes; + fi + - cmake + ${BACKEND:+-DKokkos_ENABLE_${BACKEND}=On} + -DCMAKE_CXX_FLAGS="${CXXFLAGS} -Werror" + -DCMAKE_CXX_STANDARD=14 + -DKokkos_ENABLE_COMPILER_WARNINGS=ON + -DKokkos_ENABLE_TESTS=On + ${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}} + -DBUILD_NAME="${CC}-${BACKEND}${BUILD_NAME_SUFFIX}" + -DSITE=Travis + -P cmake/KokkosCI.cmake && + pushd build && + make install DESTDIR=${PWD}/install && rm -rf ${PWD}/install/usr/local && rmdir ${PWD}/install/usr && + popd + +after_success: + - ccache -s + - if [[ ${COVERAGE} ]]; then + mkdir -p $HOME/.local/bin && wget -O $HOME/.local/bin/codecov https://codecov.io/bash && chmod +x $HOME/.local/bin/codecov; + pushd build && + if [[ ${CC} = clang* ]]; then + codecov -x "llvm-cov gcov" -F "${CC}"; + else + codecov -x gcov -F "${CC}"; + fi; + fi diff --git a/packages/kokkos/BUILD.md b/packages/kokkos/BUILD.md new file mode 100644 index 0000000000000000000000000000000000000000..e1f0e3e472762fa7c78a68350da9e9bc74f41de1 --- /dev/null +++ b/packages/kokkos/BUILD.md @@ -0,0 +1,334 @@ + + +# Installing and Using Kokkos + +## Kokkos Philosophy +Kokkos provides a modern CMake style build system. +As C++ continues to develop for C++20 and beyond, CMake is likely to provide the most robust support +for C++. Applications heavily leveraging Kokkos are strongly encouraged to use a CMake build system. + +You can either use Kokkos as an installed package (encouraged) or use Kokkos in-tree in your project. +Modern CMake is exceedingly simple at a high-level (with the devil in the details). +Once Kokkos is installed In your `CMakeLists.txt` simply use: +````cmake +find_package(Kokkos REQUIRED) +```` +Then for every executable or library in your project: +````cmake +target_link_libraries(myTarget Kokkos::kokkos) +```` +That's it! 
There is no checking Kokkos preprocessor, compiler, or linker flags. +Kokkos propagates all the necessary flags to your project. +This means not only is linking to Kokkos easy, but Kokkos itself can actually configure compiler and linker flags for *your* +project. +When configuring your project just set: +````bash +> cmake ${srcdir} \ + -DKokkos_ROOT=${kokkos_install_prefix} \ + -DCMAKE_CXX_COMPILER=${compiler_used_to_build_kokkos} +```` +Note: You may need the following if using some versions of CMake (e.g. 3.12): +````cmake +cmake_policy(SET CMP0074 NEW) +```` +If building in-tree, there is no `find_package`. You can use `add_subdirectory(kokkos)` with the Kokkos source and again just link with `target_link_libraries(Kokkos::kokkos)`. +The examples in `examples/cmake_build_installed` and `examples/cmake_build_in_tree` can help get you started. + + +## Configuring CMake +A very basic installation of Kokkos is done with: +````bash +> cmake ${srcdir} \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_INSTALL_PREFIX=${kokkos_install_folder} +```` +which builds and installs a default Kokkos when you run `make install`. +There are numerous device backends, options, and architecture-specific optimizations that can be configured, e.g. +````bash +> cmake ${srcdir} \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_INSTALL_PREFIX=${kokkos_install_folder} \ + -DKokkos_ENABLE_OPENMP=ON +```` +which activates the OpenMP backend. All of the options controlling device backends, options, architectures, and third-party libraries (TPLs) are given below. + +## Known Issues<a name="KnownIssues"></a> + +### Cray + +* The Cray compiler wrappers do static linking by default. This seems to break the Kokkos build. You will likely need to set the environment variable `CRAYPE_LINK_TYPE=dynamic` in order to link correctly. Kokkos warns during configure if this is missing. +* The Cray compiler identifies to CMake as Clang, but it sometimes has its own flags that differ from Clang. 
We try to include all exceptions, but flag errors may occur in which a Clang-specific flag is passed that the Cray compiler does not recognize. + +### Fortran + +* In a mixed C++/Fortran code, CMake will use the C++ linker by default. If you override this behavior and use Fortran as the link language, the link may break because Kokkos adds linker flags expecting the linker to be C++. Prior to CMake 3.18, Kokkos has no way of detecting in downstream projects that the linker was changed to Fortran. From CMake 3.18, Kokkos can use generator expressions to avoid adding flags when the linker is not C++. Note: Kokkos will not add any linker flags in this Fortran case. The user will be entirely on their own to add the appropriate linker flags. + +## Spack +An alternative to manually building with CMake is to use the Spack package manager. +Make sure you have downloaded [Spack](https://github.com/spack/spack). +The easiest way to configure the Spack environment is: +````bash +> source spack/share/spack/setup-env.sh +```` +with other scripts available for other shells. +You can display information about how to install packages with: +````bash +> spack info kokkos +A basic installation would be done as: +````bash +> spack install kokkos +```` +Spack allows options and compilers to be tuned in the install command. +````bash +> spack install kokkos@3.0 %gcc@7.3.0 +openmp +```` +This example illustrates the three most common parameters to Spack: +* Variants: specified with, e.g. `+openmp`, this activates (or deactivates with, e.g. `~openmp`) certain options. +* Version: immediately following `kokkos` the `@version` can specify a particular Kokkos to build +* Compiler: a default compiler will be chosen if not specified, but an exact compiler version can be given with the `%` option. 
+ +For a complete list of Kokkos options, run: +````bash +> spack info kokkos +```` +More details can be found in the [Spack README](Spack.md) + +#### Spack Development +Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable". +Generally, Spack usage should never really require you to reference the computer-generated unique install folder. +If you must know, you can locate Spack Kokkos installations with: +````bash +> spack find -p kokkos ... +```` +where `...` is the unique spec identifying the particular Kokkos configuration and version. + +A better way to use Spack for doing Kokkos development is the dev-build feature of Spack. +For dev-build details, consult the kokkos-spack repository [README](https://github.com/kokkos/kokkos-spack/blob/master/README.md). + +# Kokkos Keyword Listing + +## Device Backends +Device backends can be enabled by specifying `-DKokkos_ENABLE_X`. + +* Kokkos_ENABLE_CUDA + * Whether to build CUDA backend + * BOOL Default: OFF +* Kokkos_ENABLE_HPX + * Whether to build HPX backend (experimental) + * BOOL Default: OFF +* Kokkos_ENABLE_OPENMP + * Whether to build OpenMP backend + * BOOL Default: OFF +* Kokkos_ENABLE_PTHREAD + * Whether to build Pthread backend + * BOOL Default: OFF +* Kokkos_ENABLE_SERIAL + * Whether to build serial backend + * BOOL Default: ON +* Kokkos_ENABLE_HIP (Experimental) + * Whether to build HIP backend + * BOOL Default: OFF +* Kokkos_ENABLE_OPENMPTARGET (Experimental) + * Whether to build the OpenMP target backend + * BOOL Default: OFF + +## Enable Options +Options can be enabled by specifying `-DKokkos_ENABLE_X`. 
+ +* Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION + * Whether to aggressively vectorize loops + * BOOL Default: OFF +* Kokkos_ENABLE_COMPILER_WARNINGS + * Whether to print all compiler warnings + * BOOL Default: OFF +* Kokkos_ENABLE_CUDA_CONSTEXPR + * Whether to activate experimental relaxed constexpr functions + * BOOL Default: OFF +* Kokkos_ENABLE_CUDA_LAMBDA + * Whether to activate experimental lambda features + * BOOL Default: OFF +* Kokkos_ENABLE_CUDA_LDG_INTRINSIC + * Whether to use CUDA LDG intrinsics + * BOOL Default: OFF +* Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE + * Whether to enable relocatable device code (RDC) for CUDA + * BOOL Default: OFF +* Kokkos_ENABLE_CUDA_UVM + * Whether to use unified memory (UM) by default for CUDA + * BOOL Default: OFF +* Kokkos_ENABLE_DEBUG + * Whether to activate extra debug features - may increase compile times + * BOOL Default: OFF +* Kokkos_ENABLE_DEBUG_BOUNDS_CHECK + * Whether to use bounds checking - will increase runtime + * BOOL Default: OFF +* Kokkos_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK + * Debug check on dual views + * BOOL Default: OFF +* Kokkos_ENABLE_EXAMPLES + * Whether to enable building examples + * BOOL Default: OFF +* Kokkos_ENABLE_HPX_ASYNC_DISPATCH + * Whether HPX supports asynchronous dispatch + * BOOL Default: OFF +* Kokkos_ENABLE_LARGE_MEM_TESTS + * Whether to perform extra large memory tests + * BOOL Default: OFF +* Kokkos_ENABLE_PROFILING_LOAD_PRINT + * Whether to print information about which profiling tools got loaded + * BOOL Default: OFF +* Kokkos_ENABLE_TESTS + * Whether to build tests + * BOOL Default: OFF + +## Other Options +* Kokkos_CXX_STANDARD + * The C++ standard for Kokkos to use: c++14, c++17, or c++20. This should be given in CMake style as 14, 17, or 20. 
+ * STRING Default: 14 + +## Third-party Libraries (TPLs) +The following options control enabling TPLs: +* Kokkos_ENABLE_HPX + * Whether to enable the HPX library + * BOOL Default: OFF +* Kokkos_ENABLE_HWLOC + * Whether to enable the HWLOC library + * BOOL Default: Off +* Kokkos_ENABLE_LIBNUMA + * Whether to enable the LIBNUMA library + * BOOL Default: Off +* Kokkos_ENABLE_MEMKIND + * Whether to enable the MEMKIND library + * BOOL Default: Off +* Kokkos_ENABLE_LIBDL + * Whether to enable the LIBDL library + * BOOL Default: On +* Kokkos_ENABLE_LIBRT + * Whether to enable the LIBRT library + * BOOL Default: Off + +The following options control finding and configuring non-CMake TPLs: +* Kokkos_CUDA_DIR or CUDA_ROOT + * Location of CUDA install prefix for libraries + * PATH Default: +* Kokkos_HWLOC_DIR or HWLOC_ROOT + * Location of HWLOC install prefix + * PATH Default: +* Kokkos_LIBNUMA_DIR or LIBNUMA_ROOT + * Location of LIBNUMA install prefix + * PATH Default: +* Kokkos_MEMKIND_DIR or MEMKIND_ROOT + * Location of MEMKIND install prefix + * PATH Default: +* Kokkos_LIBDL_DIR or LIBDL_ROOT + * Location of LIBDL install prefix + * PATH Default: +* Kokkos_LIBRT_DIR or LIBRT_ROOT + * Location of LIBRT install prefix + * PATH Default: + +The following options control `find_package` paths for CMake-based TPLs: +* HPX_DIR or HPX_ROOT + * Location of HPX prefix (ROOT) or CMake config file (DIR) + * PATH Default: + +## Architecture Keywords +Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_X`. 
+ +* Kokkos_ARCH_AMDAVX + * Whether to optimize for the AMDAVX architecture + * BOOL Default: OFF +* Kokkos_ARCH_ARMV80 + * Whether to optimize for the ARMV80 architecture + * BOOL Default: OFF +* Kokkos_ARCH_ARMV81 + * Whether to optimize for the ARMV81 architecture + * BOOL Default: OFF +* Kokkos_ARCH_ARMV8_THUNDERX + * Whether to optimize for the ARMV8_THUNDERX architecture + * BOOL Default: OFF +* Kokkos_ARCH_ARMV8_TX2 + * Whether to optimize for the ARMV8_TX2 architecture + * BOOL Default: OFF +* Kokkos_ARCH_BDW + * Whether to optimize for the BDW architecture + * BOOL Default: OFF +* Kokkos_ARCH_BGQ + * Whether to optimize for the BGQ architecture + * BOOL Default: OFF +* Kokkos_ARCH_ZEN + * Whether to optimize for the Zen architecture + * BOOL Default: OFF +* Kokkos_ARCH_ZEN2 + * Whether to optimize for the Zen2 architecture + * BOOL Default: OFF +* Kokkos_ARCH_HSW + * Whether to optimize for the HSW architecture + * BOOL Default: OFF +* Kokkos_ARCH_KEPLER30 + * Whether to optimize for the KEPLER30 architecture + * BOOL Default: OFF +* Kokkos_ARCH_KEPLER32 + * Whether to optimize for the KEPLER32 architecture + * BOOL Default: OFF +* Kokkos_ARCH_KEPLER35 + * Whether to optimize for the KEPLER35 architecture + * BOOL Default: OFF +* Kokkos_ARCH_KEPLER37 + * Whether to optimize for the KEPLER37 architecture + * BOOL Default: OFF +* Kokkos_ARCH_KNC + * Whether to optimize for the KNC architecture + * BOOL Default: OFF +* Kokkos_ARCH_KNL + * Whether to optimize for the KNL architecture + * BOOL Default: OFF +* Kokkos_ARCH_MAXWELL50 + * Whether to optimize for the MAXWELL50 architecture + * BOOL Default: OFF +* Kokkos_ARCH_MAXWELL52 + * Whether to optimize for the MAXWELL52 architecture + * BOOL Default: OFF +* Kokkos_ARCH_MAXWELL53 + * Whether to optimize for the MAXWELL53 architecture + * BOOL Default: OFF +* Kokkos_ARCH_PASCAL60 + * Whether to optimize for the PASCAL60 architecture + * BOOL Default: OFF +* Kokkos_ARCH_PASCAL61 + * Whether to optimize for the 
PASCAL61 architecture + * BOOL Default: OFF +* Kokkos_ARCH_POWER7 + * Whether to optimize for the POWER7 architecture + * BOOL Default: OFF +* Kokkos_ARCH_POWER8 + * Whether to optimize for the POWER8 architecture + * BOOL Default: OFF +* Kokkos_ARCH_POWER9 + * Whether to optimize for the POWER9 architecture + * BOOL Default: OFF +* Kokkos_ARCH_SKX + * Whether to optimize for the SKX architecture + * BOOL Default: OFF +* Kokkos_ARCH_SNB + * Whether to optimize for the SNB architecture + * BOOL Default: OFF +* Kokkos_ARCH_TURING75 + * Whether to optimize for the TURING75 architecture + * BOOL Default: OFF +* Kokkos_ARCH_VOLTA70 + * Whether to optimize for the VOLTA70 architecture + * BOOL Default: OFF +* Kokkos_ARCH_VOLTA72 + * Whether to optimize for the VOLTA72 architecture + * BOOL Default: OFF +* Kokkos_ARCH_WSM + * Whether to optimize for the WSM architecture + * BOOL Default: OFF + +##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) + +[](https://opensource.org/licenses/BSD-3-Clause) + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. Government retains certain rights in this software. 
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..3ce38c37d866dacc25528f5597461e7629175e00 --- /dev/null +++ b/packages/kokkos/CHANGELOG.md @@ -0,0 +1,1240 @@ +# Change Log + +## [3.4.00](https://github.com/kokkos/kokkos/tree/3.4.00) (2021-04-25) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.01...3.4.00) + +**Highlights:** +- SYCL Backend Almost Feature Complete +- OpenMPTarget Backend Almost Feature Complete +- Performance Improvements for HIP backend +- Require CMake 3.16 or newer +- Tool Callback Interface Enhancements +- cmath wrapper functions available now in Kokkos::Experimental + +**Features:** +- Implement parallel_scan with ThreadVectorRange and Reducer [\#3861](https://github.com/kokkos/kokkos/pull/3861) +- Implement SYCL Random [\#3849](https://github.com/kokkos/kokkos/pull/3849) +- OpenMPTarget: Adding Implementation for nested reducers [\#3845](https://github.com/kokkos/kokkos/pull/3845) +- Implement UniqueToken for SYCL [\#3833](https://github.com/kokkos/kokkos/pull/3833) +- OpenMPTarget: UniqueToken::Global implementation [\#3823](https://github.com/kokkos/kokkos/pull/3823) +- DualView sync's on ExecutionSpaces [\#3822](https://github.com/kokkos/kokkos/pull/3822) +- SYCL outer TeamPolicy parallel_reduce [\#3818](https://github.com/kokkos/kokkos/pull/3818) +- SYCL TeamPolicy::team_scan [\#3815](https://github.com/kokkos/kokkos/pull/3815) +- SYCL MDRangePolicy parallel_reduce [\#3801](https://github.com/kokkos/kokkos/pull/3801) +- Enable use of execution space instances in ScatterView [\#3786](https://github.com/kokkos/kokkos/pull/3786) +- SYCL TeamPolicy nested parallel_reduce [\#3783](https://github.com/kokkos/kokkos/pull/3783) +- OpenMPTarget: MDRange with TagType for parallel_for [\#3781](https://github.com/kokkos/kokkos/pull/3781) +- Adding OpenMPTarget parallel_scan [\#3655](https://github.com/kokkos/kokkos/pull/3655) +- SYCL basic 
TeamPolicy [\#3654](https://github.com/kokkos/kokkos/pull/3654) +- OpenMPTarget: scratch memory implementation [\#3611](https://github.com/kokkos/kokkos/pull/3611) + +**Implemented enhancements Backends and Archs:** +- SYCL choose a specific GPU [\#3918](https://github.com/kokkos/kokkos/pull/3918) +- [HIP] Lock access to scratch memory when using Teams [\#3916](https://github.com/kokkos/kokkos/pull/3916) +- [HIP] fix multithreaded access to get_next_driver [\#3908](https://github.com/kokkos/kokkos/pull/3908) +- Forward declare HIPHostPinnedSpace and SYCLSharedUSMSpace [\#3902](https://github.com/kokkos/kokkos/pull/3902) +- Let SYCL USMObjectMem use SharedAllocationRecord [\#3898](https://github.com/kokkos/kokkos/pull/3898) +- Implement clock_tic for SYCL [\#3893](https://github.com/kokkos/kokkos/pull/3893) +- Don't use a static variable in HIPInternal::scratch_space [\#3866](https://github.com/kokkos/kokkos/pull/3866) +- Reuse memory for SYCL parallel_reduce [\#3873](https://github.com/kokkos/kokkos/pull/3873) +- Update SYCL compiler in CI [\#3826](https://github.com/kokkos/kokkos/pull/3826) +- Introduce HostSharedPtr to manage m_space_instance for Cuda/HIP/SYCL [\#3824](https://github.com/kokkos/kokkos/pull/3824) +- [HIP] Use shuffle for range reduction [\#3811](https://github.com/kokkos/kokkos/pull/3811) +- OpenMPTarget: Changes to the hierarchical parallelism [\#3808](https://github.com/kokkos/kokkos/pull/3808) +- Remove ExtendedReferenceWrapper for SYCL parallel_reduce [\#3802](https://github.com/kokkos/kokkos/pull/3802) +- Eliminate sycl_indirect_launch [\#3777](https://github.com/kokkos/kokkos/pull/3777) +- OpenMPTarget: scratch implementation for parallel_reduce [\#3776](https://github.com/kokkos/kokkos/pull/3776) +- Allow initializing SYCL execution space from sycl::queue and SYCL::impl_static_fence [\#3767](https://github.com/kokkos/kokkos/pull/3767) +- SYCL TeamPolicy scratch memory alternative 
[\#3763](https://github.com/kokkos/kokkos/pull/3763) +- Alternative implementation for SYCL TeamPolicy [\#3759](https://github.com/kokkos/kokkos/pull/3759) +- Unify handling of synchronous errors in SYCL [\#3754](https://github.com/kokkos/kokkos/pull/3754) +- core/Cuda: Half_t updates for cgsolve [\#3746](https://github.com/kokkos/kokkos/pull/3746) +- Unify HIPParallelLaunch structures [\#3733](https://github.com/kokkos/kokkos/pull/3733) +- Improve performance for SYCL parallel_reduce [\#3732](https://github.com/kokkos/kokkos/pull/3732) +- Use consistent types in Kokkos_OpenMPTarget_Parallel.hpp [\#3703](https://github.com/kokkos/kokkos/pull/3703) +- Implement non-blocking kernel launches for HIP backend [\#3697](https://github.com/kokkos/kokkos/pull/3697) +- Change SYCLInternal::m_queue std::unique_ptr -> std::optional [\#3677](https://github.com/kokkos/kokkos/pull/3677) +- Use alternative SYCL parallel_reduce implementation [\#3671](https://github.com/kokkos/kokkos/pull/3671) +- Use runtime values in KokkosExp_MDRangePolicy.hpp [\#3626](https://github.com/kokkos/kokkos/pull/3626) +- Clean up AnalyzePolicy [\#3564](https://github.com/kokkos/kokkos/pull/3564) +- Changes for indirect launch of SYCL parallel reduce [\#3511](https://github.com/kokkos/kokkos/pull/3511) + +**Implemented enhancements BuildSystem:** +- Also require C++14 when building gtest [\#3912](https://github.com/kokkos/kokkos/pull/3912) +- Fix compiling SYCL with OpenMP [\#3874](https://github.com/kokkos/kokkos/pull/3874) +- Require C++17 for SYCL (at configuration time) [\#3869](https://github.com/kokkos/kokkos/pull/3869) +- Add COMPILE_DEFINITIONS argument to kokkos_create_imported_tpl [\#3862](https://github.com/kokkos/kokkos/pull/3862) +- Do not pass arch flags to the linker with no rdc [\#3846](https://github.com/kokkos/kokkos/pull/3846) +- Try compiling C++14 check with C++14 support and print error message [\#3843](https://github.com/kokkos/kokkos/pull/3843) +- Enable HIP with Cray Clang 
[\#3842](https://github.com/kokkos/kokkos/pull/3842) +- Add an option to disable header self containment tests [\#3834](https://github.com/kokkos/kokkos/pull/3834) +- CMake check for C++14 [\#3809](https://github.com/kokkos/kokkos/pull/3809) +- Prefer -std=* over --std=* [\#3779](https://github.com/kokkos/kokkos/pull/3779) +- Kokkos launch compiler updates [\#3778](https://github.com/kokkos/kokkos/pull/3778) +- Updated comments and enabled no-op for kokkos_launch_compiler [\#3774](https://github.com/kokkos/kokkos/pull/3774) +- Apple's Clang not correctly recognised [\#3772](https://github.com/kokkos/kokkos/pull/3772) +- kokkos_launch_compiler + CUDA auto-detect arch [\#3770](https://github.com/kokkos/kokkos/pull/3770) +- Add Spack test support for Kokkos [\#3753](https://github.com/kokkos/kokkos/pull/3753) +- Split SYCL tests for aot compilation [\#3741](https://github.com/kokkos/kokkos/pull/3741) +- Use consistent OpenMP flag for IntelClang [\#3735](https://github.com/kokkos/kokkos/pull/3735) +- Add support for -Wno-deprecated-gpu-targets [\#3722](https://github.com/kokkos/kokkos/pull/3722) +- Add configuration to target CUDA compute capability 8.6 [\#3713](https://github.com/kokkos/kokkos/pull/3713) +- Added VERSION and SOVERSION to KOKKOS_INTERNAL_ADD_LIBRARY [\#3706](https://github.com/kokkos/kokkos/pull/3706) +- Add fast-math to known NVCC flags [\#3699](https://github.com/kokkos/kokkos/pull/3699) +- Add MI-100 arch string [\#3698](https://github.com/kokkos/kokkos/pull/3698) +- Require CMake >=3.16 [\#3679](https://github.com/kokkos/kokkos/pull/3679) +- KokkosCI.cmake, KokkosCTest.cmake.in, CTestConfig.cmake.in + CI updates [\#2844](https://github.com/kokkos/kokkos/pull/2844) + +**Implemented enhancements Tools:** +- Improve readability of the callback invocation in profiling [\#3860](https://github.com/kokkos/kokkos/pull/3860) +- V1.1 Tools Interface: incremental, action-based [\#3812](https://github.com/kokkos/kokkos/pull/3812) +- Enable launch latency 
simulations [\#3721](https://github.com/kokkos/kokkos/pull/3721) +- Added metadata callback to tools interface [\#3711](https://github.com/kokkos/kokkos/pull/3711) +- MDRange Tile Size Tuning [\#3688](https://github.com/kokkos/kokkos/pull/3688) +- Added support for command-line args for kokkos-tools [\#3627](https://github.com/kokkos/kokkos/pull/3627) +- Query max tile sizes for an MDRangePolicy, and set tile sizes on an existing policy [\#3481](https://github.com/kokkos/kokkos/pull/3481) + +**Implemented enhancements Other:** +- Try detecting ndevices in get_gpu [\#3921](https://github.com/kokkos/kokkos/pull/3921) +- Use strcmp to compare names() [\#3909](https://github.com/kokkos/kokkos/pull/3909) +- Add execution space arguments for constructor overloads that might allocate a new underlying View [\#3904](https://github.com/kokkos/kokkos/pull/3904) +- Prefix labels in internal use of kokkos_malloc [\#3891](https://github.com/kokkos/kokkos/pull/3891) +- Prefix labels for internal uses of SharedAllocationRecord [\#3890](https://github.com/kokkos/kokkos/pull/3890) +- Add missing hypot math function [\#3880](https://github.com/kokkos/kokkos/pull/3880) +- Unify algorithm unit tests to avoid code duplication [\#3851](https://github.com/kokkos/kokkos/pull/3851) +- DualView.template view() better matches for Devices in UVMSpace cases [\#3857](https://github.com/kokkos/kokkos/pull/3857) +- More extensive disentangling of Policy Traits [\#3829](https://github.com/kokkos/kokkos/pull/3829) +- Replaced nanosleep and sched_yield with STL routines [\#3825](https://github.com/kokkos/kokkos/pull/3825) +- Constructing Atomic Subviews [\#3810](https://github.com/kokkos/kokkos/pull/3810) +- Metadata Declaration in Core [\#3729](https://github.com/kokkos/kokkos/pull/3729) +- Allow using tagged final functor in parallel_reduce [\#3714](https://github.com/kokkos/kokkos/pull/3714) +- Major duplicate code removal in SharedAllocationRecord specializations 
[\#3658](https://github.com/kokkos/kokkos/pull/3658) + +**Fixed bugs:** +- Provide forward declarations in Kokkos_ViewLayoutTiled.hpp for XL [\#3911](https://github.com/kokkos/kokkos/pull/3911) +- Fixup absolute value of floating points in Kokkos complex [\#3882](https://github.com/kokkos/kokkos/pull/3882) +- Address intel 17 ICE [\#3881](https://github.com/kokkos/kokkos/pull/3881) +- Add missing pow(Kokkos::complex) overloads [\#3868](https://github.com/kokkos/kokkos/pull/3868) +- Fix bug {pow, log}(Kokkos::complex) [\#3866](https://github.com/kokkos/kokkos/pull/3866) +- Cleanup writing to output streams in Cuda [\#3859](https://github.com/kokkos/kokkos/pull/3859) +- Fixup cache CUDA fallback execution space instance used by DualView::sync [\#3856](https://github.com/kokkos/kokkos/pull/3856) +- Fix cmake warning with pthread [\#3854](https://github.com/kokkos/kokkos/pull/3854) +- Fix typo FOUND_CUDA_{DRIVVER -> DRIVER} [\#3852](https://github.com/kokkos/kokkos/pull/3852) +- Fix bug in SYCL team_reduce [\#3848](https://github.com/kokkos/kokkos/pull/3848) +- Atrocious bug in MDRange tuning [\#3803](https://github.com/kokkos/kokkos/pull/3803) +- Fix compiling SYCL with Kokkos_ENABLE_TUNING=ON [\#3800](https://github.com/kokkos/kokkos/pull/3800) +- Fixed command line parsing bug [\#3797](https://github.com/kokkos/kokkos/pull/3797) +- Workaround race condition in SYCL parallel_reduce [\#3782](https://github.com/kokkos/kokkos/pull/3782) +- Fix Atomic{Min,Max} for Kepler30 [\#3780](https://github.com/kokkos/kokkos/pull/3780) +- Fix SYCL typo [\#3755](https://github.com/kokkos/kokkos/pull/3755) +- Fixed Kokkos_install_additional_files macro [\#3752](https://github.com/kokkos/kokkos/pull/3752) +- Fix a typo for Kokkos_ARCH_A64FX [\#3751](https://github.com/kokkos/kokkos/pull/3751) +- OpenMPTarget: fixes and workarounds to work with "Release" build type [\#3748](https://github.com/kokkos/kokkos/pull/3748) +- Fix parsing bug for 
number of devices command line argument [\#3724](https://github.com/kokkos/kokkos/pull/3724) +- Avoid more warnings with clang and C++20 [\#3719](https://github.com/kokkos/kokkos/pull/3719) +- Fix gcc-10.1 C++20 warnings [\#3718](https://github.com/kokkos/kokkos/pull/3718) +- Fix cuda cache config not being set correct [\#3712](https://github.com/kokkos/kokkos/pull/3712) +- Fix dualview deepcopy perftools [\#3701](https://github.com/kokkos/kokkos/pull/3701) +- use drand instead of frand in drand [\#3696](https://github.com/kokkos/kokkos/pull/3696) + +**Incompatibilities:** +- Remove unimplemented member functions of SYCLDevice [\#3919](https://github.com/kokkos/kokkos/pull/3919) +- Replace cl::sycl [\#3896](https://github.com/kokkos/kokkos/pull/3896) +- Get rid of SYCL workaround in Kokkos_Complex.hpp [\#3884](https://github.com/kokkos/kokkos/pull/3884) +- Replace most uses of if_c [\#3883](https://github.com/kokkos/kokkos/pull/3883) +- Remove Impl::enable_if_type [\#3863](https://github.com/kokkos/kokkos/pull/3863) +- Remove HostBarrier test [\#3847](https://github.com/kokkos/kokkos/pull/3847) +- Avoid (void) interface [\#3836](https://github.com/kokkos/kokkos/pull/3836) +- Remove VerifyExecutionCanAccessMemorySpace [\#3813](https://github.com/kokkos/kokkos/pull/3813) +- Avoid duplicated code in ScratchMemorySpace [\#3793](https://github.com/kokkos/kokkos/pull/3793) +- Remove superfluous FunctorFinal specialization [\#3788](https://github.com/kokkos/kokkos/pull/3788) +- Rename cl::sycl -> sycl in Kokkos_MathematicalFunctions.hpp [\#3678](https://github.com/kokkos/kokkos/pull/3678) +- Remove integer_sequence backward compatibility implementation [\#3533](https://github.com/kokkos/kokkos/pull/3533) + +**Enabled tests:** +- Fixup re-enable core performance tests [\#3903](https://github.com/kokkos/kokkos/pull/3903) +- Enable more SYCL tests [\#3900](https://github.com/kokkos/kokkos/pull/3900) +- Restrict MDRange Policy tests for Intel GPUs 
[\#3853](https://github.com/kokkos/kokkos/pull/3853) +- Disable death tests for rawhide [\#3844](https://github.com/kokkos/kokkos/pull/3844) +- OpenMPTarget: Block unit tests that do not pass with the nvidia compiler [\#3839](https://github.com/kokkos/kokkos/pull/3839) +- Enable Bitset container test for SYCL [\#3830](https://github.com/kokkos/kokkos/pull/3830) +- Enable some more SYCL tests [\#3744](https://github.com/kokkos/kokkos/pull/3744) +- Enable SYCL atomic tests [\#3742](https://github.com/kokkos/kokkos/pull/3742) +- Enable more SYCL perf_tests [\#3692](https://github.com/kokkos/kokkos/pull/3692) +- Enable examples for SYCL [\#3691](https://github.com/kokkos/kokkos/pull/3691) + +## [3.3.01](https://github.com/kokkos/kokkos/tree/3.3.01) (2021-01-06) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.00...3.3.01) + +**Bug Fixes:** +- Fix severe performance bug in DualView which added memcpys for sync and modify [\#3693](https://github.com/kokkos/kokkos/issues/3693) +- Fix performance bug in CUDA backend, where the cuda Cache config was not set correctly. + +## [3.3.00](https://github.com/kokkos/kokkos/tree/3.3.00) (2020-12-16) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.01...3.3.00) + +**Features:** +- Require C++14 as minimum C++ standard. C++17 and C++20 are supported too. +- HIP backend is nearly feature complete. Kokkos Dynamic Task Graphs are missing. +- Major update for OpenMPTarget: many capabilities now work. For details contact us. +- Added DPC++/SYCL backend: primary capabilities are working. +- Added Kokkos Graph API analogous to CUDA Graphs. 
+- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536) +- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546) +- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439) +- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379) + +**Implemented enhancements Backends and Archs:** +- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614) +- Adding support for AMD gfx908 architecture [\#3375](https://github.com/kokkos/kokkos/pull/3375) +- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583) +- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577) +- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544) +- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550) +- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480) +- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474) +- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451) +- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447) +- OpenMPTarget: Hierarchical reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504) +- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411) +- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440) +- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418) +- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/3366) + +**Implemented enhancements Policies:** +- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494) +- MDRangePolicy: Check narrowing conversion in 
construction [\#3527](https://github.com/kokkos/kokkos/pull/#3527) +- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/#3395) +- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/#3362) +- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/#3369) +- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/#3206) +- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/#3509) + +**Implemented enhancements BuildSystem:** +- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/#3488) +- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/#3548) +- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/#3136) +- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/#3434) +- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/#3402) +- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/#3457) + +**Implemented enhancements Tools:** +- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/#3455) +- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/#3530) +- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/#3518) +- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/#3459) +- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/#3326) + +**Implemented enhancements Other:** +- Abort on errors instead of just printing 
[\#3528](https://github.com/kokkos/kokkos/pull/#3528) +- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/#3449) +- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/#3436) +- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/#3435) +- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/#3422) +- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/#3416) +- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/#3388) +- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/#3359) +- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/#3357) +- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/#3340) +- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/#3339) +- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/#3338) +- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/#3309) +- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/#3265) +- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/#2941) + +**Fixed bugs:** +- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/#3591) +- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/#3588) +- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/#3566) +- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/#3565) +- Fix windows build issues which crept in for the CUDA build 
[\#3532](https://github.com/kokkos/kokkos/pull/#3532) +- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/#3529) +- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/#3510) +- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/#3503) +- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/#3467) +- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/#3458) +- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/#3398) +- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/#3393) +- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/#3390) +- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/#3378) +- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/#3348) +- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/#3345) +- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/#3343) +- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/#3260) + +**Incompatibilities:** +- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/#3535) +- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/#3534) +- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/#3301) +- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/#3264) +- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/#3148) + +## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17) +[Full 
Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01) + +**Fixed bugs:** +- Disallow KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE in shared library builds [\#3332](https://github.com/kokkos/kokkos/pull/3332) +- Do not install libprinter-tool when testing is enabled [\#3313](https://github.com/kokkos/kokkos/pull/3313) +- Fix restrict/alignment following refactor [\#3373](https://github.com/kokkos/kokkos/pull/3373) + - Intel fix: workaround compiler issue with using statement [\#3383](https://github.com/kokkos/kokkos/pull/3383) +- Fix zero-length reductions [\#3364](https://github.com/kokkos/kokkos/pull/3364) + - Pthread zero-length reduction fix [\#3452](https://github.com/kokkos/kokkos/pull/3452) + - HPX zero-length reduction fix [\#3470](https://github.com/kokkos/kokkos/pull/3470) + - cuda/9.2 zero-length reduction fix [\#3580](https://github.com/kokkos/kokkos/pull/3580) +- Fix multi-stream scratch [\#3269](https://github.com/kokkos/kokkos/pull/3269) +- Guard KOKKOS_ALL_COMPILE_OPTIONS if Cuda is not enabled [\#3387](https://github.com/kokkos/kokkos/pull/3387) +- Do not include link flags for Fortran linkage [\#3384](https://github.com/kokkos/kokkos/pull/3384) +- Fix NVIDIA GPU arch macro with autodetection [\#3473](https://github.com/kokkos/kokkos/pull/3473) +- Fix libdl/test issues with Trilinos [\#3543](https://github.com/kokkos/kokkos/pull/3543) + - Register Pthread as Tribits option to be enabled with Trilinos [\#3558](https://github.com/kokkos/kokkos/pull/3558) + +**Implemented enhancements:** +- Separate Cuda timing-based tests into their own executable [\#3407](https://github.com/kokkos/kokkos/pull/3407) + +## [3.2.00](https://github.com/kokkos/kokkos/tree/3.2.00) (2020-08-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.1.01...3.2.00) + +**Implemented enhancements:** + +- HIP:Enable stream in HIP [\#3163](https://github.com/kokkos/kokkos/issues/3163) +- HIP:Add support for shuffle reduction for the HIP backend 
[\#3154](https://github.com/kokkos/kokkos/issues/3154) +- HIP:Add implementations of missing HIPHostPinnedSpace methods for LAMMPS [\#3137](https://github.com/kokkos/kokkos/issues/3137) +- HIP:Require HIP 3.5.0 or higher [\#3099](https://github.com/kokkos/kokkos/issues/3099) +- HIP:WorkGraphPolicy for HIP [\#3096](https://github.com/kokkos/kokkos/issues/3096) +- OpenMPTarget: Significant update to the new experimental backend. Requires C++17, works on Intel GPUs, reference counting fixes. [\#3169](https://github.com/kokkos/kokkos/issues/3169) +- Windows Cuda support [\#3018](https://github.com/kokkos/kokkos/issues/3018) +- Pass `-Wext-lambda-captures-this` to NVCC when support for `__host__ __device__` lambda is enabled from CUDA 11 [\#3241](https://github.com/kokkos/kokkos/issues/3241) +- Use explicit staging buffer for constant memory kernel launches and cleanup host/device synchronization [\#3234](https://github.com/kokkos/kokkos/issues/3234) +- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable: [\#3202](https://github.com/kokkos/kokkos/issues/3202) , [\#3203](https://github.com/kokkos/kokkos/issues/3203) , [\#3196](https://github.com/kokkos/kokkos/issues/3196) +- Annotations for `DefaultExecutionSpace` and `DefaultHostExecutionSpace` to use in static analysis [\#3189](https://github.com/kokkos/kokkos/issues/3189) +- Add documentation on using Spack to install Kokkos and developing packages that depend on Kokkos [\#3187](https://github.com/kokkos/kokkos/issues/3187) +- Add OpenMPTarget backend flags for NVC++ compiler [\#3185](https://github.com/kokkos/kokkos/issues/3185) +- Move deep\_copy/create\_mirror\_view on Experimental::OffsetView into Kokkos:: namespace [\#3166](https://github.com/kokkos/kokkos/issues/3166) +- Allow for larger block size in HIP [\#3165](https://github.com/kokkos/kokkos/issues/3165) +- View: Added names of Views to the different View initialize/free kernels 
[\#3159](https://github.com/kokkos/kokkos/issues/3159) +- Cuda: Caching cudaFunctorAttributes and whether L1/Shmem prefer was set [\#3151](https://github.com/kokkos/kokkos/issues/3151) +- BuildSystem: Improved performance in default configuration by defaulting to Release build [\#3131](https://github.com/kokkos/kokkos/issues/3131) +- Cuda: Update CUDA occupancy calculation [\#3124](https://github.com/kokkos/kokkos/issues/3124) +- Vector: Adding data() to Vector [\#3123](https://github.com/kokkos/kokkos/issues/3123) +- BuildSystem: Add CUDA Ampere configuration support [\#3122](https://github.com/kokkos/kokkos/issues/3122) +- General: Apply [[noreturn]] to Kokkos::abort when applicable [\#3106](https://github.com/kokkos/kokkos/issues/3106) +- TeamPolicy: Validate storage level argument passed to TeamPolicy::set\_scratch\_size() [\#3098](https://github.com/kokkos/kokkos/issues/3098) +- BuildSystem: Make kokkos\_has\_string() function in Makefile.kokkos case insensitive [\#3091](https://github.com/kokkos/kokkos/issues/3091) +- Modify KOKKOS\_FUNCTION macro for clang-tidy analysis [\#3087](https://github.com/kokkos/kokkos/issues/3087) +- Move allocation profiling to allocate/deallocate calls [\#3084](https://github.com/kokkos/kokkos/issues/3084) +- BuildSystem: FATAL\_ERROR when attempting in-source build [\#3082](https://github.com/kokkos/kokkos/issues/3082) +- Change enums in ScatterView to types [\#3076](https://github.com/kokkos/kokkos/issues/3076) +- HIP: Changes for new compiler/runtime [\#3067](https://github.com/kokkos/kokkos/issues/3067) +- Extract and use get\_gpu [\#3061](https://github.com/kokkos/kokkos/issues/3061) , [\#3048](https://github.com/kokkos/kokkos/issues/3048) +- Add is\_allocated to View-like containers [\#3059](https://github.com/kokkos/kokkos/issues/3059) +- Combined reducers for scalar references [\#3052](https://github.com/kokkos/kokkos/issues/3052) +- Add configurable capacity for UniqueToken 
[\#3051](https://github.com/kokkos/kokkos/issues/3051) +- Add installation testing [\#3034](https://github.com/kokkos/kokkos/issues/3034) +- HIP: Add UniqueToken [\#3020](https://github.com/kokkos/kokkos/issues/3020) +- Autodetect number of devices [\#3013](https://github.com/kokkos/kokkos/issues/3013) + + +**Fixed bugs:** + +- Check error code from `cudaStreamSynchronize` in CUDA fences [\#3255](https://github.com/kokkos/kokkos/issues/3255) +- Fix issue with C++ standard flags when using `nvcc\_wrapper` with PGI [\#3254](https://github.com/kokkos/kokkos/issues/3254) +- Add missing threadfence in lock-based atomics [\#3208](https://github.com/kokkos/kokkos/issues/3208) +- Fix dedup of linker flags for shared lib on CMake <=3.12 [\#3176](https://github.com/kokkos/kokkos/issues/3176) +- Fix memory leak with CUDA streams [\#3170](https://github.com/kokkos/kokkos/issues/3170) +- BuildSystem: Fix OpenMP Target flags for Cray [\#3161](https://github.com/kokkos/kokkos/issues/3161) +- ScatterView: fix for OpenmpTarget remove inheritance from reducers [\#3162](https://github.com/kokkos/kokkos/issues/3162) +- BuildSystem: Set OpenMP flags according to host compiler [\#3127](https://github.com/kokkos/kokkos/issues/3127) +- OpenMP: Fix logic for nested omp in partition\_master bug [\#3101](https://github.com/kokkos/kokkos/issues/3101) +- nvcc\_wrapper: send --cudart to nvcc instead of host compiler [\#3092](https://github.com/kokkos/kokkos/issues/3092) +- BuildSystem: Fixes for Cuda/11 and c++17 [\#3085](https://github.com/kokkos/kokkos/issues/3085) +- HIP: Fix print\_configuration [\#3080](https://github.com/kokkos/kokkos/issues/3080) +- Conditionally define get\_gpu [\#3072](https://github.com/kokkos/kokkos/issues/3072) +- Fix bounds for ranges in random number generator [\#3069](https://github.com/kokkos/kokkos/issues/3069) +- Fix Cuda minor arch check [\#3035](https://github.com/kokkos/kokkos/issues/3035) +- BuildSystem: Add -expt-relaxed-constexpr flag to nvcc\_wrapper 
[\#3021](https://github.com/kokkos/kokkos/issues/3021) + +**Incompatibilities:** + +- Remove ETI support [\#3157](https://github.com/kokkos/kokkos/issues/3157) +- Remove KOKKOS\_INTERNAL\_ENABLE\_NON\_CUDA\_BACKEND [\#3147](https://github.com/kokkos/kokkos/issues/3147) +- Remove core/unit\_test/config [\#3146](https://github.com/kokkos/kokkos/issues/3146) +- Removed the preprocessor branch for KOKKOS\_ENABLE\_PROFILING [\#3115](https://github.com/kokkos/kokkos/issues/3115) +- Disable profiling with MSVC [\#3066](https://github.com/kokkos/kokkos/issues/3066) + +**Closed issues:** + +- Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097) +- Remove KOKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) +- Cuda 11 -\> allow C++17 [\#3083](https://github.com/kokkos/kokkos/issues/3083) +- In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081) +- Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070) +- DefaultInit tests failing when using CTest resource allocation feature [\#3040](https://github.com/kokkos/kokkos/issues/3040) +- Add installation testing. 
[\#3037](https://github.com/kokkos/kokkos/issues/3037) +- nvcc\_wrapper needs to handle `-expt-relaxed-constexpr` flag [\#3017](https://github.com/kokkos/kokkos/issues/3017) +- CPU core oversubscription warning on macOS with OpenMP backend [\#2996](https://github.com/kokkos/kokkos/issues/2996) +- Default behavior of KOKKOS\_NUM\_DEVICES to use all devices available [\#2975](https://github.com/kokkos/kokkos/issues/2975) +- Assert blocksize \> 0 [\#2974](https://github.com/kokkos/kokkos/issues/2974) +- Add ability to assign kokkos profile function from executable [\#2973](https://github.com/kokkos/kokkos/issues/2973) +- ScatterView Support for the pre/post increment operator [\#2967](https://github.com/kokkos/kokkos/issues/2967) + +- Compiler issue: Cuda build with clang 10 has errors with the atomic unit tests [\#3237](https://github.com/kokkos/kokkos/issues/3237) +- Incompatibility of flags for C++ standard with PGI v20.4 on Power9/NVIDIA V100 system [\#3252](https://github.com/kokkos/kokkos/issues/3252) +- Error configuring as subproject [\#3140](https://github.com/kokkos/kokkos/issues/3140) +- CMake fails with Nvidia compilers when the GPU architecture option is not supplied (Fix configure with OMPT and Cuda) [\#3207](https://github.com/kokkos/kokkos/issues/3207) +- PGI compiler being passed the gcc -fopenmp flag [\#3125](https://github.com/kokkos/kokkos/issues/3125) +- Cuda: Memory leak when using CUDA stream [\#3167](https://github.com/kokkos/kokkos/issues/3167) +- RangePolicy has an implicitly deleted assignment operator [\#3192](https://github.com/kokkos/kokkos/issues/3192) +- MemorySpace::allocate needs to have memory pool counting. 
[\#3064](https://github.com/kokkos/kokkos/issues/3064) +- Missing write fence for lock based atomics on CUDA [\#3038](https://github.com/kokkos/kokkos/issues/3038) +- CUDA compute capability version check problem [\#3026](https://github.com/kokkos/kokkos/issues/3026) +- Make DynRankView fencing consistent [\#3014](https://github.com/kokkos/kokkos/issues/3014) +- nvcc\_wrapper can't handle -Xcompiler -o out.o [\#2993](https://github.com/kokkos/kokkos/issues/2993) +- Reductions of non-trivial types of size 4 fail in CUDA shfl operations [\#2990](https://github.com/kokkos/kokkos/issues/2990) +- complex\_double misalignment in reduce, clang+CUDA [\#2989](https://github.com/kokkos/kokkos/issues/2989) +- Span of degenerated \(zero-length\) subviews is not zero in some special cases [\#2979](https://github.com/kokkos/kokkos/issues/2979) +- Rank 1 custom layouts don't work as expected. [\#2840](https://github.com/kokkos/kokkos/issues/2840) + +## [3.1.01](https://github.com/kokkos/kokkos/tree/3.1.01) (2020-04-14) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.1.00...3.1.01) + +**Fixed bugs:** + +- Fix complex_double misalignment in reduce, clang+CUDA [\#2989](https://github.com/kokkos/kokkos/issues/2989) +- Fix compilation fails when profiling disabled and CUDA enabled [\#3001](https://github.com/kokkos/kokkos/issues/3001) +- Fix cuda reduction of non-trivial scalars of size 4 [\#2990](https://github.com/kokkos/kokkos/issues/2990) +- Configure and install version file when building in Trilinos [\#2957](https://github.com/kokkos/kokkos/pull/2957) +- Fix OpenMPTarget build missing include and namespace [\#3000](https://github.com/kokkos/kokkos/issues/3000) +- fix typo in KOKKOS_SET_EXE_PROPERTY() [\#2959](https://github.com/kokkos/kokkos/issues/2959) +- Fix non-zero span subviews of zero sized subviews [\#2979](https://github.com/kokkos/kokkos/issues/2979) + +## [3.1.00](https://github.com/kokkos/kokkos/tree/3.1.00) (2020-04-14) +[Full 
Changelog](https://github.com/kokkos/kokkos/compare/3.0.00...3.1.00) + +**Features:** + +- HIP Support for AMD +- OpenMPTarget Support with clang +- Windows VS19 (Serial) Support [\#1533](https://github.com/kokkos/kokkos/issues/1533) + +**Implemented enhancements:** + +- generate\_makefile.bash should allow tests to be disabled [\#2886](https://github.com/kokkos/kokkos/issues/2886) +- clang/7+cuda/9 build -Werror-unused parameter error in nightly test [\#2884](https://github.com/kokkos/kokkos/issues/2884) +- ScatterView memory space is not user settable [\#2826](https://github.com/kokkos/kokkos/issues/2826) +- clang/8+cuda/10.0 build error with c++17 [\#2809](https://github.com/kokkos/kokkos/issues/2809) +- warnings.... [\#2805](https://github.com/kokkos/kokkos/issues/2805) +- Kokkos version in cpp define [\#2787](https://github.com/kokkos/kokkos/issues/2787) +- Remove Defunct QThreads Backend [\#2751](https://github.com/kokkos/kokkos/issues/2751) +- Improve Kokkos::fence behavior with multiple execution spaces [\#2659](https://github.com/kokkos/kokkos/issues/2659) +- polylithic\(?\) initialization of Kokkos [\#2658](https://github.com/kokkos/kokkos/issues/2658) +- Unnecessary\(?\) check for host execution space initialization from Cuda initialization [\#2652](https://github.com/kokkos/kokkos/issues/2652) +- Kokkos error reporting failures with CUDA GPUs in exclusive mode [\#2471](https://github.com/kokkos/kokkos/issues/2471) +- atomicMax equivalent \(and other atomics\) [\#2401](https://github.com/kokkos/kokkos/issues/2401) +- Fix alignment for Kokkos::complex [\#2255](https://github.com/kokkos/kokkos/issues/2255) +- Warnings with Cuda 10.1 [\#2206](https://github.com/kokkos/kokkos/issues/2206) +- dual view with Kokkos::ViewAllocateWithoutInitializing [\#2188](https://github.com/kokkos/kokkos/issues/2188) +- Check error code from cudaOccupancyMaxActiveBlocksPerMultiprocessor [\#2172](https://github.com/kokkos/kokkos/issues/2172) +- Add non-member 
Kokkos::resize/realloc for DualView [\#2170](https://github.com/kokkos/kokkos/issues/2170) +- Construct DualView without initialization [\#2046](https://github.com/kokkos/kokkos/issues/2046) +- Expose is\_assignable to determine if one view can be assigned to another [\#1936](https://github.com/kokkos/kokkos/issues/1936) +- profiling label [\#1935](https://github.com/kokkos/kokkos/issues/1935) +- team\_broadcast of bool failed on CUDA backend [\#1908](https://github.com/kokkos/kokkos/issues/1908) +- View static\_extent [\#660](https://github.com/kokkos/kokkos/issues/660) +- Misleading Kokkos::Cuda::initialize ERROR message when compiled for wrong GPU architecture [\#1944](https://github.com/kokkos/kokkos/issues/1944) +- Cryptic Error When Malloc Fails [\#2164](https://github.com/kokkos/kokkos/issues/2164) +- Drop support for intermediate standards in CMake [\#2336](https://github.com/kokkos/kokkos/issues/2336) + +**Fixed bugs:** + +- DualView sync\_device with length zero creates cuda errors [\#2946](https://github.com/kokkos/kokkos/issues/2946) +- building with nvcc and clang \(or clang based XL\) as host compiler: "Kokkos::atomic\_fetch\_min\(volatile int \*, int\)" has already been defined [\#2903](https://github.com/kokkos/kokkos/issues/2903) +- Cuda 9.1,10.1 debug builds failing due to -Werror=unused-parameter [\#2880](https://github.com/kokkos/kokkos/issues/2880) +- clang -Werror: Kokkos\_FixedBufferMemoryPool.hpp:140:28: error: unused parameter 'alloc\_size' [\#2869](https://github.com/kokkos/kokkos/issues/2869) +- intel/16.0.1, intel/17.0.1 nightly build failures with debugging enabled [\#2867](https://github.com/kokkos/kokkos/issues/2867) +- intel/16.0.1 debug build errors [\#2863](https://github.com/kokkos/kokkos/issues/2863) +- xl/16.1.1 with cpp14, openmp build, nightly test failures [\#2856](https://github.com/kokkos/kokkos/issues/2856) +- Intel nightly test failures: team\_vector [\#2852](https://github.com/kokkos/kokkos/issues/2852) +- Kokkos Views 
with intmax/2\<N\<intmax can hang during construction [\#2850](https://github.com/kokkos/kokkos/issues/2850) +- workgraph\_fib test seg-faults with threads backend and hwloc [\#2797](https://github.com/kokkos/kokkos/issues/2797) +- cuda.view\_64bit test hangs on Power8+Kepler37 system - develop and 2.9.00 branches [\#2771](https://github.com/kokkos/kokkos/issues/2771) +- device\_type for Kokkos\_Random ? [\#2693](https://github.com/kokkos/kokkos/issues/2693) +- "More than one tag given" error in Experimental::require\(\) [\#2608](https://github.com/kokkos/kokkos/issues/2608) +- Segfault on Marvell from our finalization stack [\#2542](https://github.com/kokkos/kokkos/issues/2542) + +## [3.0.00](https://github.com/kokkos/kokkos/tree/3.0.00) (2020-01-27) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.9.00...3.0.00) + +**Implemented enhancements:** + +- BuildSystem: Standalone Modern CMake Support [\#2104](https://github.com/kokkos/kokkos/issues/2104) +- StyleFormat: ClangFormat Style [\#2157](https://github.com/kokkos/kokkos/issues/2157) +- Documentation: Document build system and CMake philosophy [\#2263](https://github.com/kokkos/kokkos/issues/2263) +- BuildSystem: Add Alias with Namespace Kokkos:: to Internal Libraries [\#2530](https://github.com/kokkos/kokkos/issues/2530) +- BuildSystem: Universal Kokkos find\_package [\#2099](https://github.com/kokkos/kokkos/issues/2099) +- BuildSystem: Dropping support for Kokkos\_{DEVICES,OPTIONS,ARCH} in CMake [\#2329](https://github.com/kokkos/kokkos/issues/2329) +- BuildSystem: Set Kokkos\_DEVICES and Kokkos\_ARCH variables in exported CMake configuration [\#2193](https://github.com/kokkos/kokkos/issues/2193) +- BuildSystem: Drop support for CUDA 7 and CUDA 8 [\#2489](https://github.com/kokkos/kokkos/issues/2489) +- BuildSystem: Drop CMake option SEPARATE\_TESTS [\#2266](https://github.com/kokkos/kokkos/issues/2266) +- BuildSystem: Support expt-relaxed-constexpr same as expt-extended-lambda 
[\#2411](https://github.com/kokkos/kokkos/issues/2411) +- BuildSystem: Add Xnvlink to command line options allowed in nvcc\_wrapper [\#2197](https://github.com/kokkos/kokkos/issues/2197) +- BuildSystem: Install Kokkos config files and target files to lib/cmake/Kokkos [\#2162](https://github.com/kokkos/kokkos/issues/2162) +- BuildSystem: nvcc\_wrappers and c++ 14 [\#2035](https://github.com/kokkos/kokkos/issues/2035) +- BuildSystem: Kokkos version major/version minor \(Feature request\) [\#1930](https://github.com/kokkos/kokkos/issues/1930) +- BuildSystem: CMake namespaces \(and other modern cmake cleanup\) [\#1924](https://github.com/kokkos/kokkos/issues/1924) +- BuildSystem: Remove capability to install Kokkos via GNU Makefiles [\#2332](https://github.com/kokkos/kokkos/issues/2332) +- Documentation: Remove PDF ProgrammingGuide in Kokkos replace with link [\#2244](https://github.com/kokkos/kokkos/issues/2244) +- View: Add Method to Resize View without Initialization [\#2048](https://github.com/kokkos/kokkos/issues/2048) +- Vector: implement “insert” method for Kokkos\_Vector \(as a serial function on host\) [\#2437](https://github.com/kokkos/kokkos/issues/2437) + +**Fixed bugs:** + +- ParallelScan: Kokkos::parallel\_scan fix race condition seen in inter-block fence [\#2681](https://github.com/kokkos/kokkos/issues/2681) +- OffsetView: Kokkos::OffsetView missing constructor which takes pointer [\#2247](https://github.com/kokkos/kokkos/issues/2247) +- OffsetView: Kokkos::OffsetView: allow offset=0 [\#2246](https://github.com/kokkos/kokkos/issues/2246) +- DeepCopy: Missing DeepCopy instrumentation in Kokkos [\#2522](https://github.com/kokkos/kokkos/issues/2522) +- nvcc\_wrapper: --host-only fails with multiple -W\* flags [\#2484](https://github.com/kokkos/kokkos/issues/2484) +- nvcc\_wrapper: taking first -std option is counterintuitive [\#2553](https://github.com/kokkos/kokkos/issues/2553) +- Subview: Error taking subviews of views with static_extents of min rank 
[\#2448](https://github.com/kokkos/kokkos/issues/2448) +- TeamPolicy: reducers with valuetypes without += broken on CUDA [\#2410](https://github.com/kokkos/kokkos/issues/2410) +- Libs: Fix inconsistency of Kokkos library names in Kokkos and Trilinos [\#1902](https://github.com/kokkos/kokkos/issues/1902) +- Complex: operator\>\> for complex\<T\> uses std::ostream, not std::istream [\#2313](https://github.com/kokkos/kokkos/issues/2313) +- Macros: Restrict not honored for non-intel compilers [\#1922](https://github.com/kokkos/kokkos/issues/1922) + + +## [2.9.00](https://github.com/kokkos/kokkos/tree/2.9.00) (2019-06-24) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.8.00...2.9.00) + +**Implemented enhancements:** + +- Capability: CUDA Streams [\#1723](https://github.com/kokkos/kokkos/issues/1723) +- Capability: CUDA Stream support for parallel\_reduce [\#2061](https://github.com/kokkos/kokkos/issues/2061) +- Capability: Feature Request: TeamVectorRange [\#713](https://github.com/kokkos/kokkos/issues/713) +- Capability: Adding HPX backend [\#2080](https://github.com/kokkos/kokkos/issues/2080) +- Capability: TaskScheduler to have multiple queues [\#565](https://github.com/kokkos/kokkos/issues/565) +- Capability: Support for additional reductions in ScatterView [\#1674](https://github.com/kokkos/kokkos/issues/1674) +- Capability: Request: deep\_copy within parallel regions [\#689](https://github.com/kokkos/kokkos/issues/689) +- Capability: Feature Request: `create\_mirror\_view\_without\_initializing` [\#1765](https://github.com/kokkos/kokkos/issues/1765) +- View: Use SFINAE to restrict possible View type conversions [\#2127](https://github.com/kokkos/kokkos/issues/2127) +- Deprecation: Deprecate ExecutionSpace::fence\(\) as static function and make it non-static [\#2140](https://github.com/kokkos/kokkos/issues/2140) +- Deprecation: Deprecate LayoutTileLeft [\#2122](https://github.com/kokkos/kokkos/issues/2122) +- Macros: KOKKOS\_RESTRICT defined for 
non-Intel compilers [\#2038](https://github.com/kokkos/kokkos/issues/2038) + +**Fixed bugs:** + +- Cuda: TeamThreadRange loop count on device is passed by reference to host static constexpr [\#1733](https://github.com/kokkos/kokkos/issues/1733) +- Cuda: Build error with relocatable device code with CUDA 10.1 GCC 7.3 [\#2134](https://github.com/kokkos/kokkos/issues/2134) +- Cuda: cudaFuncSetCacheConfig is setting CachePreferShared too often [\#2066](https://github.com/kokkos/kokkos/issues/2066) +- Cuda: TeamPolicy doesn't throw then created with non-viable vector length and also doesn't backscale to viable one [\#2020](https://github.com/kokkos/kokkos/issues/2020) +- Cuda: cudaMemcpy error for large league sizes on V100 [\#1991](https://github.com/kokkos/kokkos/issues/1991) +- Cuda: illegal warp sync in parallel\_reduce by functor on Turing 75 [\#1958](https://github.com/kokkos/kokkos/issues/1958) +- TeamThreadRange: Inconsistent results from TeamThreadRange reduction [\#1905](https://github.com/kokkos/kokkos/issues/1905) +- Atomics: atomic\_fetch\_oper & atomic\_oper\_fetch don't build for complex\<float\> [\#1964](https://github.com/kokkos/kokkos/issues/1964) +- Views: Kokkos randomread Views leak memory [\#2155](https://github.com/kokkos/kokkos/issues/2155) +- ScatterView: LayoutLeft overload currently non-functional [\#2165](https://github.com/kokkos/kokkos/issues/2165) +- KNL: With intel 17.2.174 illegal instruction in random number test [\#2078](https://github.com/kokkos/kokkos/issues/2078) +- Bitset: Enable copy constructor on device [\#2094](https://github.com/kokkos/kokkos/issues/2094) +- Examples: do not compile due to template deduction error \(multi\_fem\) [\#1928](https://github.com/kokkos/kokkos/issues/1928) + +## [2.8.00](https://github.com/kokkos/kokkos/tree/2.8.00) (2019-02-05) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.24...2.8.00) + +**Implemented enhancements:** + +- Capability, Tests: C++14 support and testing 
[\#1914](https://github.com/kokkos/kokkos/issues/1914) +- Capability: Add environment variables for all command line arguments [\#1798](https://github.com/kokkos/kokkos/issues/1798) +- Capability: --kokkos-ndevices not working for Slurm [\#1920](https://github.com/kokkos/kokkos/issues/1920) +- View: Undefined behavior when deep copying from and to an empty unmanaged view [\#1967](https://github.com/kokkos/kokkos/issues/1967) +- BuildSystem: nvcc\_wrapper should stop immediately if nvcc is not in PATH [\#1861](https://github.com/kokkos/kokkos/issues/1861) + +**Fixed bugs:** + +- Cuda: Fix Volta Issues 1 Non-deterministic behavior on Volta, runs fine on Pascal [\#1949](https://github.com/kokkos/kokkos/issues/1949) +- Cuda: Fix Volta Issues 2 CUDA Team Scan gives wrong values on Volta with -G compile flag [\#1942](https://github.com/kokkos/kokkos/issues/1942) +- Cuda: illegal warp sync in parallel\_reduce by functor on Turing 75 [\#1958](https://github.com/kokkos/kokkos/issues/1958) +- Threads: Pthreads backend does not handle RangePolicy with offset correctly [\#1976](https://github.com/kokkos/kokkos/issues/1976) +- Atomics: atomic\_fetch\_oper has no case for Kokkos::complex\<double\> or other 16-byte types [\#1951](https://github.com/kokkos/kokkos/issues/1951) +- MDRangePolicy: Fix zero-length range [\#1948](https://github.com/kokkos/kokkos/issues/1948) +- TeamThreadRange: TeamThreadRange MaxLoc reduce doesn't compile [\#1909](https://github.com/kokkos/kokkos/issues/1909) + +## [2.7.24](https://github.com/kokkos/kokkos/tree/2.7.24) (2018-11-04) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.00...2.7.24) + +**Implemented enhancements:** + +- DualView: Add non-templated functions for sync, need\_sync, view, modify [\#1858](https://github.com/kokkos/kokkos/issues/1858) +- DualView: Avoid needlessly allocating and initializing modify\_host and modify\_device flag views [\#1831](https://github.com/kokkos/kokkos/issues/1831) +- DualView: Incorrect deduction 
of "not device type" [\#1659](https://github.com/kokkos/kokkos/issues/1659) +- BuildSystem: Add KOKKOS\_ENABLE\_CXX14 and KOKKOS\_ENABLE\_CXX17 [\#1602](https://github.com/kokkos/kokkos/issues/1602) +- BuildSystem: Installed kokkos\_generated\_settings.cmake contains build directories instead of install directories [\#1838](https://github.com/kokkos/kokkos/issues/1838) +- BuildSystem: KOKKOS\_ARCH: add ticks to printout of improper arch setting [\#1649](https://github.com/kokkos/kokkos/issues/1649) +- BuildSystem: Make core/src/Makefile for Cuda use needed nvcc\_wrapper [\#1296](https://github.com/kokkos/kokkos/issues/1296) +- Build: Support PGI as host compiler for NVCC [\#1828](https://github.com/kokkos/kokkos/issues/1828) +- Build: Many Warnings Fixed e.g.[\#1786](https://github.com/kokkos/kokkos/issues/1786) +- Capability: OffsetView with non-zero begin index [\#567](https://github.com/kokkos/kokkos/issues/567) +- Capability: Reductions into device side view [\#1788](https://github.com/kokkos/kokkos/issues/1788) +- Capability: Add max\_size to Kokkos::Array [\#1760](https://github.com/kokkos/kokkos/issues/1760) +- Capability: View Assignment: LayoutStride -\> LayoutLeft and LayoutStride -\> LayoutRight [\#1594](https://github.com/kokkos/kokkos/issues/1594) +- Capability: Atomic function allow implicit conversion of update argument [\#1571](https://github.com/kokkos/kokkos/issues/1571) +- Capability: Add team\_size\_max with tagged functors [\#663](https://github.com/kokkos/kokkos/issues/663) +- Capability: Fix alignment of views from Kokkos\_ScratchSpace should use different alignment [\#1700](https://github.com/kokkos/kokkos/issues/1700) +- Capability: create\_mirror\_view\_and\_copy for DynRankView [\#1651](https://github.com/kokkos/kokkos/issues/1651) +- Capability: DeepCopy HBWSpace / HostSpace [\#548](https://github.com/kokkos/kokkos/issues/548) +- ROCm: support team vector scan [\#1645](https://github.com/kokkos/kokkos/issues/1645) +- ROCm: Merge from 
rocm-hackathon2 [\#1636](https://github.com/kokkos/kokkos/issues/1636) +- ROCm: Add ParallelScanWithTotal [\#1611](https://github.com/kokkos/kokkos/issues/1611) +- ROCm: Implement MDRange in ROCm [\#1314](https://github.com/kokkos/kokkos/issues/1314) +- ROCm: Implement Reducers for Nested Parallelism Levels [\#963](https://github.com/kokkos/kokkos/issues/963) +- ROCm: Add asynchronous deep copy [\#959](https://github.com/kokkos/kokkos/issues/959) +- Tests: Memory pool test seems to allocate 8GB [\#1830](https://github.com/kokkos/kokkos/issues/1830) +- Tests: Add unit\_test for team\_broadcast [\#734](https://github.com/kokkos/kokkos/issues/734) + +**Fixed bugs:** + +- BuildSystem: Makefile.kokkos gets gcc-toolchain wrong if gcc is cached [\#1841](https://github.com/kokkos/kokkos/issues/1841) +- BuildSystem: kokkos\_generated\_settings.cmake placement is inconsistent [\#1771](https://github.com/kokkos/kokkos/issues/1771) +- BuildSystem: Invalid escape sequence \. in kokkos\_functions.cmake [\#1661](https://github.com/kokkos/kokkos/issues/1661) +- BuildSystem: Problem in Kokkos generated cmake file [\#1770](https://github.com/kokkos/kokkos/issues/1770) +- BuildSystem: invalid file names on windows [\#1671](https://github.com/kokkos/kokkos/issues/1671) +- Tests: reducers min/max\_loc test fails randomly due to multiple min values and thus multiple valid locations [\#1681](https://github.com/kokkos/kokkos/issues/1681) +- Tests: cuda.scatterview unit test causes "Bus error" when force\_uvm and enable\_lambda are enabled [\#1852](https://github.com/kokkos/kokkos/issues/1852) +- Tests: cuda.cxx11 unit test fails when force\_uvm and enable\_lambda are enabled [\#1850](https://github.com/kokkos/kokkos/issues/1850) +- Tests: threads.reduce\_device\_view\_range\_policy failing with Cuda/8.0.44 and RDC [\#1836](https://github.com/kokkos/kokkos/issues/1836) +- Build: compile error when compiling Kokkos with hwloc 2.0.1 \(on OSX 10.12.6, with g++ 7.2.0\) 
[\#1506](https://github.com/kokkos/kokkos/issues/1506) +- Build: dual\_view.view broken with UVM [\#1834](https://github.com/kokkos/kokkos/issues/1834) +- Build: White cuda/9.2 + gcc/7.2 warnings triggering errors [\#1833](https://github.com/kokkos/kokkos/issues/1833) +- Build: warning: enum constant in boolean context [\#1813](https://github.com/kokkos/kokkos/issues/1813) +- Capability: Fix overly conservative max\_team\_size thingy [\#1808](https://github.com/kokkos/kokkos/issues/1808) +- DynRankView: Ctors taking ViewAllocateWithoutInitializing broken [\#1783](https://github.com/kokkos/kokkos/issues/1783) +- Cuda: Apollo cuda.team\_broadcast test fail with clang-6.0 [\#1762](https://github.com/kokkos/kokkos/issues/1762) +- Cuda: Clang spurious test failure in impl\_view\_accessible [\#1753](https://github.com/kokkos/kokkos/issues/1753) +- Cuda: Kokkos::complex\<double\> atomic deadlocks with Clang 6 Cuda build with -O0 [\#1752](https://github.com/kokkos/kokkos/issues/1752) +- Cuda: LayoutStride Test fails for UVM as default memory space [\#1688](https://github.com/kokkos/kokkos/issues/1688) +- Cuda: Scan wrong values on Volta [\#1676](https://github.com/kokkos/kokkos/issues/1676) +- Cuda: Kokkos::deep\_copy error with CudaUVM and Kokkos::Serial spaces [\#1652](https://github.com/kokkos/kokkos/issues/1652) +- Cuda: cudaErrorInvalidConfiguration with debug build [\#1647](https://github.com/kokkos/kokkos/issues/1647) +- Cuda: parallel\_for with TeamPolicy::team\_size\_recommended with launch bounds not working -- reported by Daniel Holladay [\#1283](https://github.com/kokkos/kokkos/issues/1283) +- Cuda: Using KOKKOS\_CLASS\_LAMBDA in a class with Kokkos::Random\_XorShift64\_Pool member data [\#1696](https://github.com/kokkos/kokkos/issues/1696) +- Long Build Times on Darwin [\#1721](https://github.com/kokkos/kokkos/issues/1721) +- Capability: Typo in Kokkos\_Sort.hpp - BinOp3D - wrong comparison [\#1720](https://github.com/kokkos/kokkos/issues/1720) +- Buffer 
overflow in SharedAllocationRecord in Kokkos\_HostSpace.cpp [\#1673](https://github.com/kokkos/kokkos/issues/1673) +- Serial unit test failure [\#1632](https://github.com/kokkos/kokkos/issues/1632) + +## [2.7.00](https://github.com/kokkos/kokkos/tree/2.7.00) (2018-05-24) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.6.00...2.7.00) + +**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.7** + +**Implemented enhancements:** + +- Deprecate team\_size auto adjusting to maximal value possible [\#1618](https://github.com/kokkos/kokkos/issues/1618) +- DynamicView - remove restrictions to std::is\_trivial types and value\_type is power of two [\#1586](https://github.com/kokkos/kokkos/issues/1586) +- Kokkos::StaticCrsGraph does not propagate memory traits \(e.g., Unmanaged\) [\#1581](https://github.com/kokkos/kokkos/issues/1581) +- Adding ETI for DeepCopy / ViewFill etc. [\#1578](https://github.com/kokkos/kokkos/issues/1578) +- Deprecate all the left over KOKKOS\_HAVE\_ Macros and Kokkos\_OldMacros.hpp [\#1572](https://github.com/kokkos/kokkos/issues/1572) +- Error if Kokkos\_ARCH set in CMake [\#1555](https://github.com/kokkos/kokkos/issues/1555) +- Deprecate ExecSpace::initialize / ExecSpace::finalize [\#1532](https://github.com/kokkos/kokkos/issues/1532) +- New API for TeamPolicy property setting [\#1531](https://github.com/kokkos/kokkos/issues/1531) +- clang 6.0 + cuda debug out-of-memory test failure [\#1521](https://github.com/kokkos/kokkos/issues/1521) +- Cuda UniqueToken interface not consistent with other backends [\#1505](https://github.com/kokkos/kokkos/issues/1505) +- Move Reducers out of Experimental namespace [\#1494](https://github.com/kokkos/kokkos/issues/1494) +- Provide scope guard for initialize/finalize [\#1479](https://github.com/kokkos/kokkos/issues/1479) +- Check Kokkos::is\_initialized in SharedAllocationRecord dtor [\#1465](https://github.com/kokkos/kokkos/issues/1465) +- Remove static list of allocations 
[\#1464](https://github.com/kokkos/kokkos/issues/1464) +- Makefiles: Support single compile/link line use case [\#1402](https://github.com/kokkos/kokkos/issues/1402) +- ThreadVectorRange with a range [\#1400](https://github.com/kokkos/kokkos/issues/1400) +- Exclusive scan + last value API [\#1358](https://github.com/kokkos/kokkos/issues/1358) +- Install kokkos\_generated\_settings.cmake [\#1348](https://github.com/kokkos/kokkos/issues/1348) +- Kokkos arrays \(not views!\) don't do bounds checking in debug mode [\#1342](https://github.com/kokkos/kokkos/issues/1342) +- Expose round-robin GPU assignment outside of initialize\(int, char\*\*\) [\#1318](https://github.com/kokkos/kokkos/issues/1318) +- DynamicView misses use\_count and label function [\#1298](https://github.com/kokkos/kokkos/issues/1298) +- View constructor should check arguments [\#1286](https://github.com/kokkos/kokkos/issues/1286) +- False Positive on Oversubscription Warning [\#1207](https://github.com/kokkos/kokkos/issues/1207) +- Allow \(require\) execution space for 1st arg of VerifyExecutionCanAccessMemorySpace [\#1192](https://github.com/kokkos/kokkos/issues/1192) +- ROCm: Add ROCmHostPinnedSpace [\#958](https://github.com/kokkos/kokkos/issues/958) +- power of two functions [\#656](https://github.com/kokkos/kokkos/issues/656) +- CUDA 8 has 64bit \_\_shfl [\#361](https://github.com/kokkos/kokkos/issues/361) +- Add TriBITS/CMake configure information about node types [\#243](https://github.com/kokkos/kokkos/issues/243) + +**Fixed bugs:** + +- CUDA atomic\_fetch\_sub for doubles is hitting CAS instead of intrinsic [\#1624](https://github.com/kokkos/kokkos/issues/1624) +- Bug: use of ballot on Volta [\#1612](https://github.com/kokkos/kokkos/issues/1612) +- Kokkos::deep\_copy memory access failures [\#1583](https://github.com/kokkos/kokkos/issues/1583) +- g++ -std option doubly set for cmake project [\#1548](https://github.com/kokkos/kokkos/issues/1548) +- ViewFill for 1D Views of larger 32bit entries 
fails [\#1541](https://github.com/kokkos/kokkos/issues/1541) +- CUDA Volta another warpsync bug [\#1520](https://github.com/kokkos/kokkos/issues/1520) +- triple\_nested\_parallelism fails with KOKKOS\_DEBUG and CUDA [\#1513](https://github.com/kokkos/kokkos/issues/1513) +- Jenkins errors in Kokkos\_SharedAlloc.cpp with debug build [\#1511](https://github.com/kokkos/kokkos/issues/1511) +- Kokkos::Sort out-of-bounds with empty bins [\#1504](https://github.com/kokkos/kokkos/issues/1504) +- Get rid of deprecated functions inside Kokkos [\#1484](https://github.com/kokkos/kokkos/issues/1484) +- get\_work\_partition casts int64\_t to int, causing a seg fault [\#1481](https://github.com/kokkos/kokkos/issues/1481) +- NVCC bug with \_\_device\_\_ on defaulted function [\#1470](https://github.com/kokkos/kokkos/issues/1470) +- CMake example broken with CUDA backend [\#1468](https://github.com/kokkos/kokkos/issues/1468) + + +## [2.6.00](https://github.com/kokkos/kokkos/tree/2.6.00) (2018-03-07) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.5.00...2.6.00) + +**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.6** + +**Implemented enhancements:** + +- Support NVIDIA Volta microarchitecture [\#1466](https://github.com/kokkos/kokkos/issues/1466) +- Kokkos - Define empty functions when profiling disabled [\#1424](https://github.com/kokkos/kokkos/issues/1424) +- Don't use \_\_constant\_\_ cache for lock arrays, enable once per run update instead of once per call [\#1385](https://github.com/kokkos/kokkos/issues/1385) +- task dag enhancement. [\#1354](https://github.com/kokkos/kokkos/issues/1354) +- Cuda task team collectives and stack size [\#1353](https://github.com/kokkos/kokkos/issues/1353) +- Replace View operator acceptance of more than rank integers with 'access' function [\#1333](https://github.com/kokkos/kokkos/issues/1333) +- Interoperability: Do not shut down backend execution space runtimes upon calling finalize. 
[\#1305](https://github.com/kokkos/kokkos/issues/1305) +- shmem\_size for LayoutStride [\#1291](https://github.com/kokkos/kokkos/issues/1291) +- Kokkos::resize performs poorly on 1D Views [\#1270](https://github.com/kokkos/kokkos/issues/1270) +- stride\(\) is inconsistent with dimension\(\), extent\(\), etc. [\#1214](https://github.com/kokkos/kokkos/issues/1214) +- Kokkos::sort defaults to std::sort on host [\#1208](https://github.com/kokkos/kokkos/issues/1208) +- DynamicView with host size grow [\#1206](https://github.com/kokkos/kokkos/issues/1206) +- Unmanaged View with Anonymous Memory Space [\#1175](https://github.com/kokkos/kokkos/issues/1175) +- Sort subset of Kokkos::DynamicView [\#1160](https://github.com/kokkos/kokkos/issues/1160) +- MDRange policy doesn't support lambda reductions [\#1054](https://github.com/kokkos/kokkos/issues/1054) +- Add ability to set hook on Kokkos::finalize [\#714](https://github.com/kokkos/kokkos/issues/714) +- Atomics with Serial Backend - Default should be Disable? 
[\#549](https://github.com/kokkos/kokkos/issues/549) +- KOKKOS\_ENABLE\_DEPRECATED\_CODE [\#1359](https://github.com/kokkos/kokkos/issues/1359) + +**Fixed bugs:** + +- cuda\_internal\_maximum\_warp\_count returns 8, but I believe it should return 16 for P100 [\#1269](https://github.com/kokkos/kokkos/issues/1269) +- Cuda: level 1 scratch memory bug \(reported by Stan Moore\) [\#1434](https://github.com/kokkos/kokkos/issues/1434) +- MDRangePolicy Reduction requires value\_type typedef in Functor [\#1379](https://github.com/kokkos/kokkos/issues/1379) +- Kokkos DeepCopy between empty views fails [\#1369](https://github.com/kokkos/kokkos/issues/1369) +- Several issues with new CMake build infrastructure \(reported by Eric Phipps\) [\#1365](https://github.com/kokkos/kokkos/issues/1365) +- deep\_copy between rank-1 host/device views of differing layouts without UVM no longer works \(reported by Eric Phipps\) [\#1363](https://github.com/kokkos/kokkos/issues/1363) +- Profiling can't be disabled in CMake, and a parallel\_for is missing for tasks \(reported by Kyungjoo Kim\) [\#1349](https://github.com/kokkos/kokkos/issues/1349) +- get\_work\_partition int overflow \(reported by berryj5\) [\#1327](https://github.com/kokkos/kokkos/issues/1327) +- Kokkos::deep\_copy must fence even if the two views are the same [\#1303](https://github.com/kokkos/kokkos/issues/1303) +- CudaUVMSpace::allocate/deallocate must fence [\#1302](https://github.com/kokkos/kokkos/issues/1302) +- ViewResize on CUDA fails in Debug because of too many resources requested [\#1299](https://github.com/kokkos/kokkos/issues/1299) +- Cuda 9 and intrepid2 calls from Panzer. 
[\#1183](https://github.com/kokkos/kokkos/issues/1183) +- Slowdown due to tracking\_enabled\(\) in 2.04.00 \(found by Albany app\) [\#1016](https://github.com/kokkos/kokkos/issues/1016) +- Bounds checking fails with zero-span Views \(reported by Stan Moore\) [\#1411](https://github.com/kokkos/kokkos/issues/1411) + + +## [2.5.00](https://github.com/kokkos/kokkos/tree/2.5.00) (2017-12-15) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.11...2.5.00) + +**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.5** + +**Implemented enhancements:** + +- Provide Makefile.kokkos logic for CMake and TriBITS [\#878](https://github.com/kokkos/kokkos/issues/878) +- Add Scatter View [\#825](https://github.com/kokkos/kokkos/issues/825) +- Drop gcc 4.7 and intel 14 from supported compiler list [\#603](https://github.com/kokkos/kokkos/issues/603) +- Enable construction of unmanaged view using common\_view\_alloc\_prop [\#1170](https://github.com/kokkos/kokkos/issues/1170) +- Unused Function Warning with XL [\#1267](https://github.com/kokkos/kokkos/issues/1267) +- Add memory pool parameter check [\#1218](https://github.com/kokkos/kokkos/issues/1218) +- CUDA9: Fix warning for unsupported long double [\#1189](https://github.com/kokkos/kokkos/issues/1189) +- CUDA9: fix warning on defaulted function marking [\#1188](https://github.com/kokkos/kokkos/issues/1188) +- CUDA9: fix warnings for deprecated warp level functions [\#1187](https://github.com/kokkos/kokkos/issues/1187) +- Add CUDA 9.0 nightly testing [\#1174](https://github.com/kokkos/kokkos/issues/1174) +- {OMPI,MPICH}\_CXX hack breaks nvcc\_wrapper use case [\#1166](https://github.com/kokkos/kokkos/issues/1166) +- KOKKOS\_HAVE\_CUDA\_LAMBDA became KOKKOS\_CUDA\_USE\_LAMBDA [\#1274](https://github.com/kokkos/kokkos/issues/1274) + +**Fixed bugs:** + +- MinMax Reducer with tagged operator doesn't compile [\#1251](https://github.com/kokkos/kokkos/issues/1251) +- Reducers for Tagged operators give wrong 
answer [\#1250](https://github.com/kokkos/kokkos/issues/1250) +- Kokkos not Compatible with Big Endian Machines? [\#1235](https://github.com/kokkos/kokkos/issues/1235) +- Parallel Scan hangs forever on BG/Q [\#1234](https://github.com/kokkos/kokkos/issues/1234) +- Threads backend doesn't compile with Clang on OS X [\#1232](https://github.com/kokkos/kokkos/issues/1232) +- $\(shell date\) needs quote [\#1264](https://github.com/kokkos/kokkos/issues/1264) +- Unqualified parallel\_for call conflicts with user-defined parallel\_for [\#1219](https://github.com/kokkos/kokkos/issues/1219) +- KokkosAlgorithms: CMake issue in unit tests [\#1212](https://github.com/kokkos/kokkos/issues/1212) +- Intel 18 Error: "simd pragma has been deprecated" [\#1210](https://github.com/kokkos/kokkos/issues/1210) +- Memory leak in Kokkos::initialize [\#1194](https://github.com/kokkos/kokkos/issues/1194) +- CUDA9: compiler error with static assert template arguments [\#1190](https://github.com/kokkos/kokkos/issues/1190) +- Kokkos::Serial::is\_initialized returns always true [\#1184](https://github.com/kokkos/kokkos/issues/1184) +- Triple nested parallelism still fails on bowman [\#1093](https://github.com/kokkos/kokkos/issues/1093) +- OpenMP openmp.range on Develop Runs Forever on POWER7+ with RHEL7 and GCC4.8.5 [\#995](https://github.com/kokkos/kokkos/issues/995) +- Rendezvous performance at global scope [\#985](https://github.com/kokkos/kokkos/issues/985) + + +## [2.04.11](https://github.com/kokkos/kokkos/tree/2.04.11) (2017-10-28) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.04...2.04.11) + +**Implemented enhancements:** + +- Add Subview pattern. 
[\#648](https://github.com/kokkos/kokkos/issues/648) +- Add Kokkos "global" is\_initialized [\#1060](https://github.com/kokkos/kokkos/issues/1060) +- Add create\_mirror\_view\_and\_copy [\#1161](https://github.com/kokkos/kokkos/issues/1161) +- Add KokkosConcepts SpaceAccessibility function [\#1092](https://github.com/kokkos/kokkos/issues/1092) +- Option to Disable Initialize Warnings [\#1142](https://github.com/kokkos/kokkos/issues/1142) +- Mature task-DAG capability [\#320](https://github.com/kokkos/kokkos/issues/320) +- Promote Work DAG from experimental [\#1126](https://github.com/kokkos/kokkos/issues/1126) +- Implement new WorkGraph push/pop [\#1108](https://github.com/kokkos/kokkos/issues/1108) +- Kokkos\_ENABLE\_Cuda\_Lambda should default ON [\#1101](https://github.com/kokkos/kokkos/issues/1101) +- Add multidimensional parallel for example and improve unit test [\#1064](https://github.com/kokkos/kokkos/issues/1064) +- Fix ROCm: Performance tests not building [\#1038](https://github.com/kokkos/kokkos/issues/1038) +- Make KOKKOS\_ALIGN\_SIZE a configure-time option [\#1004](https://github.com/kokkos/kokkos/issues/1004) +- Make alignment consistent [\#809](https://github.com/kokkos/kokkos/issues/809) +- Improve subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615) + +**Fixed bugs:** + +- Kokkos::vector fixes for application [\#1134](https://github.com/kokkos/kokkos/issues/1134) +- DynamicView non-power of two value\_type [\#1177](https://github.com/kokkos/kokkos/issues/1177) +- Memory pool bug [\#1154](https://github.com/kokkos/kokkos/issues/1154) +- Cuda launch bounds performance regression bug [\#1140](https://github.com/kokkos/kokkos/issues/1140) +- Significant performance regression in LAMMPS after updating Kokkos [\#1139](https://github.com/kokkos/kokkos/issues/1139) +- CUDA compile error [\#1128](https://github.com/kokkos/kokkos/issues/1128) +- MDRangePolicy neg idx test failure in debug mode 
[\#1113](https://github.com/kokkos/kokkos/issues/1113) +- subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615) + +## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04) + +**Implemented enhancements:** + +- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082) +- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071) +- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052) +- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019) +- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952) +- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857) + +**Fixed bugs:** + +- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048) +- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041) +- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094) +- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070) + +## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00) + +**Implemented enhancements:** + +- Added ROCm backend to support AMD GPUs +- Kokkos::complex\<T\> behaves slightly differently from std::complex\<T\> [\#1011](https://github.com/kokkos/kokkos/issues/1011) +- Kokkos::Experimental::Crs constructor arguments were in the wrong order [\#992](https://github.com/kokkos/kokkos/issues/992) +- Work graph construction ease-of-use (one lambda for count 
and fill) [\#991](https://github.com/kokkos/kokkos/issues/991) +- when\_all returns pointer of futures (improved interface) [\#990](https://github.com/kokkos/kokkos/issues/990) +- Allow assignment of LayoutLeft to LayoutRight or vice versa for rank-0 Views [\#594](https://github.com/kokkos/kokkos/issues/594) +- Changed the meaning of Kokkos\_ENABLE\_CXX11\_DISPATCH\_LAMBDA [\#1035](https://github.com/kokkos/kokkos/issues/1035) + +**Fixed bugs:** + +- memory pool default constructor does not properly set member variables. [\#1007](https://github.com/kokkos/kokkos/issues/1007) + +## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13) + +**Implemented enhancements:** + +- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406) +- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630) +- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898) +- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904) +- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737) +- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890) +- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843) +- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842) +- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870) +- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824) +- "Declared but never referenced" warnings 
[\#853](https://github.com/kokkos/kokkos/issues/853) +- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852) +- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771) +- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros [\#716](https://github.com/kokkos/kokkos/issues/716) +- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668) +- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566) +- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214) + +**Fixed bugs:** + +- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975) +- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941) +- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940) +- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939) +- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917) +- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863) +- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862) +- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860) +- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829) +- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826) +- Kokkos CUDA failes to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776) +- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767) +- 
CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758) +- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670) +- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560) + + +## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05) + +**Implemented enhancements:** + +- Harmonize Custom Reductions over nesting levels [\#802](https://github.com/kokkos/kokkos/issues/802) +- Prevent users directly including KokkosCore\_config.h [\#815](https://github.com/kokkos/kokkos/issues/815) +- DualView aborts on concurrent host/device modify \(in debug mode\) [\#814](https://github.com/kokkos/kokkos/issues/814) +- Abort when running on a NVIDIA CC5.0 or higher architecture with code compiled for CC \< 5.0 [\#813](https://github.com/kokkos/kokkos/issues/813) +- Add "name" function to ExecSpaces [\#806](https://github.com/kokkos/kokkos/issues/806) +- Allow null Future in task spawn dependences [\#795](https://github.com/kokkos/kokkos/issues/795) +- Add Unit Tests for Kokkos::complex [\#785](https://github.com/kokkos/kokkos/issues/785) +- Add pow function for Kokkos::complex [\#784](https://github.com/kokkos/kokkos/issues/784) +- Square root of a complex [\#729](https://github.com/kokkos/kokkos/issues/729) +- Command line processing of --threads argument prevents users from having any commandline arguments starting with --threads [\#760](https://github.com/kokkos/kokkos/issues/760) +- Protected deprecated API with appropriate macro [\#756](https://github.com/kokkos/kokkos/issues/756) +- Allow task scheduler memory pool to be used by tasks [\#747](https://github.com/kokkos/kokkos/issues/747) +- View bounds checking on host-side performance: constructing a std::string [\#723](https://github.com/kokkos/kokkos/issues/723) +- Add check for AppleClang as compiler distinct from check for Clang. 
[\#705](https://github.com/kokkos/kokkos/issues/705) +- Uninclude source files for specific configurations to prevent link warning. [\#701](https://github.com/kokkos/kokkos/issues/701) +- Add --small option to snapshot script [\#697](https://github.com/kokkos/kokkos/issues/697) +- CMake Standalone Support [\#674](https://github.com/kokkos/kokkos/issues/674) +- CMake build unit test and install [\#808](https://github.com/kokkos/kokkos/issues/808) +- CMake: Fix having kokkos as a subdirectory in a pure cmake project [\#629](https://github.com/kokkos/kokkos/issues/629) +- Tribits macro assumes build directory is in top level source directory [\#654](https://github.com/kokkos/kokkos/issues/654) +- Use bin/nvcc\_wrapper, not config/nvcc\_wrapper [\#562](https://github.com/kokkos/kokkos/issues/562) +- Allow MemoryPool::allocate\(\) to be called from multiple threads per warp. [\#487](https://github.com/kokkos/kokkos/issues/487) +- Allow MemoryPool::allocate\(\) to be called from multiple threads per warp. [\#487](https://github.com/kokkos/kokkos/issues/487) +- Move OpenMP 4.5 OpenMPTarget backend into Develop [\#456](https://github.com/kokkos/kokkos/issues/456) +- Testing on ARM testbed [\#288](https://github.com/kokkos/kokkos/issues/288) + +**Fixed bugs:** + +- Fix label in OpenMP parallel\_reduce verify\_initialized [\#834](https://github.com/kokkos/kokkos/issues/834) +- TeamScratch Level 1 on Cuda hangs [\#820](https://github.com/kokkos/kokkos/issues/820) +- \[bug\] memory pool. 
[\#786](https://github.com/kokkos/kokkos/issues/786) +- Some Reduction Tests fail on Intel 18 with aggressive vectorization on [\#774](https://github.com/kokkos/kokkos/issues/774) +- Error copying dynamic view on copy of memory pool [\#773](https://github.com/kokkos/kokkos/issues/773) +- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758) +- ThreadVectorRange Customized Reduction Bug [\#739](https://github.com/kokkos/kokkos/issues/739) +- set\_scratch\_size overflows [\#726](https://github.com/kokkos/kokkos/issues/726) +- Get wrong results for compiler checks in Makefile on OS X. [\#706](https://github.com/kokkos/kokkos/issues/706) +- Fix check if multiple host architectures enabled. [\#702](https://github.com/kokkos/kokkos/issues/702) +- Threads Backend Does not Pass on Cray Compilers [\#609](https://github.com/kokkos/kokkos/issues/609) +- Rare bug in memory pool where allocation can finish on superblock in empty state [\#452](https://github.com/kokkos/kokkos/issues/452) +- LDFLAGS in core/unit\_test/Makefile: potential "undefined reference" to pthread lib [\#148](https://github.com/kokkos/kokkos/issues/148) + +## [2.03.00](https://github.com/kokkos/kokkos/tree/2.03.00) (2017-04-25) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.15...2.03.00) + +**Implemented enhancements:** + +- UnorderedMap: make it accept Devices or MemorySpaces [\#711](https://github.com/kokkos/kokkos/issues/711) +- sort to accept DynamicView and \[begin,end\) indices [\#691](https://github.com/kokkos/kokkos/issues/691) +- ENABLE Macros should only be used via \#ifdef or \#if defined [\#675](https://github.com/kokkos/kokkos/issues/675) +- Remove impl/Kokkos\_Synchronic\_\* [\#666](https://github.com/kokkos/kokkos/issues/666) +- Turning off IVDEP for Intel 14. 
[\#638](https://github.com/kokkos/kokkos/issues/638) +- Using an installed Kokkos in a target application using CMake [\#633](https://github.com/kokkos/kokkos/issues/633) +- Create Kokkos Bill of Materials [\#632](https://github.com/kokkos/kokkos/issues/632) +- MDRangePolicy and tagged evaluators [\#547](https://github.com/kokkos/kokkos/issues/547) +- Add PGI support [\#289](https://github.com/kokkos/kokkos/issues/289) + +**Fixed bugs:** + +- Output from PerTeam fails [\#733](https://github.com/kokkos/kokkos/issues/733) +- Cuda: architecture flag not added to link line [\#688](https://github.com/kokkos/kokkos/issues/688) +- Getting large chunks of memory for a thread team in a universal way [\#664](https://github.com/kokkos/kokkos/issues/664) +- Kokkos RNG normal\(\) function hangs for small seed value [\#655](https://github.com/kokkos/kokkos/issues/655) +- Kokkos Tests Errors on Shepard/HSW Builds [\#644](https://github.com/kokkos/kokkos/issues/644) + +## [2.02.15](https://github.com/kokkos/kokkos/tree/2.02.15) (2017-02-10) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.07...2.02.15) + +**Implemented enhancements:** + +- Containers: Adding block partitioning to StaticCrsGraph [\#625](https://github.com/kokkos/kokkos/issues/625) +- Kokkos Make System can induce Errors on Cray Volta System [\#610](https://github.com/kokkos/kokkos/issues/610) +- OpenMP: error out if KOKKOS\_HAVE\_OPENMP is defined but not \_OPENMP [\#605](https://github.com/kokkos/kokkos/issues/605) +- CMake: fix standalone build with tests [\#604](https://github.com/kokkos/kokkos/issues/604) +- Change README \(that GitHub shows when opening Kokkos project page\) to tell users how to submit PRs [\#597](https://github.com/kokkos/kokkos/issues/597) +- Add correctness testing for all operators of Atomic View [\#420](https://github.com/kokkos/kokkos/issues/420) +- Allow assignment of Views with compatible memory spaces [\#290](https://github.com/kokkos/kokkos/issues/290) +- Build only 
one version of Kokkos library for tests [\#213](https://github.com/kokkos/kokkos/issues/213) +- Clean out old KOKKOS\_HAVE\_CXX11 macros clauses [\#156](https://github.com/kokkos/kokkos/issues/156) +- Harmonize Macro names [\#150](https://github.com/kokkos/kokkos/issues/150) + +**Fixed bugs:** + +- Cray and PGI: Kokkos\_Parallel\_Reduce [\#634](https://github.com/kokkos/kokkos/issues/634) +- Kokkos Make System can induce Errors on Cray Volta System [\#610](https://github.com/kokkos/kokkos/issues/610) +- Normal\(\) function random number generator doesn't give the expected distribution [\#592](https://github.com/kokkos/kokkos/issues/592) + +## [2.02.07](https://github.com/kokkos/kokkos/tree/2.02.07) (2016-12-16) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.01...2.02.07) + +**Implemented enhancements:** + +- Add CMake option to enable Cuda Lambda support [\#589](https://github.com/kokkos/kokkos/issues/589) +- Add CMake option to enable Cuda RDC support [\#588](https://github.com/kokkos/kokkos/issues/588) +- Add Initial Intel Sky Lake Xeon-HPC Compiler Support to Kokkos Make System [\#584](https://github.com/kokkos/kokkos/issues/584) +- Building Tutorial Examples [\#582](https://github.com/kokkos/kokkos/issues/582) +- Internal way for using ThreadVectorRange without TeamHandle [\#574](https://github.com/kokkos/kokkos/issues/574) +- Testing: Add testing for uvm and rdc [\#571](https://github.com/kokkos/kokkos/issues/571) +- Profiling: Add Memory Tracing and Region Markers [\#557](https://github.com/kokkos/kokkos/issues/557) +- nvcc\_wrapper not installed with Kokkos built with CUDA through CMake [\#543](https://github.com/kokkos/kokkos/issues/543) +- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541) +- Benchmarks: Add Gather benchmark [\#536](https://github.com/kokkos/kokkos/issues/536) +- Testing: add spot\_check option to test\_all\_sandia [\#535](https://github.com/kokkos/kokkos/issues/535) +- Deprecate 
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace [\#527](https://github.com/kokkos/kokkos/issues/527) +- Add AtomicAdd support for 64bit float for Pascal [\#522](https://github.com/kokkos/kokkos/issues/522) +- Add Restrict and Aligned memory trait [\#517](https://github.com/kokkos/kokkos/issues/517) +- Kokkos Tests are Not Run using Compiler Optimization [\#501](https://github.com/kokkos/kokkos/issues/501) +- Add support for clang 3.7 w/ openmp backend [\#393](https://github.com/kokkos/kokkos/issues/393) +- Provide an error throw class [\#79](https://github.com/kokkos/kokkos/issues/79) + +**Fixed bugs:** + +- Cuda UVM Allocation test broken with UVM as default space [\#586](https://github.com/kokkos/kokkos/issues/586) +- Bug \(develop branch only\): multiple tests are now failing when forcing uvm usage. [\#570](https://github.com/kokkos/kokkos/issues/570) +- Error in generate\_makefile.sh for Kokkos when Compiler is Empty String/Fails [\#568](https://github.com/kokkos/kokkos/issues/568) +- XL 13.1.4 incorrect C++11 flag [\#553](https://github.com/kokkos/kokkos/issues/553) +- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541) +- Installing Library on MAC broken due to cp -u [\#539](https://github.com/kokkos/kokkos/issues/539) +- Intel Nightly Testing with Debug enabled fails [\#534](https://github.com/kokkos/kokkos/issues/534) + +## [2.02.01](https://github.com/kokkos/kokkos/tree/2.02.01) (2016-11-01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.00...2.02.01) + +**Implemented enhancements:** + +- Add Changelog generation to our process. 
[\#506](https://github.com/kokkos/kokkos/issues/506) + +**Fixed bugs:** + +- Test scratch\_request fails in Serial with Debug enabled [\#520](https://github.com/kokkos/kokkos/issues/520) +- Bug In BoundsCheck for DynRankView [\#516](https://github.com/kokkos/kokkos/issues/516) + +## [2.02.00](https://github.com/kokkos/kokkos/tree/2.02.00) (2016-10-30) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.10...2.02.00) + +**Implemented enhancements:** + +- Add PowerPC assembly for grabbing clock register in memory pool [\#511](https://github.com/kokkos/kokkos/issues/511) +- Add GCC 6.x support [\#508](https://github.com/kokkos/kokkos/issues/508) +- Test install and build against installed library [\#498](https://github.com/kokkos/kokkos/issues/498) +- Makefile.kokkos adds expt-extended-lambda to cuda build with clang [\#490](https://github.com/kokkos/kokkos/issues/490) +- Add top-level makefile option to just test kokkos-core unit-test [\#485](https://github.com/kokkos/kokkos/issues/485) +- Split and harmonize Object Files of Core UnitTests to increase build parallelism [\#484](https://github.com/kokkos/kokkos/issues/484) +- LayoutLeft to LayoutLeft subview for 3D and 4D views [\#473](https://github.com/kokkos/kokkos/issues/473) +- Add official Cuda 8.0 support [\#468](https://github.com/kokkos/kokkos/issues/468) +- Allow C++1Z Flag for Class Lambda capture [\#465](https://github.com/kokkos/kokkos/issues/465) +- Add Clang 4.0+ compilation of Cuda code [\#455](https://github.com/kokkos/kokkos/issues/455) +- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445) +- Add name of view to "View bounds error" [\#432](https://github.com/kokkos/kokkos/issues/432) +- Move Sort Binning Operators into Kokkos namespace [\#421](https://github.com/kokkos/kokkos/issues/421) +- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396) +- Import 
WithoutInitializing and AllowPadding into Kokkos namespace [\#325](https://github.com/kokkos/kokkos/issues/325) +- TeamThreadRange requires begin, end to be the same type [\#305](https://github.com/kokkos/kokkos/issues/305) +- CudaUVMSpace should track \# allocations, due to CUDA limit on \# UVM allocations [\#300](https://github.com/kokkos/kokkos/issues/300) +- Remove old View and its infrastructure [\#259](https://github.com/kokkos/kokkos/issues/259) + +**Fixed bugs:** + +- Bug in TestCuda\_Other.cpp: most likely assembly inserted into Device code [\#515](https://github.com/kokkos/kokkos/issues/515) +- Cuda Compute Capability check of GPU is outdated [\#509](https://github.com/kokkos/kokkos/issues/509) +- multi\_scratch test with hwloc and pthreads seg-faults. [\#504](https://github.com/kokkos/kokkos/issues/504) +- generate\_makefile.bash: "make install" is broken [\#503](https://github.com/kokkos/kokkos/issues/503) +- make clean in Out of Source Build/Tests Does Not Work Correctly [\#502](https://github.com/kokkos/kokkos/issues/502) +- Makefiles for test and examples have issues in Cuda when CXX is not explicitly specified [\#497](https://github.com/kokkos/kokkos/issues/497) +- Dispatch lambda test directly inside GTEST macro doesn't work with nvcc [\#491](https://github.com/kokkos/kokkos/issues/491) +- UnitTests with HWLOC enabled fail if run with mpirun bound to a single core [\#489](https://github.com/kokkos/kokkos/issues/489) +- Failing Reducer Test on Mac with Pthreads [\#479](https://github.com/kokkos/kokkos/issues/479) +- make test Dumps Error with Clang Not Found [\#471](https://github.com/kokkos/kokkos/issues/471) +- OpenMP TeamPolicy member broadcast not using correct volatile shared variable [\#424](https://github.com/kokkos/kokkos/issues/424) +- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396) +- New task policy implementation is pulling in old experimental code. 
[\#372](https://github.com/kokkos/kokkos/issues/372) +- MemoryPool unit test hangs on Power8 with GCC 6.1.0 [\#298](https://github.com/kokkos/kokkos/issues/298) + +## [2.01.10](https://github.com/kokkos/kokkos/tree/2.01.10) (2016-09-27) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.06...2.01.10) + +**Implemented enhancements:** + +- Enable Profiling by default in Tribits build [\#438](https://github.com/kokkos/kokkos/issues/438) +- parallel\_reduce\(0\), parallel\_scan\(0\) unit tests [\#436](https://github.com/kokkos/kokkos/issues/436) +- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351) +- Fix tutorials to track new Kokkos::View [\#323](https://github.com/kokkos/kokkos/issues/323) +- Rename team policy set\_scratch\_size. [\#195](https://github.com/kokkos/kokkos/issues/195) + +**Fixed bugs:** + +- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445) +- Makefile spits syntax error [\#435](https://github.com/kokkos/kokkos/issues/435) +- Kokkos::sort fails for view with all the same values [\#422](https://github.com/kokkos/kokkos/issues/422) +- Generic Reducers: can't accept inline constructed reducer [\#404](https://github.com/kokkos/kokkos/issues/404) +- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351) +- const subview of const view with compile time dimensions on Cuda backend [\#310](https://github.com/kokkos/kokkos/issues/310) +- Kokkos \(in Trilinos\) Causes Internal Compiler Error on CUDA 8.0.21-EA on POWER8 [\#307](https://github.com/kokkos/kokkos/issues/307) +- Core Oversubscription Detection Broken? 
[\#159](https://github.com/kokkos/kokkos/issues/159) + + +## [2.01.06](https://github.com/kokkos/kokkos/tree/2.01.06) (2016-09-02) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.00...2.01.06) + +**Implemented enhancements:** + +- Add "standard" reducers for lambda-supportable customized reduce [\#411](https://github.com/kokkos/kokkos/issues/411) +- TaskPolicy - single thread back-end execution [\#390](https://github.com/kokkos/kokkos/issues/390) +- Kokkos master clone tag [\#387](https://github.com/kokkos/kokkos/issues/387) +- Query memory requirements from task policy [\#378](https://github.com/kokkos/kokkos/issues/378) +- Output order of test\_atomic.cpp is confusing [\#373](https://github.com/kokkos/kokkos/issues/373) +- Missing testing for atomics [\#341](https://github.com/kokkos/kokkos/issues/341) +- Feature request for Kokkos to provide Kokkos::atomic\_fetch\_max and atomic\_fetch\_min [\#336](https://github.com/kokkos/kokkos/issues/336) +- TaskPolicy\<Cuda\> performance requires teams mapped to warps [\#218](https://github.com/kokkos/kokkos/issues/218) + +**Fixed bugs:** + +- Reduce with Teams broken for custom initialize [\#407](https://github.com/kokkos/kokkos/issues/407) +- Failing Kokkos build on Debian [\#402](https://github.com/kokkos/kokkos/issues/402) +- Failing Tests on NVIDIA Pascal GPUs [\#398](https://github.com/kokkos/kokkos/issues/398) +- Algorithms: fill\_random assumes dimensions fit in unsigned int [\#389](https://github.com/kokkos/kokkos/issues/389) +- Kokkos::subview with RandomAccess Memory Trait [\#385](https://github.com/kokkos/kokkos/issues/385) +- Build warning \(signed / unsigned comparison\) in Cuda implementation [\#365](https://github.com/kokkos/kokkos/issues/365) +- wrong results for a parallel\_reduce with CUDA8 / Maxwell50 [\#352](https://github.com/kokkos/kokkos/issues/352) +- Hierarchical parallelism - 3 level unit test [\#344](https://github.com/kokkos/kokkos/issues/344) +- Can I allocate a View w/ both 
WithoutInitializing & AllowPadding? [\#324](https://github.com/kokkos/kokkos/issues/324) +- subview View layout determination [\#309](https://github.com/kokkos/kokkos/issues/309) +- Unit tests with Cuda - Maxwell [\#196](https://github.com/kokkos/kokkos/issues/196) + +## [2.01.00](https://github.com/kokkos/kokkos/tree/2.01.00) (2016-07-21) +[Full Changelog](https://github.com/kokkos/kokkos/compare/End_C++98...2.01.00) + +**Implemented enhancements:** + +- Edit ViewMapping so assigning Views with the same custom layout compiles when const casting [\#327](https://github.com/kokkos/kokkos/issues/327) +- DynRankView: Performance improvement for operator\(\) [\#321](https://github.com/kokkos/kokkos/issues/321) +- Interoperability between static and dynamic rank views [\#295](https://github.com/kokkos/kokkos/issues/295) +- subview member function ? [\#280](https://github.com/kokkos/kokkos/issues/280) +- Inter-operatibility between View and DynRankView. [\#245](https://github.com/kokkos/kokkos/issues/245) +- \(Trilinos\) build warning in atomic\_assign, with Kokkos::complex [\#177](https://github.com/kokkos/kokkos/issues/177) +- View\<\>::shmem\_size should runtime check for number of arguments equal to rank [\#176](https://github.com/kokkos/kokkos/issues/176) +- Custom reduction join via lambda argument [\#99](https://github.com/kokkos/kokkos/issues/99) +- DynRankView with 0 dimensions passed in at construction [\#293](https://github.com/kokkos/kokkos/issues/293) +- Inject view\_alloc and friends into Kokkos namespace [\#292](https://github.com/kokkos/kokkos/issues/292) +- Less restrictive TeamPolicy reduction on Cuda [\#286](https://github.com/kokkos/kokkos/issues/286) +- deep\_copy using remap with source execution space [\#267](https://github.com/kokkos/kokkos/issues/267) +- Suggestion: Enable opt-in L1 caching via nvcc-wrapper [\#261](https://github.com/kokkos/kokkos/issues/261) +- More flexible create\_mirror functions 
[\#260](https://github.com/kokkos/kokkos/issues/260) +- Rename View::memory\_span to View::required\_allocation\_size [\#256](https://github.com/kokkos/kokkos/issues/256) +- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237) +- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237) +- Kokkos::Timer [\#234](https://github.com/kokkos/kokkos/issues/234) +- Fence CudaUVMSpace allocations [\#230](https://github.com/kokkos/kokkos/issues/230) +- View::operator\(\) accept std::is\_integral and std::is\_enum [\#227](https://github.com/kokkos/kokkos/issues/227) +- Allocating zero size View [\#216](https://github.com/kokkos/kokkos/issues/216) +- Thread scalable memory pool [\#212](https://github.com/kokkos/kokkos/issues/212) +- Add a way to disable memory leak output [\#194](https://github.com/kokkos/kokkos/issues/194) +- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192) +- Runtime rank wrapper for View [\#189](https://github.com/kokkos/kokkos/issues/189) +- Profiling Interface [\#158](https://github.com/kokkos/kokkos/issues/158) +- Fix View assignment \(of managed to unmanaged\) [\#153](https://github.com/kokkos/kokkos/issues/153) +- Add unit test for assignment of managed View to unmanaged View [\#152](https://github.com/kokkos/kokkos/issues/152) +- Check for oversubscription of threads with MPI in Kokkos::initialize [\#149](https://github.com/kokkos/kokkos/issues/149) +- Dynamic resizeable 1dimensional view [\#143](https://github.com/kokkos/kokkos/issues/143) +- Develop TaskPolicy for CUDA [\#142](https://github.com/kokkos/kokkos/issues/142) +- New View : Test Compilation Downstream [\#138](https://github.com/kokkos/kokkos/issues/138) +- New View Implementation [\#135](https://github.com/kokkos/kokkos/issues/135) +- Add variant of subview that lets users add traits 
[\#134](https://github.com/kokkos/kokkos/issues/134) +- NVCC-WRAPPER: Add --host-only flag [\#121](https://github.com/kokkos/kokkos/issues/121) +- Address gtest issue with TriBITS Kokkos build outside of Trilinos [\#117](https://github.com/kokkos/kokkos/issues/117) +- Make tests pass with -expt-extended-lambda on CUDA [\#108](https://github.com/kokkos/kokkos/issues/108) +- Dynamic scheduling for parallel\_for and parallel\_reduce [\#106](https://github.com/kokkos/kokkos/issues/106) +- Runtime or compile time error when reduce functor's join is not properly specified as const member function or with volatile arguments [\#105](https://github.com/kokkos/kokkos/issues/105) +- Error out when the number of threads is modified after kokkos is initialized [\#104](https://github.com/kokkos/kokkos/issues/104) +- Porting to POWER and remove assumption of X86 default [\#103](https://github.com/kokkos/kokkos/issues/103) +- Dynamic scheduling option for RangePolicy [\#100](https://github.com/kokkos/kokkos/issues/100) +- SharedMemory Support for Lambdas [\#81](https://github.com/kokkos/kokkos/issues/81) +- Recommended TeamSize for Lambdas [\#80](https://github.com/kokkos/kokkos/issues/80) +- Add Aggressive Vectorization Compilation mode [\#72](https://github.com/kokkos/kokkos/issues/72) +- Dynamic scheduling team execution policy [\#53](https://github.com/kokkos/kokkos/issues/53) +- UVM allocations in multi-GPU systems [\#50](https://github.com/kokkos/kokkos/issues/50) +- Synchronic in Kokkos::Impl [\#44](https://github.com/kokkos/kokkos/issues/44) +- index and dimension types in for loops [\#28](https://github.com/kokkos/kokkos/issues/28) +- Subview assign of 1D Strided with stride 1 to LayoutLeft/Right [\#1](https://github.com/kokkos/kokkos/issues/1) + +**Fixed bugs:** + +- misspelled variable name in Kokkos\_Atomic\_Fetch + missing unit tests [\#340](https://github.com/kokkos/kokkos/issues/340) +- seg fault Kokkos::Impl::CudaInternal::print\_configuration 
[\#338](https://github.com/kokkos/kokkos/issues/338) +- Clang compiler error with named parallel\_reduce, tags, and TeamPolicy. [\#335](https://github.com/kokkos/kokkos/issues/335) +- Shared Memory Allocation Error at parallel\_reduce [\#311](https://github.com/kokkos/kokkos/issues/311) +- DynRankView: Fix resize and realloc [\#303](https://github.com/kokkos/kokkos/issues/303) +- Scratch memory and dynamic scheduling [\#279](https://github.com/kokkos/kokkos/issues/279) +- MemoryPool infinite loop when out of memory [\#312](https://github.com/kokkos/kokkos/issues/312) +- Kokkos DynRankView changes break Sacado and Panzer [\#299](https://github.com/kokkos/kokkos/issues/299) +- MemoryPool fails to compile on non-cuda non-x86 [\#297](https://github.com/kokkos/kokkos/issues/297) +- Random Number Generator Fix [\#296](https://github.com/kokkos/kokkos/issues/296) +- View template parameter ordering Bug [\#282](https://github.com/kokkos/kokkos/issues/282) +- Serial task policy broken. [\#281](https://github.com/kokkos/kokkos/issues/281) +- deep\_copy with LayoutStride should not memcpy [\#262](https://github.com/kokkos/kokkos/issues/262) +- DualView::need\_sync should be a const method [\#248](https://github.com/kokkos/kokkos/issues/248) +- Arbitrary-sized atomics on GPUs broken; loop forever [\#238](https://github.com/kokkos/kokkos/issues/238) +- boolean reduction value\_type changes answer [\#225](https://github.com/kokkos/kokkos/issues/225) +- Custom init\(\) function for parallel\_reduce with array value\_type [\#210](https://github.com/kokkos/kokkos/issues/210) +- unit\_test Makefile is Broken - Recursively Calls itself until Machine Apocalypse. 
[\#202](https://github.com/kokkos/kokkos/issues/202) +- nvcc\_wrapper Does Not Support -Xcompiler \<compiler option\> [\#198](https://github.com/kokkos/kokkos/issues/198) +- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192) +- Kokkos Threads Backend impl\_shared\_alloc Broken on Intel 16.1 \(Shepard Haswell\) [\#186](https://github.com/kokkos/kokkos/issues/186) +- pthread back end hangs if used uninitialized [\#182](https://github.com/kokkos/kokkos/issues/182) +- parallel\_reduce of size 0, not calling init/join [\#175](https://github.com/kokkos/kokkos/issues/175) +- Bug in Threads with OpenMP enabled [\#173](https://github.com/kokkos/kokkos/issues/173) +- KokkosExp\_SharedAlloc, m\_team\_work\_index inaccessible [\#166](https://github.com/kokkos/kokkos/issues/166) +- 128-bit CAS without Assembly Broken? [\#161](https://github.com/kokkos/kokkos/issues/161) +- fatal error: Cuda/Kokkos\_Cuda\_abort.hpp: No such file or directory [\#157](https://github.com/kokkos/kokkos/issues/157) +- Power8: Fix OpenMP backend [\#139](https://github.com/kokkos/kokkos/issues/139) +- Data race in Kokkos OpenMP initialization [\#131](https://github.com/kokkos/kokkos/issues/131) +- parallel\_launch\_local\_memory and cuda 7.5 [\#125](https://github.com/kokkos/kokkos/issues/125) +- Resize can fail with Cuda due to asynchronous dispatch [\#119](https://github.com/kokkos/kokkos/issues/119) +- Qthread taskpolicy initialization bug. 
[\#92](https://github.com/kokkos/kokkos/issues/92) +- Windows: sys/mman.h [\#89](https://github.com/kokkos/kokkos/issues/89) +- Windows: atomic\_fetch\_sub\(\) [\#88](https://github.com/kokkos/kokkos/issues/88) +- Windows: snprintf [\#87](https://github.com/kokkos/kokkos/issues/87) +- Parallel\_Reduce with TeamPolicy and league size of 0 returns garbage [\#85](https://github.com/kokkos/kokkos/issues/85) +- Throw with Cuda when using \(2D\) team\_policy parallel\_reduce with less than a warp size [\#76](https://github.com/kokkos/kokkos/issues/76) +- Scalar views don't work with Kokkos::Atomic memory trait [\#69](https://github.com/kokkos/kokkos/issues/69) +- Reduce the number of threads per team for Cuda [\#63](https://github.com/kokkos/kokkos/issues/63) +- Named Kernels fail for reductions with CUDA [\#60](https://github.com/kokkos/kokkos/issues/60) +- Kokkos View dimension\_\(\) for long returning unsigned int [\#20](https://github.com/kokkos/kokkos/issues/20) +- atomic test hangs with LLVM [\#6](https://github.com/kokkos/kokkos/issues/6) +- OpenMP Test should set omp\_set\_num\_threads to 1 [\#4](https://github.com/kokkos/kokkos/issues/4) + +**Closed issues:** + +- develop branch broken with CUDA 8 and --expt-extended-lambda [\#354](https://github.com/kokkos/kokkos/issues/354) +- --arch=KNL with Intel 2016 build failure [\#349](https://github.com/kokkos/kokkos/issues/349) +- Error building with Cuda when passing -DKOKKOS\_CUDA\_USE\_LAMBDA to generate\_makefile.bash [\#343](https://github.com/kokkos/kokkos/issues/343) +- Can I safely use int indices in a 2-D View with capacity \> 2B? 
[\#318](https://github.com/kokkos/kokkos/issues/318) +- Kokkos::ViewAllocateWithoutInitializing is not working [\#317](https://github.com/kokkos/kokkos/issues/317) +- Intel build on Mac OS X [\#277](https://github.com/kokkos/kokkos/issues/277) +- deleted [\#271](https://github.com/kokkos/kokkos/issues/271) +- Broken Mira build [\#268](https://github.com/kokkos/kokkos/issues/268) +- 32-bit build [\#246](https://github.com/kokkos/kokkos/issues/246) +- parallel\_reduce with RDC crashes linker [\#232](https://github.com/kokkos/kokkos/issues/232) +- build of Kokkos\_Sparse\_MV\_impl\_spmv\_Serial.cpp.o fails if you use nvcc and have cuda disabled [\#209](https://github.com/kokkos/kokkos/issues/209) +- Kokkos Serial execution space is not tested with TeamPolicy. [\#207](https://github.com/kokkos/kokkos/issues/207) +- Unit test failure on Hansen KokkosCore\_UnitTest\_Cuda\_MPI\_1 [\#200](https://github.com/kokkos/kokkos/issues/200) +- nvcc compiler warning: calling a \_\_host\_\_ function from a \_\_host\_\_ \_\_device\_\_ function is not allowed [\#180](https://github.com/kokkos/kokkos/issues/180) +- Intel 15 build error with defaulted "move" operators [\#171](https://github.com/kokkos/kokkos/issues/171) +- missing libkokkos.a during Trilinos 12.4.2 build, yet other libkokkos\*.a libs are there [\#165](https://github.com/kokkos/kokkos/issues/165) +- Tie atomic updates to execution space or even to thread team? 
\(speculation\) [\#144](https://github.com/kokkos/kokkos/issues/144) +- New View: Compiletime/size Test [\#137](https://github.com/kokkos/kokkos/issues/137) +- New View : Performance Test [\#136](https://github.com/kokkos/kokkos/issues/136) +- Signed/unsigned comparison warning in CUDA parallel [\#130](https://github.com/kokkos/kokkos/issues/130) +- Kokkos::complex: Need op\* w/ std::complex & real [\#126](https://github.com/kokkos/kokkos/issues/126) +- Use uintptr\_t for casting pointers [\#110](https://github.com/kokkos/kokkos/issues/110) +- Default thread mapping behavior between P and Q threads. [\#91](https://github.com/kokkos/kokkos/issues/91) +- Windows: Atomic\_Fetch\_Exchange\(\) return type [\#90](https://github.com/kokkos/kokkos/issues/90) +- Synchronic unit test is way too long [\#84](https://github.com/kokkos/kokkos/issues/84) +- nvcc\_wrapper -\> $\(NVCC\_WRAPPER\) [\#42](https://github.com/kokkos/kokkos/issues/42) +- Check compiler version and print helpful message [\#39](https://github.com/kokkos/kokkos/issues/39) +- Kokkos shared memory on Cuda uses a lot of registers [\#31](https://github.com/kokkos/kokkos/issues/31) +- Can not pass unit test `cuda.space` without a GT 720 [\#25](https://github.com/kokkos/kokkos/issues/25) +- Makefile.kokkos lacks bounds checking option that CMake has [\#24](https://github.com/kokkos/kokkos/issues/24) +- Kokkos can not complete unit tests with CUDA UVM enabled [\#23](https://github.com/kokkos/kokkos/issues/23) +- Simplify teams + shared memory histogram example to remove vectorization [\#21](https://github.com/kokkos/kokkos/issues/21) +- Kokkos needs to rever to ${PROJECT\_NAME}\_ENABLE\_CXX11 not Trilinos\_ENABLE\_CXX11 [\#17](https://github.com/kokkos/kokkos/issues/17) +- Kokkos Base Makefile adds AVX to KNC Build [\#16](https://github.com/kokkos/kokkos/issues/16) +- MS Visual Studio 2013 Build Errors [\#9](https://github.com/kokkos/kokkos/issues/9) +- subview\(X, ALL\(\), j\) for 2-D LayoutRight View X: should 
it view a column? [\#5](https://github.com/kokkos/kokkos/issues/5) + +## [End_C++98](https://github.com/kokkos/kokkos/tree/End_C++98) (2015-04-15) + + +\* *This Change Log was automatically generated by [github_changelog_generator](https://github.com/skywinder/Github-Changelog-Generator)* diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fc1bf7d2f7fd3b02a785b1184923cde07b438b2 --- /dev/null +++ b/packages/kokkos/CMakeLists.txt @@ -0,0 +1,289 @@ + +# Disable in-source builds to prevent source tree corruption. +if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" ) + message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files." ) +endif() + +# We want to determine if options are given with the wrong case +# In order to detect which arguments are given to compare against +# the list of valid arguments, at the beginning here we need to +# form a list of all the given variables. If it begins with any +# case of KoKkOS, we add it to the list. + + +GET_CMAKE_PROPERTY(_variableNames VARIABLES) +SET(KOKKOS_GIVEN_VARIABLES) +FOREACH (var ${_variableNames}) + STRING(TOUPPER ${var} UC_VAR) + STRING(FIND ${UC_VAR} KOKKOS IDX) + IF (${IDX} EQUAL 0) + LIST(APPEND KOKKOS_GIVEN_VARIABLES ${var}) + ENDIF() +ENDFOREACH() + +# Basic initialization (Used in KOKKOS_SETTINGS) +SET(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +SET(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +SET(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) +SET(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) +SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) + +# Needed to simplify syntax of if statements +CMAKE_POLICY(SET CMP0054 NEW) +# Needed to make IN_LIST a valid operator +CMAKE_POLICY(SET CMP0057 NEW) + +# Is this a build as part of Trilinos? 
+IF(COMMAND TRIBITS_PACKAGE_DECL) + SET(KOKKOS_HAS_TRILINOS ON) +ELSE() + SET(KOKKOS_HAS_TRILINOS OFF) +ENDIF() +# Is this build a subdirectory of another project +GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY) + + +INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) +INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) + +SET(KOKKOS_ENABLED_OPTIONS) #exported in config file +SET(KOKKOS_ENABLED_DEVICES) #exported in config file +SET(KOKKOS_ENABLED_TPLS) #exported in config file +SET(KOKKOS_ENABLED_ARCH_LIST) #exported in config file + +#These are helper flags used for sanity checks during config +#Certain features should depend on other features being configured first +SET(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies +SET(KOKKOS_CFG_DAG_DEVICES_DONE Off) +SET(KOKKOS_CFG_DAG_OPTIONS_DONE Off) +SET(KOKKOS_CFG_DAG_ARCH_DONE Off) +SET(KOKKOS_CFG_DAG_CXX_STD_DONE Off) +SET(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) +FUNCTION(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) + SET(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) + SET(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) + IF (NOT ${PRE_FLAG}) + MESSAGE(FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured") + ENDIF() + GLOBAL_SET(${POST_FLAG} On) +ENDFUNCTION() + + +LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) + +IF(NOT KOKKOS_HAS_TRILINOS) + cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + set(CMAKE_DISABLE_SOURCE_CHANGES ON) + set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) + IF (Spack_WORKAROUND) + #if we are explicitly using Spack for development, + #nuke the Spack compiler + SET(SPACK_CXX $ENV{SPACK_CXX}) + IF(SPACK_CXX) + SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) + SET(ENV{CXX} ${SPACK_CXX}) + ENDIF() + ENDIF() + # Always call the project command to define Kokkos_ variables + # and to make sure that C++ is an enabled language + PROJECT(Kokkos CXX) + IF(NOT HAS_PARENT) + IF (NOT CMAKE_BUILD_TYPE) + SET(DEFAULT_BUILD_TYPE 
"RelWithDebInfo") + MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") + SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING + "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." + FORCE) + ENDIF() + ENDIF() +ENDIF() + +IF (NOT CMAKE_SIZEOF_VOID_P) + STRING(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) + IF (NOT FIND_IDX STREQUAL -1) + MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured.") + ELSE() + MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation") + ENDIF() +ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build; i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead") +ENDIF() + + +set(Kokkos_VERSION_MAJOR 3) +set(Kokkos_VERSION_MINOR 4) +set(Kokkos_VERSION_PATCH 00) +set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") +math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") + +MESSAGE(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables") +CMAKE_POLICY(SET CMP0074 NEW) + +# Load either the real TriBITS or a TriBITS wrapper +# for certain utility functions that are universal (like GLOBAL_SET) +INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) + +IF (Kokkos_ENABLE_CUDA) + # If we are building CUDA, we have tricked CMake because we declare a CXX project + # If the default C++ standard for a given compiler matches the requested + # standard, then CMake just omits the -std flag in later versions of CMake + # This breaks CUDA compilation (CUDA compiler 
can have a different default + # -std then the underlying host compiler by itself). Setting this variable + # forces CMake to always add the -std flag even if it thinks it doesn't need it + GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) +ENDIF() + +# These are the variables we will append to as we go +# I really wish these were regular variables +# but scoping issues can make it difficult +GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) +GLOBAL_SET(KOKKOS_LINK_OPTIONS) +GLOBAL_SET(KOKKOS_CUDA_OPTIONS) +GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) +GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) +# We need to append text here for making sure TPLs +# we import are available for an installed Kokkos +GLOBAL_SET(KOKKOS_TPL_EXPORTS) +# KOKKOS_DEPENDENCE is used by kokkos_launch_compiler +GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +# MSVC never goes through kokkos_launch_compiler +IF(NOT MSVC) + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +ENDIF() + +# Include a set of Kokkos-specific wrapper functions that +# will either call raw CMake or TriBITS +# These are functions like KOKKOS_INCLUDE_DIRECTORIES +INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) + + +# Check the environment and set certain variables +# to allow platform-specific checks +INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) + +# The build environment setup goes in the following steps +# 1) Check all the enable options. This includes checking Kokkos_DEVICES +# 2) Check the compiler ID (type and version) +# 3) Check the CXX standard and select important CXX flags +# 4) Check for any third-party libraries (TPLs) like hwloc +# 5) Check if optimizing for a particular architecture and add arch-specific flags +KOKKOS_SETUP_BUILD_ENVIRONMENT() + +# Finish off the build +# 6) Recurse into subdirectories and configure individual libraries +# 7) Export and install targets + +OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) +# Workaround for building position independent code. 
+IF(BUILD_SHARED_LIBS) + SET(CMAKE_POSITION_INDEPENDENT_CODE ON) +ENDIF() + +SET(KOKKOS_EXT_LIBRARIES Kokkos::kokkos Kokkos::kokkoscore Kokkos::kokkoscontainers Kokkos::kokkosalgorithms) +SET(KOKKOS_INT_LIBRARIES kokkos kokkoscore kokkoscontainers kokkosalgorithms) +SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES ${KOKKOS_INT_LIBRARIES}) + +IF (KOKKOS_HAS_TRILINOS) + SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) + SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR}) + SET(KOKKOS_IS_SUBDIRECTORY TRUE) +ELSEIF(HAS_PARENT) + SET(KOKKOS_HEADER_DIR "include/kokkos") + SET(KOKKOS_IS_SUBDIRECTORY TRUE) +ELSE() + SET(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") + SET(KOKKOS_IS_SUBDIRECTORY FALSE) +ENDIF() + +#------------------------------------------------------------------------------ +# +# A) Forward declare the package so that certain options are also defined for +# subpackages + +## This restores the old behavior of ProjectCompilerPostConfig.cmake +# It sets the CMAKE_CXX_FLAGS globally to those used by Kokkos +# We must do this before KOKKOS_PACKAGE_DECL +IF (KOKKOS_HAS_TRILINOS) + # Overwrite the old flags at the top-level + # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags + # we have to match the annoying behavior + STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS}") + LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) + IF (KOKKOS_ENABLE_CUDA) + LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS}) + ENDIF() + FOREACH(XCOMP_FLAG ${KOKKOS_XCOMPILER_OPTIONS}) + SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") + LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) + ENDFOREACH() + SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}") + IF (KOKKOS_ENABLE_CUDA) + STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") + FOREACH(CUDAFE_FLAG 
${KOKKOS_CUDAFE_OPTIONS}) + SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") + LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) + ENDFOREACH() + SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}") + ENDIF() + # Both parent scope and this package + # In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in + # TRILINOS_TOPLEVEL_CXX_FLAGS + SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}" PARENT_SCOPE) + SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}") + #CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here + #These flags get set up in KOKKOS_PACKAGE_DECL, which means they + #must be configured before KOKKOS_PACKAGE_DECL + SET(KOKKOS_ALL_COMPILE_OPTIONS + $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_ALL_COMPILE_OPTIONS}>) +ENDIF() + +KOKKOS_PACKAGE_DECL() + + +#------------------------------------------------------------------------------ +# +# D) Process the subpackages (subdirectories) for Kokkos +# +KOKKOS_PROCESS_SUBPACKAGES() + + +#------------------------------------------------------------------------------ +# +# E) If Kokkos itself is enabled, process the Kokkos package +# + +KOKKOS_PACKAGE_DEF() +KOKKOS_EXCLUDE_AUTOTOOLS_FILES() +KOKKOS_PACKAGE_POSTPROCESS() +KOKKOS_CONFIGURE_CORE() + +IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) + ADD_LIBRARY(kokkos INTERFACE) + #Make sure in-tree projects can reference this as Kokkos:: + #to match the installed target names + ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos) + TARGET_LINK_LIBRARIES(kokkos INTERFACE kokkoscore kokkoscontainers kokkosalgorithms) + KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos) +ENDIF() +INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) + +# nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler. +# Kokkos needs nvcc_wrapper in order to build. 
Other libraries and +# executables also need nvcc_wrapper. Thus, we need to install it. +# If the argument of DESTINATION is a relative path, CMake computes it +# as relative to ${CMAKE_INSTALL_PATH}. +# KOKKOS_INSTALL_ADDITIONAL_FILES will install nvcc wrapper and other generated +# files +KOKKOS_INSTALL_ADDITIONAL_FILES() + + +# Finally - if we are a subproject - make sure the enabled devices are visible +IF (HAS_PARENT) + FOREACH(DEV Kokkos_ENABLED_DEVICES) + #I would much rather not make these cache variables or global properties, but I can't + #make any guarantees on whether PARENT_SCOPE is good enough to make + #these variables visible where I need them + SET(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) + SET_PROPERTY(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) + ENDFOREACH() +ENDIF() diff --git a/packages/kokkos/CONTRIBUTING.md b/packages/kokkos/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..b4f3057cef2cb707b55b3ae3331000ff4e55c0e1 --- /dev/null +++ b/packages/kokkos/CONTRIBUTING.md @@ -0,0 +1,14 @@ +# Contributing to Kokkos + +## Pull Requests +We actively welcome pull requests. +1. Fork the repo and create your branch from `develop`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. + +## License +By contributing to Kokkos, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. 
diff --git a/packages/kokkos/Copyright.txt b/packages/kokkos/Copyright.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e2f8d8647b53b8def2e240c92fdbad04b1550ec --- /dev/null +++ b/packages/kokkos/Copyright.txt @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER diff --git a/packages/kokkos/HOW_TO_SNAPSHOT b/packages/kokkos/HOW_TO_SNAPSHOT new file mode 100644 index 0000000000000000000000000000000000000000..ad3f78efb4f8dd8399e3fb2889def7e841b531f9 --- /dev/null +++ b/packages/kokkos/HOW_TO_SNAPSHOT @@ -0,0 +1,73 @@ + +Developers of Kokkos (those who commit modifications to Kokkos) +must maintain the snapshot of Kokkos in the Trilinos repository. + +This file contains instructions for how to +snapshot Kokkos from github.com/kokkos to Trilinos. + +------------------------------------------------------------------------ +*** EVERYTHING GOES RIGHT WORKFLOW *** + +1) Given a 'git clone' of Kokkos and of Trilinos repositories. +1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone. + This path *must* terminate with the directory name 'kokkos'; + e.g., ${HOME}/kokkos . +1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory. + +2) Given that the Kokkos build & test is clean and + changes are committed to the Kokkos clone. + +3) Snapshot the current commit in the Kokkos clone into the Trilinos clone. 
+ This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}: + ${KOKKOS}/scripts/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages + +4) Verify the snapshot commit happened as expected + cd ${TRILINOS}/packages/kokkos + git log -1 --name-only + +5) Modify, build, and test Trilinos with the Kokkos snapshot. + +6) Given that the Trilinos build & test is clean and + changes are committed to the Trilinos clone. + +7) Attempt push to the Kokkos repository. + If push fails then you must 'remove the Kokkos snapshot' + from your Trilinos clone. + See below. + +8) Attempt to push to the Trilinos repository. + If updating for a failed push requires you to change Kokkos you must + 'remove the Kokkos snapshot' from your Trilinos clone. + See below. + +------------------------------------------------------------------------ +*** WHEN SOMETHING GOES WRONG AND YOU MUST *** +*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE *** + +1) Query the Trilinos clone commit log. + git log --oneline + +2) Note the <SHA1> of the commit to the Trilinos clone + immediately BEFORE the Kokkos snapshot commit. + Copy this <SHA1> for use in the next command. + +3) IF more than one outstanding commit then you can remove just the + Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file. + Remove or comment out the Kokkos snapshot commit entry. + git rebase -i <SHA1> + +4) IF the Kokkos snapshot commit is the one and only + outstanding commit then remove just that commit. + git reset --hard HEAD~1 + +------------------------------------------------------------------------ +*** REGARDING 'snapshot.py' TOOL *** + +The 'snapshot.py' tool is developed and maintained by the +Center for Computing Research (CCR) +Software Engineering, Maintenance, and Support (SEMS) team. 
+ +Contact Brent Perschbacher <bmpersc@sandia.gov> for questions. + +------------------------------------------------------------------------ + diff --git a/packages/kokkos/LICENSE b/packages/kokkos/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..c6f17087d5a1b160a5fddeaab39ed9380328b485 --- /dev/null +++ b/packages/kokkos/LICENSE @@ -0,0 +1,43 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Kokkos is licensed under 3-clause BSD terms of use: +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos new file mode 100644 index 0000000000000000000000000000000000000000..2599121d70ada48567c61fdc63ba94925a402267 --- /dev/null +++ b/packages/kokkos/Makefile.kokkos @@ -0,0 +1,1385 @@ +# Default settings common options. + +KOKKOS_VERSION_MAJOR = 3 +KOKKOS_VERSION_MINOR = 4 +KOKKOS_VERSION_PATCH = 00 +KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) + +# Options: Cuda,HIP,OpenMP,Pthread,Serial +#KOKKOS_DEVICES ?= "OpenMP" +KOKKOS_DEVICES ?= "Pthread" +# Options: +# Intel: KNC,KNL,SNB,HSW,BDW,SKX +# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 +# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX +# IBM: BGQ,Power7,Power8,Power9 +# AMD-GPUS: Vega900,Vega906,Vega908 +# AMD-CPUS: AMDAVX,Zen,Zen2 +KOKKOS_ARCH ?= "" +# Options: yes,no +KOKKOS_DEBUG ?= "no" +# Options: hwloc,librt,experimental_memkind +KOKKOS_USE_TPLS ?= "" +# Options: c++14,c++1y,c++17,c++1z,c++2a +KOKKOS_CXX_STANDARD ?= "c++14" +# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align +KOKKOS_OPTIONS ?= "" +KOKKOS_CMAKE ?= "no" 
+KOKKOS_TRIBITS ?= "no" +KOKKOS_STANDALONE_CMAKE ?= "no" + +# Default settings specific options. +# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr +KOKKOS_CUDA_OPTIONS ?= "" + +# Options: rdc +KOKKOS_HIP_OPTIONS ?= "" + +# Default settings specific options. +# Options: enable_async_dispatch +KOKKOS_HPX_OPTIONS ?= "" + +# Helper functions for conversion to upper case +uppercase_TABLE:=a,A b,B c,C d,D e,E f,F g,G h,H i,I j,J k,K l,L m,M n,N o,O p,P q,Q r,R s,S t,T u,U v,V w,W x,X y,Y z,Z +uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(wordlist 2,$(words $1),$1),$2)),$2) +uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT) +# Return a 1 if a string contains a substring and 0 if not +# Note the search string should be without '"' +# Example: $(call kokkos_has_string,"hwloc,librt",hwloc) +# Will return a 1 +kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0) +# Returns 1 if the path exists, 0 otherwise +# Example: $(call kokkos_path_exists,/path/to/file) +# Will return a 1 if /path/to/file exists +kokkos_path_exists=$(if $(wildcard $1),1,0) + +# Check for general settings + +KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes) +KOKKOS_INTERNAL_ENABLE_CXX14 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++14) +KOKKOS_INTERNAL_ENABLE_CXX1Y := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1y) +KOKKOS_INTERNAL_ENABLE_CXX17 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++17) +KOKKOS_INTERNAL_ENABLE_CXX1Z := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1z) +KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a) +KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++20) + +# Check for external libraries. 
+KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) +KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) +KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind) + +# Check for advanced settings. +KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) +KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization) +KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning) +KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align) +KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check) +KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print) +KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_large_mem_tests) +KOKKOS_INTERNAL_CUDA_USE_LDG := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),use_ldg) +KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),force_uvm) +KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc) +KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) +KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) +KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) + +KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc) + +# Check for Kokkos Host Execution Spaces one of which must be on. 
+KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP) +KOKKOS_INTERNAL_USE_PTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Pthread) +KOKKOS_INTERNAL_USE_HPX := $(call kokkos_has_string,$(KOKKOS_DEVICES),HPX) +KOKKOS_INTERNAL_USE_SERIAL := $(call kokkos_has_string,$(KOKKOS_DEVICES),Serial) + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0) + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0) + ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0) + KOKKOS_INTERNAL_USE_SERIAL := 1 + endif + endif +endif + +# Check for other Execution Spaces. +KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda) +KOKKOS_INTERNAL_USE_HIP := $(call kokkos_has_string,$(KOKKOS_DEVICES),HIP) +KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget) + +KOKKOS_DEVICELIST = +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + KOKKOS_DEVICELIST += Serial +endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + KOKKOS_DEVICELIST += OpenMP +endif +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + KOKKOS_DEVICELIST += Threads +endif +ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + KOKKOS_DEVICELIST += HPX +endif +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + KOKKOS_DEVICELIST += Cuda +endif +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + KOKKOS_DEVICELIST += HIP +endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + KOKKOS_DEVICELIST += OPENMPTARGET + KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \ + + $(KOKKOS_INTERNAL_ENABLE_CXX20) \ + + $(KOKKOS_INTERNAL_ENABLE_CXX2A)) + ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1) + $(error OpenMPTarget backend requires C++17 or newer) + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) + ifeq ($(origin CUDA_PATH), undefined) + CUDA_PATH = $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) + endif + ifeq ($(CUDA_PATH),) + CUDA_PATH = $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) + endif + KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc 
--version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .) +endif + +# Check OS. +KOKKOS_OS := $(strip $(shell uname -s)) +KOKKOS_INTERNAL_OS_CYGWIN := $(call kokkos_has_string,$(KOKKOS_OS),CYGWIN) +KOKKOS_INTERNAL_OS_LINUX := $(call kokkos_has_string,$(KOKKOS_OS),Linux) +KOKKOS_INTERNAL_OS_DARWIN := $(call kokkos_has_string,$(KOKKOS_OS),Darwin) + +# Check compiler. +KOKKOS_CXX_VERSION := $(strip $(shell $(CXX) --version 2>&1)) +KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Intel Corporation) +KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),PGI) +KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep -c XL)) +KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-")) +KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) +KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) +KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) +KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) +KOKKOS_INTERNAL_COMPILER_GCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC) + +# Check Host Compiler if using NVCC through nvcc_wrapper +ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep -c nvcc_wrapper)) + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER), 1) + + KOKKOS_CXX_HOST_VERSION := $(strip $(shell $(CXX) $(CXXFLAGS) --host-version 2>&1)) + KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),PGI) + KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),Intel Corporation) + KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),clang) + endif +endif + 
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2) + KOKKOS_INTERNAL_COMPILER_CLANG = 1 +endif +ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 2) + KOKKOS_INTERNAL_COMPILER_XL = 1 +endif + +# Apple Clang passes both clang and apple clang tests, so turn off clang. +ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_CLANG = 0 +endif +# AMD HCC passes both clang and hcc test so turn off clang +ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1) + KOKKOS_INTERNAL_COMPILER_CLANG = 0 +endif + +ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell $(CXX) --version | grep version | cut -d ' ' -f3 | tr -d '.') + + ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0) + $(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher) + endif + + KOKKOS_INTERNAL_CUDA_USE_LAMBDA := 1 + endif +endif + +# Set compiler warnings flags. +ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + # TODO check if PGI accepts GNU style warnings + KOKKOS_INTERNAL_COMPILER_WARNINGS = + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + # TODO check if cray accepts GNU style warnings + KOKKOS_INTERNAL_COMPILER_WARNINGS = + else + #gcc + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered 
-Wuninitialized + endif + endif + endif + endif + endif +else + KOKKOS_INTERNAL_COMPILER_WARNINGS = +endif + +# Set OpenMP flags. +ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -mp +else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + # OpenMP is turned on by default in Cray compiler environment. + KOKKOS_INTERNAL_OPENMP_FLAG := + else + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp + endif + endif + endif + endif +endif +ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_IBM_XL_OMP45_WORKAROUND -qsmp=omp -qoffload -qnoeh +else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + #KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_BUG_WORKAROUND_IBM_CLANG_OMP45_VIEW_INIT -fopenmp-implicit-declare-target -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp -fopenmp=libomp + KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_WORKAROUND_OPENMPTARGET_CLANG -fopenmp -fopenmp=libomp + KOKKOS_INTERNAL_OPENMPTARGET_LIB := -lomptarget + else + #Assume GCC + KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none + endif +endif + +# Set C++ version flags. 
+ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_INTERNAL_CXX14_FLAG := --c++14 + KOKKOS_INTERNAL_CXX17_FLAG := --c++17 +else + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14 + KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y + #KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17 + #KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1Z + #KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + KOKKOS_INTERNAL_CXX14_FLAG := -hstd=c++14 + #KOKKOS_INTERNAL_CXX1Y_FLAG := -hstd=c++1y + #KOKKOS_INTERNAL_CXX17_FLAG := -hstd=c++17 + #KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z + #KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a + else + KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14 + KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y + KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17 + KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1z + KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a + endif + endif +endif + +# Check for Kokkos Architecture settings. + +# Intel based. +KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC) +KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM) +KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB) +KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW) +KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) +KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX) +KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL) + +# NVIDIA based. 
+NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper +KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30) +KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler32) +KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler35) +KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler37) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell50) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell52) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell53) +KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal61) +KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal60) +KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta70) +KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72) +KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) +KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) +KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) +KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \ + + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \ + + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ + + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) + +#SEK: This seems like a bug to me +ifeq 
($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) + KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell) + KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler) + KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50)) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) + CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) + ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH) + endif + endif +endif +# ARM based. +KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv80) +KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv81) +KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX) +KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-TX2) +KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) +KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) + +# IBM based. +KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) +KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7) +KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) +KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9) +KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) + +# AMD based. 
+KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) +KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2) +KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) +KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900) +KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906) +KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908) + +# Any AVX? +KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) +KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) +KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) +KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX)) + +# Decide what ISA level we are able to support. +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) +KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC)) +KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9)) +KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7)) + +# Decide whether we can support transactional memory +KOKKOS_INTERNAL_USE_TM := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX)) + +# Incompatible flags? 
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc )) +KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc)) + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) + $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) ) +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1) + $(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) ) +endif + +# Generating the list of Flags. + +KOKKOS_CPPFLAGS = +KOKKOS_LIBDIRS = +ifneq ($(KOKKOS_CMAKE), yes) + KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src +endif +KOKKOS_TPL_INCLUDE_DIRS = +KOKKOS_TPL_LIBRARY_DIRS = +KOKKOS_TPL_LIBRARY_NAMES = + +ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS) +endif + +KOKKOS_LIBS = -ldl +KOKKOS_TPL_LIBRARY_NAMES += dl +ifneq ($(KOKKOS_CMAKE), yes) + KOKKOS_LIBDIRS = -L$(shell pwd) + # CXXLDFLAGS is used together with CXXFLAGS in a combined compile/link command + KOKKOS_CXXLDFLAGS = -L$(shell pwd) +endif +KOKKOS_LINK_FLAGS = +KOKKOS_SRC = +KOKKOS_HEADERS = + +# Generating the KokkosCore_config.h file. + +KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp +KOKKOS_CONFIG_HEADER=KokkosCore_config.h +# Functions for generating config header file +kokkos_append_header = $(shell echo $1 >> $(KOKKOS_INTERNAL_CONFIG_TMP)) + +# assign hash sign to variable for compat. 
with make 4.3 +H := \# + +# Do not append first line +tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp) +tmp := $(call kokkos_append_header,"Makefile constructed configuration:") +tmp := $(call kokkos_append_header,"----------------------------------------------*/") + +tmp := $(call kokkos_append_header,'$H''if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)') +tmp := $(call kokkos_append_header,'$H''error "Do not include $(KOKKOS_CONFIG_HEADER) directly; include Kokkos_Macros.hpp instead."') +tmp := $(call kokkos_append_header,'$H''else') +tmp := $(call kokkos_append_header,'$H''define KOKKOS_CORE_CONFIG_H') +tmp := $(call kokkos_append_header,'$H''endif') + +tmp := $(call kokkos_append_header,"") +tmp := $(call kokkos_append_header,"$H""define KOKKOS_VERSION $(KOKKOS_VERSION)") +tmp := $(call kokkos_append_header,"") + +tmp := $(call kokkos_append_header,"/* Execution Spaces */") + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP') +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET') + ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC") + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMP') +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_THREADS") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := 
$(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SERIAL") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_TM), 1) + tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TM") + tmp := $(call kokkos_append_header,"$H""endif") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1) + tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_X86_64") + tmp := $(call kokkos_append_header,"$H""endif") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1) + tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_KNC") + tmp := $(call kokkos_append_header,"$H""endif") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1) + tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCLE") + tmp := $(call kokkos_append_header,"$H""endif") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCBE), 1) + tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCBE") + tmp := $(call kokkos_append_header,"$H""endif") +endif + +#only add the c++ standard flags if this is not CMake +tmp := $(call kokkos_append_header,"/* General Settings */") +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1) +ifneq ($(KOKKOS_STANDALONE_CMAKE), yes) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG) +endif + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX14") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Y), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Y_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX14") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX17), 1) +ifneq ($(KOKKOS_STANDALONE_CMAKE), yes) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX17_FLAG) +endif + tmp := 
$(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX17") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX17") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2A), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2A_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX20") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX20), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX20_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX20") +endif + +ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + KOKKOS_CXXFLAGS += -lineinfo + endif + + KOKKOS_CXXFLAGS += -g + KOKKOS_LDFLAGS += -g + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG") + ifeq ($(KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK), 0) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK") + endif +endif +ifeq ($(KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN), 0) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_COMPLEX_ALIGN") +endif + +ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_PROFILING_LOAD_PRINT") +endif + +ifeq ($(KOKKOS_INTERNAL_ENABLE_TUNING), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TUNING") +endif + +tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LIBDL") + +ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) + ifneq ($(KOKKOS_CMAKE), yes) + ifneq ($(HWLOC_PATH),) + KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include + KOKKOS_LIBDIRS += -L$(HWLOC_PATH)/lib + KOKKOS_CXXLDFLAGS += -L$(HWLOC_PATH)/lib + 
KOKKOS_TPL_INCLUDE_DIRS += $(HWLOC_PATH)/include + KOKKOS_TPL_LIBRARY_DIRS += $(HWLOC_PATH)/lib + endif + KOKKOS_LIBS += -lhwloc + KOKKOS_TPL_LIBRARY_NAMES += hwloc + endif + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC") +endif + +ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT") + KOKKOS_LIBS += -lrt + KOKKOS_TPL_LIBRARY_NAMES += rt +endif + +ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) + ifneq ($(KOKKOS_CMAKE), yes) + ifneq ($(MEMKIND_PATH),) + KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include + KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib + KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib + KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include + KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib + endif + KOKKOS_LIBS += -lmemkind -lnuma + KOKKOS_TPL_LIBRARY_NAMES += memkind numa + endif + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE") +endif + +ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS") +endif + +tmp := $(call kokkos_append_header,"/* Optimization Settings */") + +ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION") +endif + +tmp := $(call kokkos_append_header,"/* Cuda Settings */") + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC") + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC") + endif + endif + + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_UVM") + endif + + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE") + ifeq 
($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_CXXFLAGS += -fcuda-rdc + KOKKOS_LDFLAGS += -fcuda-rdc + else + KOKKOS_CXXFLAGS += --relocatable-device-code=true + KOKKOS_LDFLAGS += --relocatable-device-code=true + endif + endif + + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -ge 90; echo $$?),0) + # This diagnostic is just plain wrong in CUDA 9 + # See https://github.com/kokkos/kokkos/issues/1470 + KOKKOS_CXXFLAGS += -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored + endif + endif + + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") + KOKKOS_CXXFLAGS += -expt-extended-lambda + else + $(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.) + endif + endif + + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA") + endif + endif + + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -ge 80; echo $$?),0) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_CONSTEXPR") + KOKKOS_CXXFLAGS += -expt-relaxed-constexpr + else + $(warning Warning: Cuda relaxed constexpr support was requested but NVCC version is too low. This requires NVCC for Cuda version 8.0 or higher. Disabling relaxed constexpr support now.) 
+ endif + endif + + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_CONSTEXPR") + endif + endif + + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND") + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + ifeq ($(KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH") + endif +endif + +# Add Architecture flags. + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + KOKKOS_CXXFLAGS += -march=armv8-a + KOKKOS_LDFLAGS += -march=armv8-a + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + KOKKOS_CXXFLAGS += -march=armv8.1-a + KOKKOS_LDFLAGS += -march=armv8.1-a + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_A64FX") + + KOKKOS_CXXFLAGS += -march=armv8.2-a+sve + KOKKOS_LDFLAGS += -march=armv8.2-a+sve + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_CXXFLAGS += -msve-vector-bits=512 + KOKKOS_LDFLAGS += -msve-vector-bits=512 + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) + KOKKOS_CXXFLAGS += -msve-vector-bits=512 + KOKKOS_LDFLAGS += -msve-vector-bits=512 + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN") + tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ARCH_AMD_AVX2") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -mavx2 + KOKKOS_LDFLAGS += -mavx2 + else + KOKKOS_CXXFLAGS += -march=znver1 -mtune=znver1 + KOKKOS_LDFLAGS += -march=znver1 -mtune=znver1 + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN2") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -mavx2 + KOKKOS_LDFLAGS += -mavx2 + else + KOKKOS_CXXFLAGS += -march=znver2 -mtune=znver2 + KOKKOS_LDFLAGS += -march=znver2 -mtune=znver2 + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx + KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX2") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += + KOKKOS_LDFLAGS += + else + KOKKOS_CXXFLAGS += -mtune=thunderx2t99 -mcpu=thunderx2t99 + KOKKOS_LDFLAGS += -mtune=thunderx2t99 -mcpu=thunderx2t99 + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -xSSE4.2 + KOKKOS_LDFLAGS += -xSSE4.2 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + + else + ifeq 
($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += -tp=nehalem + KOKKOS_LDFLAGS += -tp=nehalem + else + # Assume that this is a really a GNU compiler. + KOKKOS_CXXFLAGS += -msse4.2 + KOKKOS_LDFLAGS += -msse4.2 + endif + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -mavx + KOKKOS_LDFLAGS += -mavx + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += -tp=sandybridge + KOKKOS_LDFLAGS += -tp=sandybridge + else + # Assume that this is a really a GNU compiler. + KOKKOS_CXXFLAGS += -mavx + KOKKOS_LDFLAGS += -mavx + endif + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER7") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + + else + # Assume that this is a really a GNU compiler or it could be XL on P8. + KOKKOS_CXXFLAGS += -mcpu=power7 -mtune=power7 + KOKKOS_LDFLAGS += -mcpu=power7 -mtune=power7 + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER8") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P8. 
+ KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + endif + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER9") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P9 + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + endif + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HSW), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -xCORE-AVX2 + KOKKOS_LDFLAGS += -xCORE-AVX2 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += -tp=haswell + KOKKOS_LDFLAGS += -tp=haswell + else + # Assume that this is a really a GNU compiler. + KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2 + KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2 + endif + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_BDW), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -xCORE-AVX2 + KOKKOS_LDFLAGS += -xCORE-AVX2 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + KOKKOS_CXXFLAGS += -tp=haswell + KOKKOS_LDFLAGS += -tp=haswell + else + # Assume that this is a really a GNU compiler. 
+ KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2 -mrtm + KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2 -mrtm + endif + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512MIC") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -xMIC-AVX512 + KOKKOS_LDFLAGS += -xMIC-AVX512 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + + else + # Asssume that this is really a GNU compiler. + KOKKOS_CXXFLAGS += -march=knl -mtune=knl + KOKKOS_LDFLAGS += -march=knl -mtune=knl + endif + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -xCORE-AVX512 + KOKKOS_LDFLAGS += -xCORE-AVX512 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + + else + # Nothing here yet. + KOKKOS_CXXFLAGS += -march=skylake-avx512 -mtune=skylake-avx512 -mrtm + KOKKOS_LDFLAGS += -march=skylake-avx512 -mtune=skylake-avx512 -mrtm + endif + endif + endif +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KNC") + KOKKOS_CXXFLAGS += -mmic + KOKKOS_LDFLAGS += -mmic +endif + +# Figure out the architecture flag for Cuda. 
# Decide whether a Cuda-style device-architecture flag is needed:
# either CUDA itself is enabled, or HIP is compiled through NVCC.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
endif
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
  endif
endif

# Pick the flag spelling the device compiler understands.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG = -arch
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG = --cuda-gpu-arch
    KOKKOS_CXXFLAGS += -x cuda
  else
    $(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang (got version string $(KOKKOS_CXX_VERSION)) )
  endif
  # NOTE(cleanup): a redundant "KOKKOS_INTERNAL_USE_CUDA_ARCH = 1" used to
  # follow here; this branch only runs when the variable is already 1.
endif

ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG = -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march
  endif
  KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
endif

# Map the requested NVIDIA architecture onto the matching KOKKOS_ARCH_*
# defines and the sm_XX suffix for the arch flag chosen above.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
  endif

  # Apply the finished flag; NVCC also needs it at link time, and so does
  # Clang when it drives an OpenMPTarget offload build.
  ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)

    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
      KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
    endif
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
      ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
        KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
      endif
    endif
  endif
  # HIP through NVCC requires extended lambda support.
  ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
    KOKKOS_CXXFLAGS += --expt-extended-lambda
  endif
endif


# Figure out the architecture flag for ROCm.
# --- HIP backend: architecture defines, sources, and compile/link flags -----
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  # Let's start with adding architecture defines.
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA900), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 900")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA900")
    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx900
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 906")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906")
    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx906
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 908")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908")
    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908
  endif

  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp)

  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_HIP_ARCH_FLAG)
  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_HIP_ARCH_FLAG)

  ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE")
    KOKKOS_CXXFLAGS += -fgpu-rdc
    KOKKOS_LDFLAGS += -fgpu-rdc
  else
    KOKKOS_CXXFLAGS += -fno-gpu-rdc
    KOKKOS_LDFLAGS += -fno-gpu-rdc
  endif
endif


# Replace KokkosCore_config.h only when its content actually changed, so that
# dependent objects are not rebuilt needlessly.
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)

ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
  KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep -c define))
else
  KOKKOS_INTERNAL_NEW_CONFIG := 1
endif

ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
  tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
endif

# Functions for generating config header file.
kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
# BUGFIX: a stray trailing ')' used to follow the closing paren here, leaking a
# literal ')' into every expansion of this function.
kokkos_append_config_header = $(shell echo $1 >> $2)

tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")

# Per-backend forward/declare/setup includes for the generated headers.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
  # NOTE(cleanup): an empty "ifeq (KOKKOS_INTERNAL_CUDA_USE_UVM)/else/endif"
  # used to sit here; UVM requires no extra config-header entries.
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
endif
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
  tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
  tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
endif
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
endif
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
  tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
endif
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
endif

# Core file lists for the library build.
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)

KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)

ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
  ifneq ($(CUDA_PATH),)
    # BUGFIX: was misspelled "KOKKOS_CPPLAGS", silently dropping the CUDA
    # include path from the preprocessor flags.
    KOKKOS_CPPFLAGS += -I$(CUDA_PATH)/include
    ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib64), 1)
      KOKKOS_LIBDIRS += -L$(CUDA_PATH)/lib64
      KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
      KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
    else ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1)
      KOKKOS_LIBDIRS += -L$(CUDA_PATH)/lib
      KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib
      KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib
    else
      $(error Can't find CUDA library directory: no lib64 or lib directory in $(CUDA_PATH))
    endif
    KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
      KOKKOS_CXXFLAGS += --cuda-path=$(CUDA_PATH)
    endif
  endif
  KOKKOS_LIBS += -lcudart -lcuda
  KOKKOS_TPL_LIBRARY_NAMES += cudart cuda
endif

ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMPTarget/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMPTarget/*.hpp)
  # When NVCC is the host driver the OpenMP flag must be forwarded to the
  # underlying host compiler.
  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
  else
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
  endif
  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
  KOKKOS_LIBS += $(KOKKOS_INTERNAL_OPENMPTARGET_LIB)
endif

ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)

  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
  else
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
  endif

  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
  KOKKOS_LINK_FLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
endif

ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
  KOKKOS_LIBS += -lpthread
  KOKKOS_TPL_LIBRARY_NAMES += pthread
endif

ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.hpp)
  # Use the HPX install's own pkg-config metadata when HPX_PATH is given,
  # otherwise fall back to whatever pkg-config finds on its default path.
  ifneq ($(HPX_PATH),)
    ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
      KOKKOS_CXXFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --cflags hpx_application_debug)
      KOKKOS_CXXLDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application_debug)
      KOKKOS_LIBS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application_debug)
    else
      KOKKOS_CXXFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --cflags hpx_application)
      KOKKOS_CXXLDFLAGS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application)
      KOKKOS_LIBS += $(shell PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config --libs hpx_application)
    endif
  else
    ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
      KOKKOS_CXXFLAGS += $(shell pkg-config --cflags hpx_application_debug)
      KOKKOS_CXXLDFLAGS += $(shell pkg-config --libs hpx_application_debug)
      KOKKOS_LIBS += $(shell pkg-config --libs hpx_application_debug)
    else
      KOKKOS_CXXFLAGS += $(shell pkg-config --cflags hpx_application)
      KOKKOS_CXXLDFLAGS += $(shell pkg-config --libs hpx_application)
      KOKKOS_LIBS += $(shell pkg-config --libs hpx_application)
    endif
  endif
  KOKKOS_TPL_LIBRARY_NAMES += hpx
endif

# Explicitly set the GCC Toolchain for Clang.
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
  # Derive the toolchain root from the g++ found on PATH,
  # e.g. /usr/local/bin/g++ -> /usr/local.
  KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
  KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
  KOKKOS_CXXFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
  KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
endif

# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning.
ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC))
endif

# Don't include Kokkos_Serial.cpp or Kokkos_Serial_Task.cpp if not using Serial
# device to avoid a link warning.
# NOTE(cleanup): an empty "ifeq (USE_SERIAL)/endif" conditional that used to
# precede this block was removed; it had no effect.
ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp,$(KOKKOS_SRC))
  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp,$(KOKKOS_SRC))
endif

# With Cygwin functions such as fdopen and fileno are not defined
# when strict ansi is enabled. strict ansi gets enabled with -std=c++14
# though. So we hard undefine it here. Not sure if that has any bad side effects
# This is needed for gtest actually, not for Kokkos itself!
ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
  KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
endif

# Set KokkosExtraLibs and add -lkokkos to link line.
KOKKOS_EXTRA_LIBS := ${KOKKOS_LIBS}
KOKKOS_LIBS := -lkokkos ${KOKKOS_LIBS}

# Setting up dependencies.
+ +KokkosCore_config.h: + +KOKKOS_CPP_DEPENDS := KokkosCore_config.h $(KOKKOS_HEADERS) + +KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o) +KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) + +include $(KOKKOS_PATH)/Makefile.targets + +kokkos-clean: + rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ + KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp + +libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) + ar cr libkokkos.a $(KOKKOS_OBJ_LINK) + ranlib libkokkos.a + +print-cxx-flags: + echo "$(KOKKOS_CXXFLAGS)" + +KOKKOS_LINK_DEPENDS=libkokkos.a + +#we have carefully separated LDFLAGS from LIBS and LIBDIRS +#we have also separated CPPFLAGS from CXXFLAGS +#if this is not cmake, for backwards compatibility +#we just jam everything together into the CXXFLAGS and LDFLAGS +ifneq ($(KOKKOS_CMAKE), yes) + KOKKOS_CXXFLAGS += $(KOKKOS_CPPFLAGS) + KOKKOS_LDFLAGS += $(KOKKOS_LIBDIRS) +endif diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets new file mode 100644 index 0000000000000000000000000000000000000000..cf9fc242420e1dbbb519b3312cf1a4c3b4354738 --- /dev/null +++ b/packages/kokkos/Makefile.targets @@ -0,0 +1,95 @@ +Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp +Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp +Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp +Kokkos_Error.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp +Kokkos_Stacktrace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Stacktrace.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Stacktrace.cpp +Kokkos_ExecPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ExecPolicy.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_ExecPolicy.cpp +Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp +Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp +Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp +Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp +Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp +Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp +Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp + $(CXX) 
$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp +Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp +Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp +Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp +Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp +Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp +Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp +Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) +Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp +Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) 
$(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp +Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) +Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp +Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp +Kokkos_HIP_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) +Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp +Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) +Kokkos_OpenMP_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp +Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) 
$(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) +Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp +Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) +Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +Kokkos_OpenMPTarget_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp +endif + +Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp diff --git a/packages/kokkos/README.md b/packages/kokkos/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..d55ef2caac93ae6803aa97925ea7b081d3f05ca3 --- /dev/null +++ b/packages/kokkos/README.md @@ -0,0 +1,293 @@ + + +# Kokkos: Core Libraries + +Kokkos Core implements a programming model in C++ for writing performance portable +applications targeting all major HPC platforms. For that purpose it provides +abstractions for both parallel execution of code and data management. +Kokkos is designed to target complex node architectures with N-level memory +hierarchies and multiple types of execution resources. It currently can use +CUDA, HPX, OpenMP and Pthreads as backend programming models with several other +backends in development. + +Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem, +which also provides math kernels (https://github.com/kokkos/kokkos-kernels), as well as +profiling and debugging tools (https://github.com/kokkos/kokkos-tools). + +# Learning about Kokkos + +A programming guide can be found on the Wiki, the API reference is under development. + +For questions find us on Slack: https://kokkosteam.slack.com or open a github issue. + +For non-public questions send an email to +crtrott(at)sandia.gov + +A separate repository with extensive tutorial material can be found under +https://github.com/kokkos/kokkos-tutorials. + +Furthermore, the 'example/tutorial' directory provides step by step tutorial +examples which explain many of the features of Kokkos. They work with +simple Makefiles. To build with g++ and OpenMP simply type 'make' +in the 'example/tutorial' directory. This will build all examples in the +subfolders. To change the build options refer to the Programming Guide +in the compilation section. 
+ +To learn more about Kokkos consider watching one of our presentations: +* GTC 2015: + - http://on-demand.gputechconf.com/gtc/2015/video/S5166.html + - http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf + + +# Contributing to Kokkos + +We are open and try to encourage contributions from external developers. +To do so please first open an issue describing the contribution and then issue +a pull request against the develop branch. For larger features it may be good +to get guidance from the core development team first through the github issue. + +Note that Kokkos Core is licensed under standard 3-clause BSD terms of use. +Which means contributing to Kokkos allows anyone else to use your contributions +not just for public purposes but also for closed source commercial projects. +For specifics see the LICENSE file contained in the repository or distribution. + +# Requirements + +### Primary tested compilers on X86 are: +* GCC 5.3.0 +* GCC 5.4.0 +* GCC 5.5.0 +* GCC 6.1.0 +* GCC 7.2.0 +* GCC 7.3.0 +* GCC 8.1.0 +* Intel 17.0.1 +* Intel 17.4.196 +* Intel 18.2.128 +* Clang 4.0.0 +* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0) +* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1) +* Clang 8.0.0 for CUDA (CUDA Toolkit 9.2) +* PGI 18.7 +* NVCC 9.1 for CUDA (with gcc 6.1.0) +* NVCC 9.2 for CUDA (with gcc 7.2.0) +* NVCC 10.0 for CUDA (with gcc 7.4.0) +* NVCC 10.1 for CUDA (with gcc 7.4.0) +* NVCC 11.0 for CUDA (with gcc 8.4.0) + +### Primary tested compilers on Power 8 are: +* GCC 6.4.0 (OpenMP,Serial) +* GCC 7.2.0 (OpenMP,Serial) +* IBM XL 16.1.0 (OpenMP, Serial) +* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0) + +### Primary tested compilers on Intel KNL are: +* Intel 17.2.174 (with gcc 6.2.0 and 6.4.0) +* Intel 18.2.199 (with gcc 6.2.0 and 6.4.0) + +### Primary tested compilers on ARM (Cavium ThunderX2) +* GCC 7.2.0 +* ARM/Clang 18.4.0 + +### Other compilers working: +* X86: + * Cygwin 2.1.0 64bit with gcc 4.9.3 + * GCC 8.1.0 (not warning free) + +### 
Known non-working combinations: +* Power8: + * Pthreads backend +* ARM + * Pthreads backend + +### Build system: +* CMake >= 3.10: required +* CMake >= 3.13: recommended +* CMake >= 3.18: Fortran linkage. This does not affect most mixed Fortran/Kokkos builds. See [build issues](BUILD.md#KnownIssues). + +Primary tested compiler are passing in release mode +with warnings as errors. They also are tested with a comprehensive set of +backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...). +We are using the following set of flags: +* GCC: + ```` + -Wall -Wunused-parameter -Wshadow -pedantic + -Werror -Wsign-compare -Wtype-limits + -Wignored-qualifiers -Wempty-body + -Wclobbered -Wuninitialized + ```` +* Intel: + ```` + -Wall -Wunused-parameter -Wshadow -pedantic + -Werror -Wsign-compare -Wtype-limits + -Wuninitialized + ```` +* Clang: + ```` + -Wall -Wunused-parameter -Wshadow -pedantic + -Werror -Wsign-compare -Wtype-limits + -Wuninitialized + ```` + +* NVCC: + ```` + -Wall -Wunused-parameter -Wshadow -pedantic + -Werror -Wsign-compare -Wtype-limits + -Wuninitialized + ```` + +Other compilers are tested occasionally, in particular when pushing from develop to +master branch. These are tested less rigorously without `-Werror` and only for a select set of backends. + +# Building and Installing Kokkos +Kokkos provide a CMake build system and a raw Makefile build system. +The CMake build system is strongly encouraged and will be the most rigorously supported in future releases. +Full details are given in the [build instructions](BUILD.md). Basic setups are shown here: + +## CMake + +The best way to install Kokkos is using the CMake build system. Assuming Kokkos lives in `$srcdir`: +````bash +cmake $srcdir \ + -DCMAKE_CXX_COMPILER=$path_to_compiler \ + -DCMAKE_INSTALL_PREFIX=$path_to_install \ + -DKokkos_ENABLE_OPENMP=On \ + -DKokkos_ARCH_HSW=On \ + -DKokkos_ENABLE_HWLOC=On \ + -DKokkos_HWLOC_DIR=$path_to_hwloc +```` +then simply type `make install`. 
The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages. + +To validate the Kokkos build, configure with +```` + -DKokkos_ENABLE_TESTS=On +```` +and run `make test` after completing the build. + +For your CMake project using Kokkos, code such as the following: + +````cmake +find_package(Kokkos) +... +target_link_libraries(myTarget Kokkos::kokkos) +```` +should be added to your CMakeLists.txt. Your configure should additionally include +```` +-DKokkos_DIR=$path_to_install/cmake/lib/Kokkos +```` +or +```` +-DKokkos_ROOT=$path_to_install +```` +for the install location given above. + +## Spack +An alternative to manually building with CMake is to use the Spack package manager. +To get started, download the Spack [repo](https://github.com/spack/spack). +A basic installation would be done as: +````bash +> spack install kokkos +```` +Spack allows options and compilers to be tuned in the install command. +````bash +> spack install kokkos@3.0 %gcc@7.3.0 +openmp +```` +This example illustrates the three most common parameters to Spack: +* Variants: specified with, e.g. `+openmp`, this activates (or deactivates with, e.g. `~openmp`) certain options. +* Version: immediately following `kokkos` the `@version` can specify a particular Kokkos to build +* Compiler: a default compiler will be chosen if not specified, but an exact compiler version can be given with the `%` option. + +For a complete list of Kokkos options, run: +````bash +> spack info kokkos +```` +Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable". +Generally, Spack usage should never really require you to reference the computer-generated unique install folder. +More details are given in the [build instructions](BUILD.md). If you must know, you can locate Spack Kokkos installations with: +````bash +> spack find -p kokkos ... 
+```` +where `...` is the unique spec identifying the particular Kokkos configuration and version. +Some more details can found in the Kokkos spack [documentation](Spack.md) or the Spack [website](https://spack.readthedocs.io/en/latest). + +## Raw Makefile +A bash script is provided to generate raw makefiles. +To install Kokkos as a library create a build directory and run the following +````bash +> $KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install +```` +Once the Makefile is generated, run: +````bash +> make kokkoslib +> make install +```` +To additionally run the unit tests: +````bash +> make build-test +> make test +```` +Run `generate_makefile.bash --help` for more detailed options such as +changing the device type for which to build. + +## Inline Builds vs. Installed Package +For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package. +The main reason is that you may otherwise need many different +configurations of Kokkos installed depending on the required compile time +features an application needs. For example there is only one default +execution space, which means you need different installations to have OpenMP +or Pthreads as the default space. Also for the CUDA backend there are certain +choices, such as allowing relocatable device code, which must be made at +installation time. Building Kokkos inline uses largely the same process +as compiling an application against an installed Kokkos library. + +For CMake, this means copying over the Kokkos source code into your project and adding `add_subdirectory(kokkos)` to your CMakeLists.txt. + +For raw Makefiles, see the example benchmarks/bytes_and_flops/Makefile which can be used with an installed library and or an inline build. + +# Kokkos and CUDA UVM + +Kokkos does support UVM as a specific memory space called CudaUVMSpace. +Allocations made with that space are accessible from host and device. 
+You can tell Kokkos to use that as the default space for Cuda allocations. +In either case UVM comes with a number of restrictions: +* You can't access allocations on the host while a kernel is potentially +running. This will lead to segfaults. To avoid that you either need to +call Kokkos::Cuda::fence() (or just Kokkos::fence()), after kernels, or +you can set the environment variable CUDA_LAUNCH_BLOCKING=1. +* In multi socket multi GPU machines without NVLINK, UVM defaults +to using zero copy allocations for technical reasons related to using multiple +GPUs from the same process. If an executable doesn't do that (e.g. each +MPI rank of an application uses a single GPU [can be the same GPU for +multiple MPI ranks]) you can set CUDA_MANAGED_FORCE_DEVICE_ALLOC=1. +This will enforce proper UVM allocations, but can lead to errors if +more than a single GPU is used by a single process. + + +# Citing Kokkos + +If you publish work which mentions Kokkos, please cite the following paper: + +````BibTeX +@article{CarterEdwards20143202, + title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ", + journal = "Journal of Parallel and Distributed Computing ", + volume = "74", + number = "12", + pages = "3202 - 3216", + year = "2014", + note = "Domain-Specific Languages and High-Level Frameworks for High-Performance Computing ", + issn = "0743-7315", + doi = "https://doi.org/10.1016/j.jpdc.2014.07.003", + url = "http://www.sciencedirect.com/science/article/pii/S0743731514001257", + author = "H. Carter Edwards and Christian R. Trott and Daniel Sunderland" +} +```` + +##### [LICENSE](https://github.com/kokkos/kokkos/blob/master/LICENSE) + +[](https://opensource.org/licenses/BSD-3-Clause) + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. Government retains certain rights in this software. 
+ diff --git a/packages/kokkos/Spack.md b/packages/kokkos/Spack.md new file mode 100644 index 0000000000000000000000000000000000000000..31a07deb56a0c9dc09e4453e196a8c8302634b19 --- /dev/null +++ b/packages/kokkos/Spack.md @@ -0,0 +1,267 @@ + + +# Kokkos Spack + +This gives instructions for using Spack to install Kokkos and developing packages that depend on Kokkos. + +## Getting Started + +Make sure you have downloaded [Spack](https://github.com/spack/spack). +The easiest way to configure the Spack environment is: +````bash +> source spack/share/spack/setup-env.sh +```` +with other scripts available for other shells. +You can display information about how to install packages with: +````bash +> spack info kokkos +```` +This will print all the information about how to install Kokkos with Spack. +For detailed instructions on how to use Spack, see the [User Manual](https://spack.readthedocs.io). + +## Setting Up Spack: Avoiding the Package Cascade +By default, Spack doesn't 'see' anything on your system - including things like CMake and CUDA. +This can be limited by adding a `packages.yaml` to your `$HOME/.spack` folder that includes CMake (and CUDA, if applicable). For example, your `packages.yaml` file could be: +````yaml +packages: + cuda: + modules: + cuda@10.1.243: [cuda/10.1.243] + paths: + cuda@10.1.243: + /opt/local/ppc64le-pwr8-nvidia/cuda/10.1.243 + buildable: false + cmake: + modules: + cmake: [cmake/3.16.8] + paths: + cmake: + /opt/local/ppc64le/cmake/3.16.8 + buildable: false +```` +The `modules` entry is only necessary on systems that require loading Modules (i.e. most DOE systems). +The `buildable` flag is useful to make sure Spack crashes if there is a path error, +rather than having a type-o and Spack rebuilding everything because `cmake` isn't found. +You can verify your environment is set up correctly by running `spack graph` or `spack spec`. 
+For example: +````bash +> spack graph kokkos +cuda +o kokkos +|\ +o | cuda + / +o cmake +```` +Without the existing CUDA and CMake being identified in `packages.yaml`, a (subset!) of the output would be: +````bash +o kokkos +|\ +| o cmake +| |\ +| | | |\ +| | | | | |\ +| | | | | | | |\ +| | | | | | | | | |\ +| | | | | | | o | | | libarchive +| | | | | | | |\ \ \ \ +| | | | | | | | | |\ \ \ \ +| | | | | | | | | | | | |_|/ +| | | | | | | | | | | |/| | +| | | | | | | | | | | | | o curl +| | |_|_|_|_|_|_|_|_|_|_|/| +| |/| | | |_|_|_|_|_|_|_|/ +| | | | |/| | | | | | | | +| | | | o | | | | | | | | openssl +| |/| | | | | | | | | | | +| | | | | | | | | | o | | libxml2 +| | |_|_|_|_|_|_|_|/| | | +| | | | | | | | | | |\ \ \ +| o | | | | | | | | | | | | zlib +| / / / / / / / / / / / / +| o | | | | | | | | | | | xz +| / / / / / / / / / / / +| o | | | | | | | | | | rhash +| / / / / / / / / / / +| | | | o | | | | | | nettle +| | | | |\ \ \ \ \ \ \ +| | | o | | | | | | | | libuv +| | | | o | | | | | | | autoconf +| | |_|/| | | | | | | | +| | | | |/ / / / / / / +| o | | | | | | | | | perl +| o | | | | | | | | | gdbm +| o | | | | | | | | | readline +```` + +## Configuring Kokkos as a Project Dependency +Say you have a project "SuperScience" which needs to use Kokkos. +In your `package.py` file, you would generally include something like: +````python +class SuperScience(CMakePackage): + ... + depends_on("kokkos") +```` +Often projects want to tweak behavior when using certain features, e.g. +````python + depends_on("kokkos+cuda", when="+cuda") +```` +if your project needs CUDA-specific logic to configure and build. +This illustrates the general principle in Spack of "flowing-up". +A user requests a feature in the final app: +````bash +> spack install superscience+cuda +```` +This flows upstream to the Kokkos dependency, causing the `kokkos+cuda` variant to build. +The downstream app (SuperScience) tells the upstream app (Kokkos) how to build. 
+ +Because Kokkos is a performance portability library, it somewhat inverts this principle. +Kokkos "flows-down", telling your application how best to configure for performance. +Rather than a downstream app (SuperScience) telling the upstream (Kokkos) what variants to build, +a pre-built Kokkos should be telling the downstream app SuperScience what variants to use. +Kokkos works best when there is an "expert" configuration installed on your system. +Your build should simply request `-DKokkos_ROOT=<BEST_KOKKOS_FOR_MY_SYSTEM>` and configure appropriately based on the Kokkos it finds. + +Kokkos has many, many build variants. +Where possible, projects should only depend on a general Kokkos, not specific variants. +We recommend instead adding for each system you build on a Kokkos configuration to your `packages.yaml` file (usually found in `~/.spack` for specific users). +For a Xeon + Volta system, this could look like: +````yaml + kokkos: + variants: +cuda +openmp +cuda_lambda +wrapper ^cuda@10.1 cuda_arch=70 + compiler: [gcc@7.2.0] +```` +which gives the "best" Kokkos configuration as CUDA+OpenMP optimized for a Volta 70 architecture using CUDA 10.1. +It also enables support for CUDA Lambdas. +The `+wrapper` option tells Kokkos to build with the special `nvcc_wrapper` (more below). +Note here that we use the built-in `cuda_arch` variant of Spack to specify the architecture. +For a Haswell system, we use +````yaml + kokkos: + variants: +openmp std=14 target=haswell + compiler: [intel@18] +```` +which uses the built-in microarchitecture variants of Spack. +Consult the Spack documentation for more details of Spack microarchitectures +and CUDA architectures. +Spack does not currently provide an AMD GPU microarchitecture option. +If building for HIP or an AMD GPU, Kokkos provides an `amd_gpu_arch` similar to `cuda_arch`. 
+````yaml + kokkos: + variants: +hip amd_gpu_arch=vega900 +```` + +Without an optimal default in your `packages.yaml` file, it is highly likely that the default Kokkos configuration you get will not be what you want. +For example, CUDA is not enabled by default (there is no easy logic to conditionally activate this for CUDA-enabled systems). +If you don't specify a CUDA build variant in a `packages.yaml` and you build your Kokkos-dependent project: +````bash +> spack install superscience +```` +you may end up just getting the default Kokkos (i.e. Serial). +Some examples are included in the `config/yaml` folder for common platforms. +Before running `spack install <package>` we recommend running `spack spec <package>` to confirm your dependency tree is correct. +For example, with Kokkos Kernels: +````bash +kokkos-kernels@3.0%gcc@8.3.0~blas build_type=RelWithDebInfo ~cblas~complex_double~complex_float~cublas~cuda cuda_arch=none ~cusparse~diy+double execspace_cuda=auto execspace_openmp=auto execspace_serial=auto execspace_threads=auto ~float~lapack~lapacke+layoutleft~layoutright memspace_cudaspace=auto memspace_cudauvmspace=auto +memspace_hostspace~mkl+offset_int+offset_size_t~openmp+ordinal_int~ordinal_int64_t~serial~superlu arch=linux-rhel7-skylake_avx512 + ^cmake@3.16.2%gcc@8.3.0~doc+ncurses+openssl+ownlibs~qt arch=linux-rhel7-skylake_avx512 + ^kokkos@3.0%gcc@8.3.0~aggressive_vectorization~amdavx~armv80~armv81~armv8_thunderx~armv8_tx2~bdw~bgq build_type=RelWithDebInfo ~carrizo~compiler_warnings+cuda cuda_arch=none +cuda_lambda~cuda_ldg_intrinsic~cuda_relocatable_device_code~cuda_uvm~debug~debug_bounds_check~debug_dualview_modify_check~deprecated_code~diy~epyc~examples~explicit_instantiation~fiji~gfx901~hpx~hpx_async_dispatch~hsw~hwloc~kaveri~kepler30~kepler32~kepler35~kepler37~knc~knl~maxwell50~maxwell52~maxwell53~memkind~numactl+openmp~pascal60~pascal61~power7~power8~power9+profiling~profiling_load_print~pthread~qthread~rocm~ryzen~serial~skx~snb std=14 
~tests~turing75~vega+volta70~volta72+wrapper~wsm arch=linux-rhel7-skylake_avx512 + ^cuda@10.1%gcc@8.3.0 arch=linux-rhel7-skylake_avx512 + ^kokkos-nvcc-wrapper@old%gcc@8.3.0 build_type=RelWithDebInfo +mpi arch=linux-rhel7-skylake_avx512 + ^openmpi@4.0.2%gcc@8.3.0~cuda+cxx_exceptions fabrics=none ~java~legacylaunchers~memchecker patches=073477a76bba780c67c36e959cd3ee6910743e2735c7e76850ffba6791d498e4 ~pmi schedulers=none ~sqlite3~thread_multiple+vt arch=linux-rhel7-skylake_avx512 +```` +The output can be very verbose, but we can verify the expected `kokkos`: +````bash +kokkos@3.0%gcc@8.3.0~aggressive_vectorization~amdavx~armv80~armv81~armv8_thunderx~armv8_tx2~bdw~bgq build_type=RelWithDebInfo ~carrizo~compiler_warnings+cuda cuda_arch=none +cuda_lambda~cuda_ldg_intrinsic~cuda_relocatable_device_code~cuda_uvm~debug~debug_bounds_check~debug_dualview_modify_check~deprecated_code~diy~epyc~examples~explicit_instantiation~fiji~gfx901~hpx~hpx_async_dispatch~hsw~hwloc~kaveri~kepler30~kepler32~kepler35~kepler37~knc~knl~maxwell50~maxwell52~maxwell53~memkind~numactl+openmp~pascal60~pascal61~power7~power8~power9+profiling~profiling_load_print~pthread~qthread~rocm~ryzen~serial~skx~snb std=11 ~tests~turing75~vega+volta70~volta72+wrapper~wsm arch=linux-rhel7-skylake_avx512 +```` +We see that we do have `+volta70` and `+wrapper`, e.g. + +### Spack Environments +The encouraged way to use Spack is with Spack environments ([more details here](https://spack-tutorial.readthedocs.io/en/latest/tutorial_environments.html#dealing-with-many-specs-at-once)). +Rather than installing packages one-at-a-time, you add packages to an environment. +After adding all packages, you concretize and install them all. +Using environments, one can explicitly add a desired Kokkos for the environment, e.g. +````bash +> spack add kokkos +cuda +cuda_lambda +volta70 +> spack add my_project +my_variant +> ... 
+> spack install +```` +All packages within the environment will build against the CUDA-enabled Kokkos, +even if they only request a default Kokkos. + +## NVCC Wrapper +Kokkos is a C++ project, but often builds for the CUDA backend. +This is particularly problematic with CMake. At this point, `nvcc` does not accept all the flags that normally get passed to a C++ compiler. +Kokkos provides `nvcc_wrapper` that identifies correctly as a C++ compiler to CMake and accepts C++ flags, but uses `nvcc` as the underlying compiler. +`nvcc` itself also uses an underlying host compiler, e.g. GCC. + +In Spack, the underlying host compiler is specified as below, e.g.: +````bash +> spack install package %gcc@8.0.0 +```` +This is still valid for Kokkos. To use the special wrapper for CUDA builds, request a desired compiler and simply add the `+wrapper` variant. +````bash +> spack install kokkos +cuda +wrapper %gcc@7.2.0 +```` +Downstream projects depending on Kokkos need to override their compiler. +Kokkos provides the compiler in a `kokkos_cxx` variable, +which points to either `nvcc_wrapper` when needed or the regular compiler otherwise. +Spack projects already do this to use MPI compiler wrappers. +````python +def cmake_args(self): + options = [] + ... + options.append("-DCMAKE_CXX_COMPILER=%s" % self.spec["kokkos"].kokkos_cxx) + ... + return options +```` +Note: `nvcc_wrapper` works with the MPI compiler wrappers. +If building your project with MPI, do NOT set your compiler to `nvcc_wrapper`. +Instead set your compiler to `mpicxx` and `nvcc_wrapper` will be used under the hood. +````python +def cmake_args(self): + options = [] + ... + options.append("-DCMAKE_CXX_COMPILER=%s" % self.spec["mpi"].mpicxx) + ... + return options +```` +To accomplish this, `nvcc_wrapper` must depend on MPI (even though it uses no MPI). +This has the unfortunate consequence that Kokkos CUDA projects not using MPI will implicitly depend on MPI anyway. 
+This behavior is necessary for now, but will hopefully be removed later. +When using environments, if MPI is not needed, you can remove the MPI dependency with: +````bash +> spack add kokkos-nvcc-wrapper ~mpi +```` + +## Developing With Spack + +Spack has historically been much more suited to *deployment* of mature packages than active testing or developing. +However, recent features have improved support for development. +Future releases are likely to make this even easier and incorporate Git integration. +The most common commands will do a full build and install of the packages. +If doing development, you may wish to merely set up a build environment. +This allows you to modify the source and re-build. +In this case, you can stop after configuring. +Suppose you have Kokkos checkout in the folder `kokkos-src`: +````bash +> spack dev-build -d kokkos-src -u cmake kokkos@develop +wrapper +openmp +```` +This sets up a development environment for you in `kokkos-src` which you can use (Bash example shown): +Note: Always specify `develop` as the version when doing `dev-build`, except in rare cases. +You are usually developing a feature branch that will merge into `develop`, +hence you are making a new `develop` branch. 
+ +````bash +> cd kokkos-src +> source spack-build-env.txt +> cd spack-build +> make +```` +Before sourcing the Spack development environment, you may wish to save your current environment: +````bash +> declare -px > myenv.sh +```` +When done with Spack, you can then restore your original environment: +````bash +> source myenv.sh +```` diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd099054ba457e2b1a0557fd08be835f50eef939 --- /dev/null +++ b/packages/kokkos/algorithms/CMakeLists.txt @@ -0,0 +1,14 @@ + + +KOKKOS_SUBPACKAGE(Algorithms) + +IF (NOT Kokkos_INSTALL_TESTING) + ADD_SUBDIRECTORY(src) +ENDIF() + +KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) + +KOKKOS_SUBPACKAGE_POSTPROCESS() + + + diff --git a/packages/kokkos/algorithms/cmake/Dependencies.cmake b/packages/kokkos/algorithms/cmake/Dependencies.cmake new file mode 100644 index 0000000000000000000000000000000000000000..1b413106817cc6adf18dc94189203a27e641c6d5 --- /dev/null +++ b/packages/kokkos/algorithms/cmake/Dependencies.cmake @@ -0,0 +1,5 @@ +TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( + LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX + TEST_OPTIONAL_TPLS CUSPARSE + ) diff --git a/packages/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in b/packages/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in new file mode 100644 index 0000000000000000000000000000000000000000..67334b70f36b6db55b225f25c91d8a8c4cb3aaab --- /dev/null +++ b/packages/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in @@ -0,0 +1,4 @@ +#ifndef KOKKOS_ALGORITHMS_CONFIG_H +#define KOKKOS_ALGORITHMS_CONFIG_H + +#endif diff --git a/packages/kokkos/algorithms/src/CMakeLists.txt b/packages/kokkos/algorithms/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf5564032c20bdae4593f44fc66c8b1e39e0833f --- /dev/null +++ 
b/packages/kokkos/algorithms/src/CMakeLists.txt @@ -0,0 +1,36 @@ + +KOKKOS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h) + +#I have to leave these here for tribits +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +#----------------------------------------------------------------------------- + +FILE(GLOB ALGO_HEADERS *.hpp) +FILE(GLOB ALGO_SOURCES *.cpp) +LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h) + +INSTALL ( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" + DESTINATION ${KOKKOS_HEADER_DIR} + FILES_MATCHING PATTERN "*.hpp" +) + +#----------------------------------------------------------------------------- + +# We have to pass the sources in here for Tribits +# These will get ignored for standalone CMake and a true interface library made +KOKKOS_ADD_INTERFACE_LIBRARY( + kokkosalgorithms + HEADERS ${ALGO_HEADERS} + SOURCES ${ALGO_SOURCES} +) +KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms + ${KOKKOS_TOP_BUILD_DIR} + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} +) + + + diff --git a/packages/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp b/packages/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9c08a088b0e108f78728fcc00742baaab441d4e2 --- /dev/null +++ b/packages/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp @@ -0,0 +1 @@ +void KOKKOS_ALGORITHMS_SRC_DUMMY_PREVENT_LINK_ERROR() {} diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp new file mode 100644 index 0000000000000000000000000000000000000000..904cf5ccb967037d94ac9b4a06144a4f7333dd3d --- /dev/null +++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp @@ -0,0 +1,1727 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_RANDOM_HPP +#define KOKKOS_RANDOM_HPP + +#include <Kokkos_Core.hpp> +#include <Kokkos_Complex.hpp> +#include <cstdio> +#include <cstdlib> +#include <cmath> + +/// \file Kokkos_Random.hpp +/// \brief Pseudorandom number generators +/// +/// These generators are based on Vigna, Sebastiano (2014). "An +/// experimental exploration of Marsaglia's xorshift generators, +/// scrambled." See: http://arxiv.org/abs/1402.6246 + +namespace Kokkos { + +// clang-format off + /*Template functions to get equidistributed random numbers from a generator for a specific Scalar type + + template<class Generator,Scalar> + struct rand{ + + //Max value returned by draw(Generator& gen) + KOKKOS_INLINE_FUNCTION + static Scalar max(); + + //Returns a value between zero and max() + KOKKOS_INLINE_FUNCTION + static Scalar draw(Generator& gen); + + //Returns a value between zero and range() + //Note: for floating point values range can be larger than max() + KOKKOS_INLINE_FUNCTION + static Scalar draw(Generator& gen, const Scalar& range){} + + //Return value between start and end + KOKKOS_INLINE_FUNCTION + static Scalar draw(Generator& gen, const Scalar& start, const Scalar& end); + }; + + The Random number generators themselves have two components a state-pool and the actual generator + A state-pool manages a number of generators, so that each active thread is able to grep its own. + This allows the generation of random numbers which are independent between threads. Note that + in contrast to CuRand none of the functions of the pool (or the generator) are collectives, + i.e. all functions can be called inside conditionals. 
+ + template<class Device> + class Pool { + public: + //The Kokkos device type + using device_type = Device; + //The actual generator type + using generator_type = Generator<Device>; + + //Default constructor: does not initialize a pool + Pool(); + + //Initializing constructor: calls init(seed,Device_Specific_Number); + Pool(unsigned int seed); + + //Initialize Pool with seed as a starting seed with a pool_size of num_states + //The Random_XorShift64 generator is used in serial to initialize all states, + //thus the initialization process is platform independent and deterministic. + void init(unsigned int seed, int num_states); + + //Get a generator. This will lock one of the states, guaranteeing that each thread + //will have its private generator. Note: on Cuda getting a state involves atomics, + //and is thus not deterministic! + generator_type get_state(); + + //Give a state back to the pool. This unlocks the state, and writes the modified + //state of the generator back to the pool. + void free_state(generator_type gen); + + } + + template<class Device> + class Generator { + public: + //The Kokkos device type + using device_type = DeviceType; + + //Max return values of respective [X]rand[S]() functions + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast<int>(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)}; + + + //Init with a state and the idx with respect to pool. 
Note: in serial the + //Generator can be used by just giving it the necessary state arguments + KOKKOS_INLINE_FUNCTION + Generator (STATE_ARGUMENTS, int state_idx = 0); + + //Draw a equidistributed uint32_t in the range [0,MAX_URAND) + KOKKOS_INLINE_FUNCTION + uint32_t urand(); + + //Draw a equidistributed uint64_t in the range [0,MAX_URAND64) + KOKKOS_INLINE_FUNCTION + uint64_t urand64(); + + //Draw a equidistributed uint32_t in the range [0,range) + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range); + + //Draw a equidistributed uint32_t in the range [start,end) + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ); + + //Draw a equidistributed uint64_t in the range [0,range) + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range); + + //Draw a equidistributed uint64_t in the range [start,end) + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ); + + //Draw a equidistributed int in the range [0,MAX_RAND) + KOKKOS_INLINE_FUNCTION + int rand(); + + //Draw a equidistributed int in the range [0,range) + KOKKOS_INLINE_FUNCTION + int rand(const int& range); + + //Draw a equidistributed int in the range [start,end) + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ); + + //Draw a equidistributed int64_t in the range [0,MAX_RAND64) + KOKKOS_INLINE_FUNCTION + int64_t rand64(); + + //Draw a equidistributed int64_t in the range [0,range) + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range); + + //Draw a equidistributed int64_t in the range [start,end) + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ); + + //Draw a equidistributed float in the range [0,1.0) + KOKKOS_INLINE_FUNCTION + float frand(); + + //Draw a equidistributed float in the range [0,range) + KOKKOS_INLINE_FUNCTION + float frand(const float& range); + + //Draw a equidistributed float in the range [start,end) + KOKKOS_INLINE_FUNCTION + float frand(const 
float& start, const float& end ); + + //Draw a equidistributed double in the range [0,1.0) + KOKKOS_INLINE_FUNCTION + double drand(); + + //Draw a equidistributed double in the range [0,range) + KOKKOS_INLINE_FUNCTION + double drand(const double& range); + + //Draw a equidistributed double in the range [start,end) + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ); + + //Draw a standard normal distributed double + KOKKOS_INLINE_FUNCTION + double normal() ; + + //Draw a normal distributed double with given mean and standard deviation + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0); + } + + //Additional Functions: + + //Fills view with random numbers in the range [0,range) + template<class ViewType, class PoolType> + void fill_random(ViewType view, PoolType pool, ViewType::value_type range); + + //Fills view with random numbers in the range [start,end) + template<class ViewType, class PoolType> + void fill_random(ViewType view, PoolType pool, + ViewType::value_type start, ViewType::value_type end); + +*/ +// clang-format on + +template <class Generator, class Scalar> +struct rand; + +template <class Generator> +struct rand<Generator, char> { + KOKKOS_INLINE_FUNCTION + static short max() { return 127; } + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen) { + return short((gen.rand() & 0xff + 256) % 256); + } + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const char& range) { + return char(gen.rand(range)); + } + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const char& start, const char& end) { + return char(gen.rand(start, end)); + } +}; + +template <class Generator> +struct rand<Generator, short> { + KOKKOS_INLINE_FUNCTION + static short max() { return 32767; } + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen) { + return short((gen.rand() & 0xffff + 65536) % 32768); + } + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const short& range) { 
+ return short(gen.rand(range)); + } + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const short& start, const short& end) { + return short(gen.rand(start, end)); + } +}; + +template <class Generator> +struct rand<Generator, int> { + KOKKOS_INLINE_FUNCTION + static int max() { return Generator::MAX_RAND; } + KOKKOS_INLINE_FUNCTION + static int draw(Generator& gen) { return gen.rand(); } + KOKKOS_INLINE_FUNCTION + static int draw(Generator& gen, const int& range) { return gen.rand(range); } + KOKKOS_INLINE_FUNCTION + static int draw(Generator& gen, const int& start, const int& end) { + return gen.rand(start, end); + } +}; + +template <class Generator> +struct rand<Generator, unsigned int> { + KOKKOS_INLINE_FUNCTION + static unsigned int max() { return Generator::MAX_URAND; } + KOKKOS_INLINE_FUNCTION + static unsigned int draw(Generator& gen) { return gen.urand(); } + KOKKOS_INLINE_FUNCTION + static unsigned int draw(Generator& gen, const unsigned int& range) { + return gen.urand(range); + } + KOKKOS_INLINE_FUNCTION + static unsigned int draw(Generator& gen, const unsigned int& start, + const unsigned int& end) { + return gen.urand(start, end); + } +}; + +template <class Generator> +struct rand<Generator, long> { + KOKKOS_INLINE_FUNCTION + static long max() { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof(long) == 4 ? static_cast<long>(Generator::MAX_RAND) + : static_cast<long>(Generator::MAX_RAND64); + } + KOKKOS_INLINE_FUNCTION + static long draw(Generator& gen) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof(long) == 4 ? 
static_cast<long>(gen.rand()) + : static_cast<long>(gen.rand64()); + } + KOKKOS_INLINE_FUNCTION + static long draw(Generator& gen, const long& range) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof(long) == 4 + ? static_cast<long>(gen.rand(static_cast<int>(range))) + : static_cast<long>(gen.rand64(range)); + } + KOKKOS_INLINE_FUNCTION + static long draw(Generator& gen, const long& start, const long& end) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof(long) == 4 + ? static_cast<long>( + gen.rand(static_cast<int>(start), static_cast<int>(end))) + : static_cast<long>(gen.rand64(start, end)); + } +}; + +template <class Generator> +struct rand<Generator, unsigned long> { + KOKKOS_INLINE_FUNCTION + static unsigned long max() { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof(unsigned long) == 4 + ? static_cast<unsigned long>(Generator::MAX_URAND) + : static_cast<unsigned long>(Generator::MAX_URAND64); + } + KOKKOS_INLINE_FUNCTION + static unsigned long draw(Generator& gen) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof(unsigned long) == 4 + ? static_cast<unsigned long>(gen.urand()) + : static_cast<unsigned long>(gen.urand64()); + } + KOKKOS_INLINE_FUNCTION + static unsigned long draw(Generator& gen, const unsigned long& range) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof(unsigned long) == 4 + ? 
static_cast<unsigned long>( + gen.urand(static_cast<unsigned int>(range))) + : static_cast<unsigned long>(gen.urand64(range)); + } + KOKKOS_INLINE_FUNCTION + static unsigned long draw(Generator& gen, const unsigned long& start, + const unsigned long& end) { + // FIXME (mfh 26 Oct 2014) It would be better to select the + // return value at compile time, using something like enable_if. + return sizeof(unsigned long) == 4 + ? static_cast<unsigned long>( + gen.urand(static_cast<unsigned int>(start), + static_cast<unsigned int>(end))) + : static_cast<unsigned long>(gen.urand64(start, end)); + } +}; + +// NOTE (mfh 26 oct 2014) This is a partial specialization for long +// long, a C99 / C++11 signed type which is guaranteed to be at +// least 64 bits. Do NOT write a partial specialization for +// int64_t!!! This is just an alias! It could be either long or +// long long. We don't know which a priori, and I've seen both. +// The types long and long long are guaranteed to differ, so it's +// always safe to specialize for both. +template <class Generator> +struct rand<Generator, long long> { + KOKKOS_INLINE_FUNCTION + static long long max() { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. + return Generator::MAX_RAND64; + } + KOKKOS_INLINE_FUNCTION + static long long draw(Generator& gen) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. + return gen.rand64(); + } + KOKKOS_INLINE_FUNCTION + static long long draw(Generator& gen, const long long& range) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. + return gen.rand64(range); + } + KOKKOS_INLINE_FUNCTION + static long long draw(Generator& gen, const long long& start, + const long long& end) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. 
+ return gen.rand64(start, end); + } +}; + +// NOTE (mfh 26 oct 2014) This is a partial specialization for +// unsigned long long, a C99 / C++11 unsigned type which is +// guaranteed to be at least 64 bits. Do NOT write a partial +// specialization for uint64_t!!! This is just an alias! It could +// be either unsigned long or unsigned long long. We don't know +// which a priori, and I've seen both. The types unsigned long and +// unsigned long long are guaranteed to differ, so it's always safe +// to specialize for both. +template <class Generator> +struct rand<Generator, unsigned long long> { + KOKKOS_INLINE_FUNCTION + static unsigned long long max() { + // FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64 + // bits. + return Generator::MAX_URAND64; + } + KOKKOS_INLINE_FUNCTION + static unsigned long long draw(Generator& gen) { + // FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64 + // bits. + return gen.urand64(); + } + KOKKOS_INLINE_FUNCTION + static unsigned long long draw(Generator& gen, + const unsigned long long& range) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. + return gen.urand64(range); + } + KOKKOS_INLINE_FUNCTION + static unsigned long long draw(Generator& gen, + const unsigned long long& start, + const unsigned long long& end) { + // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits. 
+ return gen.urand64(start, end); + } +}; + +template <class Generator> +struct rand<Generator, float> { + KOKKOS_INLINE_FUNCTION + static float max() { return 1.0f; } + KOKKOS_INLINE_FUNCTION + static float draw(Generator& gen) { return gen.frand(); } + KOKKOS_INLINE_FUNCTION + static float draw(Generator& gen, const float& range) { + return gen.frand(range); + } + KOKKOS_INLINE_FUNCTION + static float draw(Generator& gen, const float& start, const float& end) { + return gen.frand(start, end); + } +}; + +template <class Generator> +struct rand<Generator, double> { + KOKKOS_INLINE_FUNCTION + static double max() { return 1.0; } + KOKKOS_INLINE_FUNCTION + static double draw(Generator& gen) { return gen.drand(); } + KOKKOS_INLINE_FUNCTION + static double draw(Generator& gen, const double& range) { + return gen.drand(range); + } + KOKKOS_INLINE_FUNCTION + static double draw(Generator& gen, const double& start, const double& end) { + return gen.drand(start, end); + } +}; + +template <class Generator> +struct rand<Generator, Kokkos::complex<float> > { + KOKKOS_INLINE_FUNCTION + static Kokkos::complex<float> max() { + return Kokkos::complex<float>(1.0, 1.0); + } + KOKKOS_INLINE_FUNCTION + static Kokkos::complex<float> draw(Generator& gen) { + const float re = gen.frand(); + const float im = gen.frand(); + return Kokkos::complex<float>(re, im); + } + KOKKOS_INLINE_FUNCTION + static Kokkos::complex<float> draw(Generator& gen, + const Kokkos::complex<float>& range) { + const float re = gen.frand(real(range)); + const float im = gen.frand(imag(range)); + return Kokkos::complex<float>(re, im); + } + KOKKOS_INLINE_FUNCTION + static Kokkos::complex<float> draw(Generator& gen, + const Kokkos::complex<float>& start, + const Kokkos::complex<float>& end) { + const float re = gen.frand(real(start), real(end)); + const float im = gen.frand(imag(start), imag(end)); + return Kokkos::complex<float>(re, im); + } +}; + +template <class Generator> +struct rand<Generator, 
Kokkos::complex<double> > { + KOKKOS_INLINE_FUNCTION + static Kokkos::complex<double> max() { + return Kokkos::complex<double>(1.0, 1.0); + } + KOKKOS_INLINE_FUNCTION + static Kokkos::complex<double> draw(Generator& gen) { + const double re = gen.drand(); + const double im = gen.drand(); + return Kokkos::complex<double>(re, im); + } + KOKKOS_INLINE_FUNCTION + static Kokkos::complex<double> draw(Generator& gen, + const Kokkos::complex<double>& range) { + const double re = gen.drand(real(range)); + const double im = gen.drand(imag(range)); + return Kokkos::complex<double>(re, im); + } + KOKKOS_INLINE_FUNCTION + static Kokkos::complex<double> draw(Generator& gen, + const Kokkos::complex<double>& start, + const Kokkos::complex<double>& end) { + const double re = gen.drand(real(start), real(end)); + const double im = gen.drand(imag(start), imag(end)); + return Kokkos::complex<double>(re, im); + } +}; + +template <class DeviceType> +class Random_XorShift1024_Pool; + +namespace Impl { + +template <bool UseCArrayState> +struct Random_XorShift1024_State { + uint64_t state_[16]; + KOKKOS_DEFAULTED_FUNCTION + Random_XorShift1024_State() = default; + + template <class StateViewType> + KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v, + int state_idx) { + for (int i = 0; i < 16; i++) state_[i] = v(state_idx, i); + } + + KOKKOS_FUNCTION + uint64_t operator[](const int i) const { return state_[i]; } + + KOKKOS_FUNCTION + uint64_t& operator[](const int i) { return state_[i]; } +}; + +template <> +struct Random_XorShift1024_State<false> { + uint64_t* state_; + const int stride_; + KOKKOS_FUNCTION + Random_XorShift1024_State() : state_(nullptr), stride_(1){}; + + template <class StateViewType> + KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v, + int state_idx) + : state_(&v(state_idx, 0)), stride_(v.stride_1()) {} + + KOKKOS_FUNCTION + uint64_t operator[](const int i) const { return state_[i * stride_]; } + + KOKKOS_FUNCTION + uint64_t& 
operator[](const int i) { return state_[i * stride_]; } +}; + +template <class ExecutionSpace> +struct Random_XorShift1024_UseCArrayState : std::true_type {}; + +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct Random_XorShift1024_UseCArrayState<Kokkos::Cuda> : std::false_type {}; +#endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::HIP> + : std::false_type {}; +#endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET +template <> +struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget> + : std::false_type {}; +#endif + +template <class ExecutionSpace> +struct Random_UniqueIndex { + using locks_view_type = View<int*, ExecutionSpace>; + KOKKOS_FUNCTION + static int get_state_idx(const locks_view_type) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + const int i = ExecutionSpace::impl_hardware_thread_id(); + return i; +#else + return 0; +#endif + } +}; + +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct Random_UniqueIndex<Kokkos::Cuda> { + using locks_view_type = View<int*, Kokkos::Cuda>; + KOKKOS_FUNCTION + static int get_state_idx(const locks_view_type& locks_) { +#ifdef __CUDA_ARCH__ + const int i_offset = + (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z; + int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) * + blockDim.x * blockDim.y * blockDim.z + + i_offset) % + locks_.extent(0); + while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + i += blockDim.x * blockDim.y * blockDim.z; + if (i >= static_cast<int>(locks_.extent(0))) { + i = i_offset; + } + } + return i; +#else + (void)locks_; + return 0; +#endif + } +}; +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +struct Random_UniqueIndex<Kokkos::Experimental::HIP> { + using locks_view_type = View<int*, Kokkos::Experimental::HIP>; + KOKKOS_FUNCTION + static int get_state_idx(const locks_view_type& locks_) { +#ifdef __HIP_DEVICE_COMPILE__ + const int i_offset = + (threadIdx.x * blockDim.y + 
threadIdx.y) * blockDim.z + threadIdx.z; + int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) * + blockDim.x * blockDim.y * blockDim.z + + i_offset) % + locks_.extent(0); + while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + i += blockDim.x * blockDim.y * blockDim.z; + if (i >= static_cast<int>(locks_.extent(0))) { + i = i_offset; + } + } + return i; +#else + (void)locks_; + return 0; +#endif + } +}; +#endif + +#ifdef KOKKOS_ENABLE_SYCL +template <> +struct Random_UniqueIndex<Kokkos::Experimental::SYCL> { + using locks_view_type = View<int*, Kokkos::Experimental::SYCL>; + KOKKOS_FUNCTION + static int get_state_idx(const locks_view_type& locks_) { +#ifdef KOKKOS_ARCH_INTEL_GEN + int i = Kokkos::Impl::clock_tic() % locks_.extent(0); +#else + int i = 0; +#endif + while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + i = (i + 1) % static_cast<int>(locks_.extent(0)); + } + return i; + } +}; +#endif + +} // namespace Impl + +template <class DeviceType> +class Random_XorShift64_Pool; + +template <class DeviceType> +class Random_XorShift64 { + private: + uint64_t state_; + const int state_idx_; + friend class Random_XorShift64_Pool<DeviceType>; + + public: + using device_type = DeviceType; + + constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max(); + constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max(); + constexpr static int32_t MAX_RAND = std::numeric_limits<int32_t>::max(); + constexpr static int64_t MAX_RAND64 = std::numeric_limits<int64_t>::max(); + + KOKKOS_INLINE_FUNCTION + Random_XorShift64(uint64_t state, int state_idx = 0) + : state_(state == 0 ? 
uint64_t(1318319) : state), state_idx_(state_idx) {} + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + state_ ^= state_ >> 12; + state_ ^= state_ << 25; + state_ ^= state_ >> 27; + + uint64_t tmp = state_ * 2685821657736338717ULL; + tmp = tmp >> 16; + return static_cast<uint32_t>(tmp & MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + state_ ^= state_ >> 12; + state_ ^= state_ << 25; + state_ ^= state_ >> 27; + return (state_ * 2685821657736338717ULL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND / range) * range; + uint32_t tmp = urand(); + while (tmp >= max_val) tmp = urand(); + return tmp % range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end) { + return urand(end - start) + start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64 / range) * range; + uint64_t tmp = urand64(); + while (tmp >= max_val) tmp = urand64(); + return tmp % range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end) { + return urand64(end - start) + start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { return static_cast<int>(urand() / 2); } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND / range) * range; + int tmp = rand(); + while (tmp >= max_val) tmp = rand(); + return tmp % range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end) { + return rand(end - start) + start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { return static_cast<int64_t>(urand64() / 2); } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64 / range) * range; + int64_t tmp = rand64(); + while (tmp >= max_val) tmp = rand64(); + return tmp % range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end) { + return rand64(end 
- start) + start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { return urand64() / static_cast<float>(MAX_URAND64); } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64() / static_cast<float>(MAX_URAND64); + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end) { + return frand(end - start) + start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { return urand64() / static_cast<double>(MAX_URAND64); } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64() / static_cast<double>(MAX_URAND64); + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end) { + return drand(end - start) + start; + } + + // Marsaglia polar method for drawing a standard normal distributed random + // number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while (S >= 1.0) { + U = 2.0 * drand() - 1.0; + const double V = 2.0 * drand() - 1.0; + S = U * U + V * V; + } + return U * std::sqrt(-2.0 * std::log(S) / S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev = 1.0) { + return mean + normal() * std_dev; + } +}; + +template <class DeviceType = Kokkos::DefaultExecutionSpace> +class Random_XorShift64_Pool { + private: + using execution_space = typename DeviceType::execution_space; + using locks_type = View<int*, execution_space>; + using state_data_type = View<uint64_t*, DeviceType>; + locks_type locks_; + state_data_type state_; + int num_states_; + + public: + using generator_type = Random_XorShift64<DeviceType>; + using device_type = DeviceType; + + KOKKOS_INLINE_FUNCTION + Random_XorShift64_Pool() { num_states_ = 0; } + Random_XorShift64_Pool(uint64_t seed) { + num_states_ = 0; + + init(seed, execution_space().concurrency()); + } + + KOKKOS_INLINE_FUNCTION + Random_XorShift64_Pool(const Random_XorShift64_Pool& src) + : locks_(src.locks_), state_(src.state_), num_states_(src.num_states_) {} + + 
KOKKOS_INLINE_FUNCTION + Random_XorShift64_Pool operator=(const Random_XorShift64_Pool& src) { + locks_ = src.locks_; + state_ = src.state_; + num_states_ = src.num_states_; + return *this; + } + + void init(uint64_t seed, int num_states) { + if (seed == 0) seed = uint64_t(1318319); + + num_states_ = num_states; + + locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_); + state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_); + + typename state_data_type::HostMirror h_state = create_mirror_view(state_); + typename locks_type::HostMirror h_lock = create_mirror_view(locks_); + + // Execute on the HostMirror's default execution space. + Random_XorShift64<typename state_data_type::HostMirror::execution_space> + gen(seed, 0); + for (int i = 0; i < 17; i++) gen.rand(); + for (int i = 0; i < num_states_; i++) { + int n1 = gen.rand(); + int n2 = gen.rand(); + int n3 = gen.rand(); + int n4 = gen.rand(); + h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) | + (((static_cast<uint64_t>(n2)) & 0xffff) << 16) | + (((static_cast<uint64_t>(n3)) & 0xffff) << 32) | + (((static_cast<uint64_t>(n4)) & 0xffff) << 48); + h_lock(i) = 0; + } + deep_copy(state_, h_state); + deep_copy(locks_, h_lock); + } + + KOKKOS_INLINE_FUNCTION + Random_XorShift64<DeviceType> get_state() const { + const int i = + Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_); + return Random_XorShift64<DeviceType>(state_(i), i); + } + + // NOTE: state_idx MUST be unique and less than num_states + KOKKOS_INLINE_FUNCTION + Random_XorShift64<DeviceType> get_state(const int state_idx) const { + return Random_XorShift64<DeviceType>(state_(state_idx), state_idx); + } + + KOKKOS_INLINE_FUNCTION + void free_state(const Random_XorShift64<DeviceType>& state) const { + state_(state.state_idx_) = state.state_; + locks_(state.state_idx_) = 0; + } +}; + +template <class DeviceType> +class Random_XorShift1024 { + using execution_space = typename 
DeviceType::execution_space; + + private: + int p_; + const int state_idx_; + Impl::Random_XorShift1024_State< + Impl::Random_XorShift1024_UseCArrayState<execution_space>::value> + state_; + friend class Random_XorShift1024_Pool<DeviceType>; + + public: + using pool_type = Random_XorShift1024_Pool<DeviceType>; + using device_type = DeviceType; + + constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max(); + constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max(); + constexpr static int32_t MAX_RAND = std::numeric_limits<int32_t>::max(); + constexpr static int64_t MAX_RAND64 = std::numeric_limits<int64_t>::max(); + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024(const typename pool_type::state_data_type& state, int p, + int state_idx = 0) + : p_(p), state_idx_(state_idx), state_(state, state_idx) {} + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + uint64_t state_0 = state_[p_]; + uint64_t state_1 = state_[p_ = (p_ + 1) & 15]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + uint64_t tmp = (state_[p_] = state_0 ^ state_1) * 1181783497276652981ULL; + tmp = tmp >> 16; + return static_cast<uint32_t>(tmp & MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + uint64_t state_0 = state_[p_]; + uint64_t state_1 = state_[p_ = (p_ + 1) & 15]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + return ((state_[p_] = state_0 ^ state_1) * 1181783497276652981LL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND / range) * range; + uint32_t tmp = urand(); + while (tmp >= max_val) tmp = urand(); + return tmp % range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end) { + return urand(end - start) + start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64 / range) * range; + uint64_t tmp = 
urand64(); + while (tmp >= max_val) tmp = urand64(); + return tmp % range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end) { + return urand64(end - start) + start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { return static_cast<int>(urand() / 2); } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND / range) * range; + int tmp = rand(); + while (tmp >= max_val) tmp = rand(); + return tmp % range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end) { + return rand(end - start) + start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { return static_cast<int64_t>(urand64() / 2); } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64 / range) * range; + int64_t tmp = rand64(); + while (tmp >= max_val) tmp = rand64(); + return tmp % range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end) { + return rand64(end - start) + start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { return urand64() / static_cast<float>(MAX_URAND64); } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64() / static_cast<float>(MAX_URAND64); + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end) { + return frand(end - start) + start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { return urand64() / static_cast<double>(MAX_URAND64); } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64() / static_cast<double>(MAX_URAND64); + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end) { + return drand(end - start) + start; + } + + // Marsaglia polar method for drawing a standard normal distributed random + // number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while (S >= 1.0) { + U = 2.0 * drand() - 1.0; + const double V = 2.0 * drand() 
- 1.0; + S = U * U + V * V; + } + return U * std::sqrt(-2.0 * std::log(S) / S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev = 1.0) { + return mean + normal() * std_dev; + } +}; + +template <class DeviceType = Kokkos::DefaultExecutionSpace> +class Random_XorShift1024_Pool { + private: + using execution_space = typename DeviceType::execution_space; + using locks_type = View<int*, execution_space>; + using int_view_type = View<int*, DeviceType>; + using state_data_type = View<uint64_t * [16], DeviceType>; + + locks_type locks_; + state_data_type state_; + int_view_type p_; + int num_states_; + friend class Random_XorShift1024<DeviceType>; + + public: + using generator_type = Random_XorShift1024<DeviceType>; + + using device_type = DeviceType; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024_Pool() { num_states_ = 0; } + + inline Random_XorShift1024_Pool(uint64_t seed) { + num_states_ = 0; + + init(seed, execution_space().concurrency()); + } + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src) + : locks_(src.locks_), + state_(src.state_), + p_(src.p_), + num_states_(src.num_states_) {} + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024_Pool operator=(const Random_XorShift1024_Pool& src) { + locks_ = src.locks_; + state_ = src.state_; + p_ = src.p_; + num_states_ = src.num_states_; + return *this; + } + + inline void init(uint64_t seed, int num_states) { + if (seed == 0) seed = uint64_t(1318319); + num_states_ = num_states; + locks_ = locks_type("Kokkos::Random_XorShift1024::locks", num_states_); + state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_); + p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_); + + typename state_data_type::HostMirror h_state = create_mirror_view(state_); + typename locks_type::HostMirror h_lock = create_mirror_view(locks_); + typename int_view_type::HostMirror h_p = create_mirror_view(p_); + + // Execute on the HostMirror's 
default execution space. + Random_XorShift64<typename state_data_type::HostMirror::execution_space> + gen(seed, 0); + for (int i = 0; i < 17; i++) gen.rand(); + for (int i = 0; i < num_states_; i++) { + for (int j = 0; j < 16; j++) { + int n1 = gen.rand(); + int n2 = gen.rand(); + int n3 = gen.rand(); + int n4 = gen.rand(); + h_state(i, j) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) | + (((static_cast<uint64_t>(n2)) & 0xffff) << 16) | + (((static_cast<uint64_t>(n3)) & 0xffff) << 32) | + (((static_cast<uint64_t>(n4)) & 0xffff) << 48); + } + h_p(i) = 0; + h_lock(i) = 0; + } + deep_copy(state_, h_state); + deep_copy(locks_, h_lock); + } + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024<DeviceType> get_state() const { + const int i = + Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_); + return Random_XorShift1024<DeviceType>(state_, p_(i), i); + }; + + // NOTE: state_idx MUST be unique and less than num_states + KOKKOS_INLINE_FUNCTION + Random_XorShift1024<DeviceType> get_state(const int state_idx) const { + return Random_XorShift1024<DeviceType>(state_, p_(state_idx), state_idx); + } + + KOKKOS_INLINE_FUNCTION + void free_state(const Random_XorShift1024<DeviceType>& state) const { + for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i]; + p_(state.state_idx_) = state.p_; + locks_(state.state_idx_) = 0; + } +}; + +namespace Impl { + +template <class ViewType, class RandomPool, int loops, int rank, + class IndexType> +struct fill_random_functor_range; +template <class ViewType, class RandomPool, int loops, int rank, + class IndexType> +struct fill_random_functor_begin_end; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType, RandomPool, loops, 1, IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + using Rand = rand<typename RandomPool::generator_type, + 
typename ViewType::non_const_value_type>; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_) + : a(a_), rand_pool(rand_pool_), range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const IndexType& i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) + a(idx) = Rand::draw(gen, range); + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType, RandomPool, loops, 2, IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_) + : a(a_), rand_pool(rand_pool_), range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + a(idx, k) = Rand::draw(gen, range); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType, RandomPool, loops, 3, IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_range(ViewType a_, RandomPool 
rand_pool_, + typename ViewType::const_value_type range_) + : a(a_), rand_pool(rand_pool_), range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + a(idx, k, l) = Rand::draw(gen, range); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType, RandomPool, loops, 4, IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_) + : a(a_), rand_pool(rand_pool_), range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++) + a(idx, k, l, m) = Rand::draw(gen, range); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType, RandomPool, loops, 5, IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename 
ViewType::const_value_type range; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_) + : a(a_), rand_pool(rand_pool_), range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++) + for (IndexType n = 0; n < static_cast<IndexType>(a.extent(4)); + n++) + a(idx, k, l, m, n) = Rand::draw(gen, range); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType, RandomPool, loops, 6, IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_) + : a(a_), rand_pool(rand_pool_), range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++) + for (IndexType n = 
0; n < static_cast<IndexType>(a.extent(4)); + n++) + for (IndexType o = 0; o < static_cast<IndexType>(a.extent(5)); + o++) + a(idx, k, l, m, n, o) = Rand::draw(gen, range); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType, RandomPool, loops, 7, IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_) + : a(a_), rand_pool(rand_pool_), range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++) + for (IndexType n = 0; n < static_cast<IndexType>(a.extent(4)); + n++) + for (IndexType o = 0; o < static_cast<IndexType>(a.extent(5)); + o++) + for (IndexType p = 0; p < static_cast<IndexType>(a.extent(6)); + p++) + a(idx, k, l, m, n, o, p) = Rand::draw(gen, range); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_range<ViewType, RandomPool, loops, 8, IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + 
fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_) + : a(a_), rand_pool(rand_pool_), range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++) + for (IndexType n = 0; n < static_cast<IndexType>(a.extent(4)); + n++) + for (IndexType o = 0; o < static_cast<IndexType>(a.extent(5)); + o++) + for (IndexType p = 0; p < static_cast<IndexType>(a.extent(6)); + p++) + for (IndexType q = 0; + q < static_cast<IndexType>(a.extent(7)); q++) + a(idx, k, l, m, n, o, p, q) = Rand::draw(gen, range); + } + } + rand_pool.free_state(gen); + } +}; +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1, + IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin, end; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, + typename ViewType::const_value_type end_) + : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) + a(idx) = Rand::draw(gen, begin, end); + } + rand_pool.free_state(gen); + } +}; 
+ +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2, + IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin, end; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, + typename ViewType::const_value_type end_) + : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + a(idx, k) = Rand::draw(gen, begin, end); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3, + IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin, end; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, + typename ViewType::const_value_type end_) + : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < 
static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + a(idx, k, l) = Rand::draw(gen, begin, end); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4, + IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin, end; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, + typename ViewType::const_value_type end_) + : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++) + a(idx, k, l, m) = Rand::draw(gen, begin, end); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5, + IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin, end; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, + typename ViewType::const_value_type end_) + : a(a_), 
rand_pool(rand_pool_), begin(begin_), end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(1)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(2)); m++) + for (IndexType n = 0; n < static_cast<IndexType>(a.extent(3)); n++) + for (IndexType o = 0; o < static_cast<IndexType>(a.extent(4)); + o++) + a(idx, l, m, n, o) = Rand::draw(gen, begin, end); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6, + IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin, end; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, + typename ViewType::const_value_type end_) + : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++) + for (IndexType n = 0; n < static_cast<IndexType>(a.extent(4)); + n++) + for (IndexType o = 0; o < static_cast<IndexType>(a.extent(5)); + o++) + a(idx, k, l, m, n, o) = Rand::draw(gen, begin, 
end); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7, + IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin, end; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, + typename ViewType::const_value_type end_) + : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++) + for (IndexType n = 0; n < static_cast<IndexType>(a.extent(4)); + n++) + for (IndexType o = 0; o < static_cast<IndexType>(a.extent(5)); + o++) + for (IndexType p = 0; p < static_cast<IndexType>(a.extent(6)); + p++) + a(idx, k, l, m, n, o, p) = Rand::draw(gen, begin, end); + } + } + rand_pool.free_state(gen); + } +}; + +template <class ViewType, class RandomPool, int loops, class IndexType> +struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 8, + IndexType> { + using execution_space = typename ViewType::execution_space; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin, end; + + using Rand = rand<typename RandomPool::generator_type, + typename ViewType::non_const_value_type>; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename 
ViewType::const_value_type begin_, + typename ViewType::const_value_type end_) + : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(IndexType i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for (IndexType j = 0; j < loops; j++) { + const IndexType idx = i * loops + j; + if (idx < static_cast<IndexType>(a.extent(0))) { + for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++) + for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++) + for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++) + for (IndexType n = 0; n < static_cast<IndexType>(a.extent(4)); + n++) + for (IndexType o = 0; o < static_cast<IndexType>(a.extent(5)); + o++) + for (IndexType p = 0; p < static_cast<IndexType>(a.extent(6)); + p++) + for (IndexType q = 0; + q < static_cast<IndexType>(a.extent(7)); q++) + a(idx, k, l, m, n, o, p, q) = Rand::draw(gen, begin, end); + } + } + rand_pool.free_state(gen); + } +}; + +} // namespace Impl + +template <class ViewType, class RandomPool, class IndexType = int64_t> +void fill_random(ViewType a, RandomPool g, + typename ViewType::const_value_type range) { + int64_t LDA = a.extent(0); + if (LDA > 0) + parallel_for("Kokkos::fill_random", (LDA + 127) / 128, + Impl::fill_random_functor_range<ViewType, RandomPool, 128, + ViewType::Rank, IndexType>( + a, g, range)); +} + +template <class ViewType, class RandomPool, class IndexType = int64_t> +void fill_random(ViewType a, RandomPool g, + typename ViewType::const_value_type begin, + typename ViewType::const_value_type end) { + int64_t LDA = a.extent(0); + if (LDA > 0) + parallel_for("Kokkos::fill_random", (LDA + 127) / 128, + Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128, + ViewType::Rank, IndexType>( + a, g, begin, end)); +} +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp new file mode 
100644 index 0000000000000000000000000000000000000000..d17c02776ff5653045d9259d7a4fae2207546e21 --- /dev/null +++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -0,0 +1,564 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SORT_HPP_ +#define KOKKOS_SORT_HPP_ + +#include <Kokkos_Core.hpp> + +#include <algorithm> + +namespace Kokkos { + +namespace Impl { + +template <class DstViewType, class SrcViewType, int Rank = DstViewType::Rank> +struct CopyOp; + +template <class DstViewType, class SrcViewType> +struct CopyOp<DstViewType, SrcViewType, 1> { + KOKKOS_INLINE_FUNCTION + static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src, + size_t i_src) { + dst(i_dst) = src(i_src); + } +}; + +template <class DstViewType, class SrcViewType> +struct CopyOp<DstViewType, SrcViewType, 2> { + KOKKOS_INLINE_FUNCTION + static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src, + size_t i_src) { + for (int j = 0; j < (int)dst.extent(1); j++) dst(i_dst, j) = src(i_src, j); + } +}; + +template <class DstViewType, class SrcViewType> +struct CopyOp<DstViewType, SrcViewType, 3> { + KOKKOS_INLINE_FUNCTION + static void copy(DstViewType const& dst, size_t i_dst, SrcViewType const& src, + size_t i_src) { + for (int j = 0; j < dst.extent(1); j++) + for (int k = 0; k < dst.extent(2); k++) + dst(i_dst, j, k) = src(i_src, j, k); + } +}; +} // namespace Impl + +//---------------------------------------------------------------------------- + +template <class KeyViewType, class BinSortOp, + class Space = typename KeyViewType::device_type, + class SizeType = typename KeyViewType::memory_space::size_type> +class BinSort { + public: + template <class DstViewType, class SrcViewType> + struct copy_functor { + using src_view_type = typename SrcViewType::const_type; + + using copy_op = Impl::CopyOp<DstViewType, src_view_type>; + + DstViewType dst_values; + src_view_type src_values; + int dst_offset; + + copy_functor(DstViewType const& dst_values_, int const& dst_offset_, + SrcViewType const& src_values_) + : 
dst_values(dst_values_), + src_values(src_values_), + dst_offset(dst_offset_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { + copy_op::copy(dst_values, i + dst_offset, src_values, i); + } + }; + + template <class DstViewType, class PermuteViewType, class SrcViewType> + struct copy_permute_functor { + // If a Kokkos::View then can generate constant random access + // otherwise can only use the constant type. + + using src_view_type = typename std::conditional< + Kokkos::is_view<SrcViewType>::value, + Kokkos::View<typename SrcViewType::const_data_type, + typename SrcViewType::array_layout, + typename SrcViewType::device_type, + Kokkos::MemoryTraits<Kokkos::RandomAccess> >, + typename SrcViewType::const_type>::type; + + using perm_view_type = typename PermuteViewType::const_type; + + using copy_op = Impl::CopyOp<DstViewType, src_view_type>; + + DstViewType dst_values; + perm_view_type sort_order; + src_view_type src_values; + int src_offset; + + copy_permute_functor(DstViewType const& dst_values_, + PermuteViewType const& sort_order_, + SrcViewType const& src_values_, int const& src_offset_) + : dst_values(dst_values_), + sort_order(sort_order_), + src_values(src_values_), + src_offset(src_offset_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { + copy_op::copy(dst_values, i, src_values, src_offset + sort_order(i)); + } + }; + + using execution_space = typename Space::execution_space; + using bin_op_type = BinSortOp; + + struct bin_count_tag {}; + struct bin_offset_tag {}; + struct bin_binning_tag {}; + struct bin_sort_bins_tag {}; + + public: + using size_type = SizeType; + using value_type = size_type; + + using offset_type = Kokkos::View<size_type*, Space>; + using bin_count_type = Kokkos::View<const int*, Space>; + + using const_key_view_type = typename KeyViewType::const_type; + + // If a Kokkos::View then can generate constant random access + // otherwise can only use the constant type. 
+ + using const_rnd_key_view_type = typename std::conditional< + Kokkos::is_view<KeyViewType>::value, + Kokkos::View<typename KeyViewType::const_data_type, + typename KeyViewType::array_layout, + typename KeyViewType::device_type, + Kokkos::MemoryTraits<Kokkos::RandomAccess> >, + const_key_view_type>::type; + + using non_const_key_scalar = typename KeyViewType::non_const_value_type; + using const_key_scalar = typename KeyViewType::const_value_type; + + using bin_count_atomic_type = + Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> >; + + private: + const_key_view_type keys; + const_rnd_key_view_type keys_rnd; + + public: + BinSortOp bin_op; + offset_type bin_offsets; + bin_count_atomic_type bin_count_atomic; + bin_count_type bin_count_const; + offset_type sort_order; + + int range_begin; + int range_end; + bool sort_within_bins; + + public: + BinSort() = default; + + //---------------------------------------- + // Constructor: takes the keys, the binning_operator and optionally whether to + // sort within bins (default false) + BinSort(const_key_view_type keys_, int range_begin_, int range_end_, + BinSortOp bin_op_, bool sort_within_bins_ = false) + : keys(keys_), + keys_rnd(keys_), + bin_op(bin_op_), + bin_offsets(), + bin_count_atomic(), + bin_count_const(), + sort_order(), + range_begin(range_begin_), + range_end(range_end_), + sort_within_bins(sort_within_bins_) { + bin_count_atomic = Kokkos::View<int*, Space>( + "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins()); + bin_count_const = bin_count_atomic; + bin_offsets = + offset_type(view_alloc(WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::bin_offsets"), + bin_op.max_bins()); + sort_order = + offset_type(view_alloc(WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::sort_order"), + range_end - range_begin); + } + + BinSort(const_key_view_type keys_, BinSortOp bin_op_, + bool sort_within_bins_ = false) + : BinSort(keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) 
{} + + //---------------------------------------- + // Create the permutation vector, the bin_offset array and the bin_count + // array. Can be called again if keys changed + void create_permute_vector() { + const size_t len = range_end - range_begin; + Kokkos::parallel_for( + "Kokkos::Sort::BinCount", + Kokkos::RangePolicy<execution_space, bin_count_tag>(0, len), *this); + Kokkos::parallel_scan("Kokkos::Sort::BinOffset", + Kokkos::RangePolicy<execution_space, bin_offset_tag>( + 0, bin_op.max_bins()), + *this); + + Kokkos::deep_copy(bin_count_atomic, 0); + Kokkos::parallel_for( + "Kokkos::Sort::BinBinning", + Kokkos::RangePolicy<execution_space, bin_binning_tag>(0, len), *this); + + if (sort_within_bins) + Kokkos::parallel_for( + "Kokkos::Sort::BinSort", + Kokkos::RangePolicy<execution_space, bin_sort_bins_tag>( + 0, bin_op.max_bins()), + *this); + } + + // Sort a subset of a view with respect to the first dimension using the + // permutation array + template <class ValuesViewType> + void sort(ValuesViewType const& values, int values_range_begin, + int values_range_end) const { + using scratch_view_type = + Kokkos::View<typename ValuesViewType::data_type, + typename ValuesViewType::array_layout, + typename ValuesViewType::device_type>; + + const size_t len = range_end - range_begin; + const size_t values_len = values_range_end - values_range_begin; + if (len != values_len) { + Kokkos::abort( + "BinSort::sort: values range length != permutation vector length"); + } + + scratch_view_type sorted_values( + view_alloc(WithoutInitializing, + "Kokkos::SortImpl::BinSortFunctor::sorted_values"), + values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 1 ? values.extent(1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 2 ? values.extent(2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 3 ? values.extent(3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 4 ? 
values.extent(4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 5 ? values.extent(5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 6 ? values.extent(6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + values.rank_dynamic > 7 ? values.extent(7) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG); + + { + copy_permute_functor<scratch_view_type /* DstViewType */ + , + offset_type /* PermuteViewType */ + , + ValuesViewType /* SrcViewType */ + > + functor(sorted_values, sort_order, values, + values_range_begin - range_begin); + + parallel_for("Kokkos::Sort::CopyPermute", + Kokkos::RangePolicy<execution_space>(0, len), functor); + } + + { + copy_functor<ValuesViewType, scratch_view_type> functor( + values, range_begin, sorted_values); + + parallel_for("Kokkos::Sort::Copy", + Kokkos::RangePolicy<execution_space>(0, len), functor); + } + + execution_space().fence(); + } + + template <class ValuesViewType> + void sort(ValuesViewType const& values) const { + this->sort(values, 0, /*values.extent(0)*/ range_end - range_begin); + } + + // Get the permutation vector + KOKKOS_INLINE_FUNCTION + offset_type get_permute_vector() const { return sort_order; } + + // Get the start offsets for each bin + KOKKOS_INLINE_FUNCTION + offset_type get_bin_offsets() const { return bin_offsets; } + + // Get the count for each bin + KOKKOS_INLINE_FUNCTION + bin_count_type get_bin_count() const { return bin_count_const; } + + public: + KOKKOS_INLINE_FUNCTION + void operator()(const bin_count_tag& /*tag*/, const int i) const { + const int j = range_begin + i; + bin_count_atomic(bin_op.bin(keys, j))++; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const bin_offset_tag& /*tag*/, const int i, + value_type& offset, const bool& final) const { + if (final) { + bin_offsets(i) = offset; + } + offset += bin_count_const(i); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const bin_binning_tag& /*tag*/, const int i) const { + const int j = range_begin + i; + const int bin = bin_op.bin(keys, j); + const int count 
= bin_count_atomic(bin)++; + + sort_order(bin_offsets(bin) + count) = j; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const bin_sort_bins_tag& /*tag*/, const int i) const { + auto bin_size = bin_count_const(i); + if (bin_size <= 1) return; + int upper_bound = bin_offsets(i) + bin_size; + bool sorted = false; + while (!sorted) { + sorted = true; + int old_idx = sort_order(bin_offsets(i)); + int new_idx = 0; + for (int k = bin_offsets(i) + 1; k < upper_bound; k++) { + new_idx = sort_order(k); + + if (!bin_op(keys_rnd, old_idx, new_idx)) { + sort_order(k - 1) = new_idx; + sort_order(k) = old_idx; + sorted = false; + } else { + old_idx = new_idx; + } + } + upper_bound--; + } + } +}; + +//---------------------------------------------------------------------------- + +template <class KeyViewType> +struct BinOp1D { + int max_bins_; + double mul_; + typename KeyViewType::const_value_type range_; + typename KeyViewType::const_value_type min_; + + BinOp1D() + : max_bins_(0), + mul_(0.0), + range_(typename KeyViewType::const_value_type()), + min_(typename KeyViewType::const_value_type()) {} + + // Construct BinOp with number of bins, minimum value and maxuimum value + BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, + typename KeyViewType::const_value_type max) + : max_bins_(max_bins__ + 1), + mul_(1.0 * max_bins__ / (max - min)), + range_(max - min), + min_(min) {} + + // Determine bin index from key value + template <class ViewType> + KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const { + return int(mul_ * (keys(i) - min_)); + } + + // Return maximum bin index + 1 + KOKKOS_INLINE_FUNCTION + int max_bins() const { return max_bins_; } + + // Compare to keys within a bin if true new_val will be put before old_val + template <class ViewType, typename iType1, typename iType2> + KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1, + iType2& i2) const { + return keys(i1) < keys(i2); + } +}; + +template <class KeyViewType> 
+struct BinOp3D { + int max_bins_[3]; + double mul_[3]; + typename KeyViewType::non_const_value_type range_[3]; + typename KeyViewType::non_const_value_type min_[3]; + + BinOp3D() = default; + + BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + typename KeyViewType::const_value_type max[]) { + max_bins_[0] = max_bins__[0]; + max_bins_[1] = max_bins__[1]; + max_bins_[2] = max_bins__[2]; + mul_[0] = 1.0 * max_bins__[0] / (max[0] - min[0]); + mul_[1] = 1.0 * max_bins__[1] / (max[1] - min[1]); + mul_[2] = 1.0 * max_bins__[2] / (max[2] - min[2]); + range_[0] = max[0] - min[0]; + range_[1] = max[1] - min[1]; + range_[2] = max[2] - min[2]; + min_[0] = min[0]; + min_[1] = min[1]; + min_[2] = min[2]; + } + + template <class ViewType> + KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const { + return int((((int(mul_[0] * (keys(i, 0) - min_[0])) * max_bins_[1]) + + int(mul_[1] * (keys(i, 1) - min_[1]))) * + max_bins_[2]) + + int(mul_[2] * (keys(i, 2) - min_[2]))); + } + + KOKKOS_INLINE_FUNCTION + int max_bins() const { return max_bins_[0] * max_bins_[1] * max_bins_[2]; } + + template <class ViewType, typename iType1, typename iType2> + KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1, + iType2& i2) const { + if (keys(i1, 0) > keys(i2, 0)) + return true; + else if (keys(i1, 0) == keys(i2, 0)) { + if (keys(i1, 1) > keys(i2, 1)) + return true; + else if (keys(i1, 1) == keys(i2, 1)) { + if (keys(i1, 2) > keys(i2, 2)) return true; + } + } + return false; + } +}; + +namespace Impl { + +template <class ViewType> +bool try_std_sort(ViewType view) { + bool possible = true; + size_t stride[8] = {view.stride_0(), view.stride_1(), view.stride_2(), + view.stride_3(), view.stride_4(), view.stride_5(), + view.stride_6(), view.stride_7()}; + possible = possible && + std::is_same<typename ViewType::memory_space, HostSpace>::value; + possible = possible && (ViewType::Rank == 1); + possible = possible && (stride[0] == 1); + if (possible) { + 
std::sort(view.data(), view.data() + view.extent(0)); + } + return possible; +} + +template <class ViewType> +struct min_max_functor { + using minmax_scalar = + Kokkos::MinMaxScalar<typename ViewType::non_const_value_type>; + + ViewType view; + min_max_functor(const ViewType& view_) : view(view_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t& i, minmax_scalar& minmax) const { + if (view(i) < minmax.min_val) minmax.min_val = view(i); + if (view(i) > minmax.max_val) minmax.max_val = view(i); + } +}; + +} // namespace Impl + +template <class ViewType> +void sort(ViewType const& view, bool const always_use_kokkos_sort = false) { + if (!always_use_kokkos_sort) { + if (Impl::try_std_sort(view)) return; + } + using CompType = BinOp1D<ViewType>; + + Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result; + Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result); + parallel_reduce("Kokkos::Sort::FindExtent", + Kokkos::RangePolicy<typename ViewType::execution_space>( + 0, view.extent(0)), + Impl::min_max_functor<ViewType>(view), reducer); + if (result.min_val == result.max_val) return; + BinSort<ViewType, CompType> bin_sort( + view, CompType(view.extent(0) / 2, result.min_val, result.max_val), true); + bin_sort.create_permute_vector(); + bin_sort.sort(view); +} + +template <class ViewType> +void sort(ViewType view, size_t const begin, size_t const end) { + using range_policy = Kokkos::RangePolicy<typename ViewType::execution_space>; + using CompType = BinOp1D<ViewType>; + + Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result; + Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result); + + parallel_reduce("Kokkos::Sort::FindExtent", range_policy(begin, end), + Impl::min_max_functor<ViewType>(view), reducer); + + if (result.min_val == result.max_val) return; + + BinSort<ViewType, CompType> bin_sort( + view, begin, end, + CompType((end - begin) / 2, result.min_val, result.max_val), true); + + 
bin_sort.create_permute_vector(); + bin_sort.sort(view, begin, end); +} + +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9109837985a91ad14245133682af15aca59be503 --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -0,0 +1,75 @@ + +#Leave these here for now - I don't need transitive deps anyway +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) + + +SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest) +KOKKOS_INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR}) + +# mfh 03 Nov 2017: The gtest library used here must have a different +# name than that of the gtest library built in KokkosCore. We can't +# just refer to the library in KokkosCore's tests, because it's +# possible to build only (e.g.,) KokkosAlgorithms tests, without +# building KokkosCore tests. 
+ + +KOKKOS_ADD_TEST_LIBRARY( + kokkosalgorithms_gtest + HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h + SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc +) + +# avoid deprecation warnings from MSVC +TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0) + +IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_14) +ENDIF() + +# Suppress clang-tidy diagnostics on code that we do not have control over +IF(CMAKE_CXX_CLANG_TIDY) + SET_TARGET_PROPERTIES(kokkosalgorithms_gtest PROPERTIES CXX_CLANG_TIDY "") +ENDIF() + +SET(ALGORITHM UnitTestMain.cpp) + +IF(Kokkos_ENABLE_OPENMP) + LIST(APPEND ALGORITHM_SOURCES + TestOpenMP_Sort1D.cpp + TestOpenMP_Sort3D.cpp + TestOpenMP_SortDynamicView.cpp + ) +ENDIF() + +foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL) + # Because there is always an exception to the rule + if(Tag STREQUAL "Threads") + set(DEVICE "PTHREAD") + else() + string(TOUPPER ${Tag} DEVICE) + endif() + + if(Kokkos_ENABLE_${DEVICE}) + set(dir ${CMAKE_CURRENT_BINARY_DIR}) + set(file ${dir}/Test${Tag}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
+ file(WRITE ${dir}/dummy.cpp + "#include <Test${Tag}_Category.hpp>\n" + "#include <TestRandomCommon.hpp>\n" + "#include <TestSortCommon.hpp>\n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ALGORITHM_SOURCES ${file}) + endif() +endforeach() + +KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest + SOURCES + UnitTestMain.cpp + ${ALGORITHM_SOURCES} +) diff --git a/packages/kokkos/algorithms/unit_tests/Makefile b/packages/kokkos/algorithms/unit_tests/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..dd0aa87de0b2c76fe76d03f8ea77092833dd9f63 --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/Makefile @@ -0,0 +1,121 @@ +KOKKOS_PATH = ../.. + +GTEST_PATH = ../../TPL/gtest + +vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests + +default: build_all + echo "End Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) + CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper +else + CXX = g++ +endif + +CXXFLAGS = -O3 +LINK ?= $(CXX) +LDFLAGS ?= +override LDFLAGS += -lpthread + +include $(KOKKOS_PATH)/Makefile.kokkos + +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files + +TEST_TARGETS = +TARGETS = + +tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ + $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ + $(shell echo "\#include <Test"${device}"_Category.hpp>" > Test$(device).cpp); \ + $(shell echo "\#include <TestRandomCommon.hpp>" >> Test$(device).cpp); \ + $(shell echo "\#include <TestSortCommon.hpp>" >> Test$(device).cpp); \ + ) \ +) + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_Cuda + TEST_TARGETS += test-cuda +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + OBJ_HIP = TestHIP.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_HIP + TEST_TARGETS += test-hip +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + OBJ_THREADS = TestThreads.o UnitTestMain.o 
gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_Threads + TEST_TARGETS += test-threads +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + OBJ_OPENMP = TestOpenMP.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_OpenMP + TEST_TARGETS += test-openmp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + OBJ_HPX = TestHPX.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_HPX + TEST_TARGETS += test-hpx +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_Serial + TEST_TARGETS += test-serial +endif + +KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda + +KokkosAlgorithms_UnitTest_HIP: $(OBJ_HIP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_HIP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_HIP + +KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads + +KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_OpenMP + +KokkosAlgorithms_UnitTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_HPX + +KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Serial + +test-cuda: KokkosAlgorithms_UnitTest_Cuda + ./KokkosAlgorithms_UnitTest_Cuda + +test-hip: KokkosAlgorithms_UnitTest_HIP + 
./KokkosAlgorithms_UnitTest_HIP + +test-threads: KokkosAlgorithms_UnitTest_Threads + ./KokkosAlgorithms_UnitTest_Threads + +test-openmp: KokkosAlgorithms_UnitTest_OpenMP + ./KokkosAlgorithms_UnitTest_OpenMP + +test-hpx: KokkosAlgorithms_UnitTest_HPX + ./KokkosAlgorithms_UnitTest_HPX + +test-serial: KokkosAlgorithms_UnitTest_Serial + ./KokkosAlgorithms_UnitTest_Serial + +build_all: $(TARGETS) + +test: $(TEST_TARGETS) + +clean: kokkos-clean + rm -f *.o $(TARGETS) + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + +gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp b/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4a5839f0c80a5298c14ff91422d74664b9dd95bd --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp @@ -0,0 +1,67 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_OPENMP + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +//---------------------------------------------------------------------------- +#include <TestRandom.hpp> +#include <TestSort.hpp> +#include <iomanip> + +namespace Test { + +TEST(openmp, SortUnsigned1D) { + Impl::test_1D_sort<Kokkos::OpenMP, unsigned>(171); +} + +TEST(openmp, SortIssue1160) { Impl::test_issue_1160_sort<Kokkos::OpenMP>(); } + +} // namespace Test +#else +void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {} +#endif diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort3D.cpp b/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort3D.cpp new file mode 100644 index 0000000000000000000000000000000000000000..127d911d7ca3856957646698b431089b5deb2caa --- /dev/null +++ 
b/packages/kokkos/algorithms/unit_tests/TestOpenMP_Sort3D.cpp @@ -0,0 +1,65 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_OPENMP + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +//---------------------------------------------------------------------------- +#include <TestRandom.hpp> +#include <TestSort.hpp> +#include <iomanip> + +namespace Test { + +TEST(openmp, SortUnsigned3D) { + Impl::test_3D_sort<Kokkos::OpenMP, unsigned>(171); +} + +} // namespace Test +#else +void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {} +#endif diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP_SortDynamicView.cpp b/packages/kokkos/algorithms/unit_tests/TestOpenMP_SortDynamicView.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3dc88540443f7af219b18b85425408afbc1fda6e --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/TestOpenMP_SortDynamicView.cpp @@ -0,0 +1,65 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_OPENMP + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +//---------------------------------------------------------------------------- +#include <TestRandom.hpp> +#include <TestSort.hpp> +#include <iomanip> + +namespace Test { + +TEST(openmp, SortUnsignedDynamicView) { + Impl::test_dynamic_view_sort<Kokkos::OpenMP, unsigned>(171); +} + +} // namespace Test +#else +void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {} +#endif diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1f14875096dd2fbd0bebf4feea796d4c6ccd79f0 --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -0,0 +1,524 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_DUALVIEW_HPP +#define KOKKOS_TEST_DUALVIEW_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <cstdlib> +#include <cstdio> +#include <impl/Kokkos_Timer.hpp> +#include <Kokkos_Core.hpp> +#include <Kokkos_Random.hpp> +#include <cmath> +#include <chrono> + +namespace Test { + +namespace Impl { + +// This test runs the random number generators and uses some statistic tests to +// check the 'goodness' of the random numbers: +// (i) mean: the mean is expected to be 0.5*RAND_MAX +// (ii) variance: the variance is 1/3*mean*mean +// (iii) covariance: the covariance is 0 +// (iv) 1-tupledistr: the mean, variance and covariance of a 1D Histrogram +// of random numbers (v) 3-tupledistr: the mean, variance and covariance of +// a 3D Histrogram of random numbers + +#define HIST_DIM3D 24 +#define HIST_DIM1D (HIST_DIM3D * HIST_DIM3D * HIST_DIM3D) + +struct RandomProperties { + uint64_t count; + double mean; + double variance; + double covariance; + double min; + double max; + + KOKKOS_INLINE_FUNCTION + RandomProperties() { + count = 0; + mean = 0.0; + variance = 0.0; + covariance = 0.0; + min = 1e64; + max = -1e64; + } + + KOKKOS_INLINE_FUNCTION + RandomProperties& operator+=(const RandomProperties& add) { + count += add.count; + mean += add.mean; + variance += add.variance; + covariance += add.covariance; + min = add.min < min ? add.min : min; + max = add.max > max ? add.max : max; + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator+=(const volatile RandomProperties& add) volatile { + count += add.count; + mean += add.mean; + variance += add.variance; + covariance += add.covariance; + min = add.min < min ? add.min : min; + max = add.max > max ? 
add.max : max; + } +}; + +template <class GeneratorPool, class Scalar> +struct test_random_functor { + using rnd_type = typename GeneratorPool::generator_type; + + using value_type = RandomProperties; + using device_type = typename GeneratorPool::device_type; + + GeneratorPool rand_pool; + const double mean; + + // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define + // an exclusive upper bound on the range of random numbers that + // draw() can generate. However, for the float specialization, some + // implementations might violate this upper bound, due to rounding + // error. Just in case, we leave an extra space at the end of each + // dimension, in the View types below. + using type_1d = + Kokkos::View<int[HIST_DIM1D + 1], typename GeneratorPool::device_type>; + type_1d density_1d; + using type_3d = + Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1], + typename GeneratorPool::device_type>; + type_3d density_3d; + + test_random_functor(GeneratorPool rand_pool_, type_1d d1d, type_3d d3d) + : rand_pool(rand_pool_), + mean(0.5 * Kokkos::rand<rnd_type, Scalar>::max()), + density_1d(d1d), + density_3d(d3d) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int /*i*/, RandomProperties& prop) const { + using Kokkos::atomic_fetch_add; + + rnd_type rand_gen = rand_pool.get_state(); + for (int k = 0; k < 1024; ++k) { + const Scalar tmp = Kokkos::rand<rnd_type, Scalar>::draw(rand_gen); + prop.count++; + prop.mean += tmp; + prop.variance += (tmp - mean) * (tmp - mean); + const Scalar tmp2 = Kokkos::rand<rnd_type, Scalar>::draw(rand_gen); + prop.count++; + prop.mean += tmp2; + prop.variance += (tmp2 - mean) * (tmp2 - mean); + prop.covariance += (tmp - mean) * (tmp2 - mean); + const Scalar tmp3 = Kokkos::rand<rnd_type, Scalar>::draw(rand_gen); + prop.count++; + prop.mean += tmp3; + prop.variance += (tmp3 - mean) * (tmp3 - mean); + prop.covariance += (tmp2 - mean) * (tmp3 - mean); + + // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to + 
// define an exclusive upper bound on the range of random + // numbers that draw() can generate. However, for the float + // specialization, some implementations might violate this upper + // bound, due to rounding error. Just in case, we have left an + // extra space at the end of each dimension of density_1d and + // density_3d. + // + // Please note that those extra entries might not get counted in + // the histograms. However, if Kokkos::rand is broken and only + // returns values of max(), the histograms will still catch this + // indirectly, since none of the other values will be filled in. + + const Scalar theMax = Kokkos::rand<rnd_type, Scalar>::max(); + + const uint64_t ind1_1d = + static_cast<uint64_t>(1.0 * HIST_DIM1D * tmp / theMax); + const uint64_t ind2_1d = + static_cast<uint64_t>(1.0 * HIST_DIM1D * tmp2 / theMax); + const uint64_t ind3_1d = + static_cast<uint64_t>(1.0 * HIST_DIM1D * tmp3 / theMax); + + const uint64_t ind1_3d = + static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp / theMax); + const uint64_t ind2_3d = + static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp2 / theMax); + const uint64_t ind3_3d = + static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp3 / theMax); + + atomic_fetch_add(&density_1d(ind1_1d), 1); + atomic_fetch_add(&density_1d(ind2_1d), 1); + atomic_fetch_add(&density_1d(ind3_1d), 1); + atomic_fetch_add(&density_3d(ind1_3d, ind2_3d, ind3_3d), 1); + } + rand_pool.free_state(rand_gen); + } +}; + +template <class DeviceType> +struct test_histogram1d_functor { + using value_type = RandomProperties; + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + + // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define + // an exclusive upper bound on the range of random numbers that + // draw() can generate. However, for the float specialization, some + // implementations might violate this upper bound, due to rounding + // error. 
Just in case, we leave an extra space at the end of each + // dimension, in the View type below. + using type_1d = Kokkos::View<int[HIST_DIM1D + 1], memory_space>; + type_1d density_1d; + double mean; + + test_histogram1d_functor(type_1d d1d, int num_draws) + : density_1d(d1d), mean(1.0 * num_draws / HIST_DIM1D * 3) {} + + KOKKOS_INLINE_FUNCTION void operator()( + const typename memory_space::size_type i, RandomProperties& prop) const { + using size_type = typename memory_space::size_type; + const double count = density_1d(i); + prop.mean += count; + prop.variance += 1.0 * (count - mean) * (count - mean); + // prop.covariance += 1.0*count*count; + prop.min = count < prop.min ? count : prop.min; + prop.max = count > prop.max ? count : prop.max; + if (i < static_cast<size_type>(HIST_DIM1D - 1)) { + prop.covariance += (count - mean) * (density_1d(i + 1) - mean); + } + } +}; + +template <class DeviceType> +struct test_histogram3d_functor { + using value_type = RandomProperties; + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + + // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define + // an exclusive upper bound on the range of random numbers that + // draw() can generate. However, for the float specialization, some + // implementations might violate this upper bound, due to rounding + // error. Just in case, we leave an extra space at the end of each + // dimension, in the View type below. 
+ using type_3d = + Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1], + memory_space>; + type_3d density_3d; + double mean; + + test_histogram3d_functor(type_3d d3d, int num_draws) + : density_3d(d3d), mean(1.0 * num_draws / HIST_DIM1D) {} + + KOKKOS_INLINE_FUNCTION void operator()( + const typename memory_space::size_type i, RandomProperties& prop) const { + using size_type = typename memory_space::size_type; + const double count = density_3d( + i / (HIST_DIM3D * HIST_DIM3D), + (i % (HIST_DIM3D * HIST_DIM3D)) / HIST_DIM3D, i % HIST_DIM3D); + prop.mean += count; + prop.variance += (count - mean) * (count - mean); + if (i < static_cast<size_type>(HIST_DIM1D - 1)) { + const double count_next = + density_3d((i + 1) / (HIST_DIM3D * HIST_DIM3D), + ((i + 1) % (HIST_DIM3D * HIST_DIM3D)) / HIST_DIM3D, + (i + 1) % HIST_DIM3D); + prop.covariance += (count - mean) * (count_next - mean); + } + } +}; + +// +// Templated test that uses the above functors. +// +template <class RandomGenerator, class Scalar> +struct test_random_scalar { + using rnd_type = typename RandomGenerator::generator_type; + + int pass_mean, pass_var, pass_covar; + int pass_hist1d_mean, pass_hist1d_var, pass_hist1d_covar; + int pass_hist3d_mean, pass_hist3d_var, pass_hist3d_covar; + + test_random_scalar( + typename test_random_functor<RandomGenerator, int>::type_1d& density_1d, + typename test_random_functor<RandomGenerator, int>::type_3d& density_3d, + RandomGenerator& pool, unsigned int num_draws) { + using Kokkos::parallel_reduce; + using std::cout; + using std::endl; + + { + cout << " -- Testing randomness properties" << endl; + + RandomProperties result; + using functor_type = test_random_functor<RandomGenerator, Scalar>; + parallel_reduce(num_draws / 1024, + functor_type(pool, density_1d, density_3d), result); + + // printf("Result: %lf %lf + // %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2); + double tolerance = 1.6 * std::sqrt(1.0 / 
num_draws); + double mean_expect = 0.5 * Kokkos::rand<rnd_type, Scalar>::max(); + double variance_expect = 1.0 / 3.0 * mean_expect * mean_expect; + double mean_eps = mean_expect / (result.mean / num_draws / 3) - 1.0; + double variance_eps = + variance_expect / (result.variance / num_draws / 3) - 1.0; + double covariance_eps = + result.covariance / num_draws / 2 / variance_expect; + pass_mean = ((-tolerance < mean_eps) && (tolerance > mean_eps)) ? 1 : 0; + pass_var = ((-1.5 * tolerance < variance_eps) && + (1.5 * tolerance > variance_eps)) + ? 1 + : 0; + pass_covar = ((-2.0 * tolerance < covariance_eps) && + (2.0 * tolerance > covariance_eps)) + ? 1 + : 0; + cout << "Pass: " << pass_mean << " " << pass_var << " " << mean_eps << " " + << variance_eps << " " << covariance_eps << " || " << tolerance + << endl; + } + { + cout << " -- Testing 1-D histogram" << endl; + + RandomProperties result; + using functor_type = + test_histogram1d_functor<typename RandomGenerator::device_type>; + parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result); + + double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); + double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D; + double variance_expect = + 1.0 * num_draws * 3 / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D); + double covariance_expect = -1.0 * num_draws * 3 / HIST_DIM1D / HIST_DIM1D; + double mean_eps = mean_expect / (result.mean / HIST_DIM1D) - 1.0; + double variance_eps = + variance_expect / (result.variance / HIST_DIM1D) - 1.0; + double covariance_eps = + (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect; + pass_hist1d_mean = ((-0.0001 < mean_eps) && (0.0001 > mean_eps)) ? 1 : 0; + pass_hist1d_var = + ((-0.07 < variance_eps) && (0.07 > variance_eps)) ? 1 : 0; + pass_hist1d_covar = + ((-0.06 < covariance_eps) && (0.06 > covariance_eps)) ? 
1 : 0; + + cout << "Density 1D: " << mean_eps << " " << variance_eps << " " + << (result.covariance / HIST_DIM1D / HIST_DIM1D) << " || " + << tolerance << " " << result.min << " " << result.max << " || " + << result.variance / HIST_DIM1D << " " + << 1.0 * num_draws * 3 / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D) + << " || " << result.covariance / HIST_DIM1D << " " + << -1.0 * num_draws * 3 / HIST_DIM1D / HIST_DIM1D << endl; + } + { + cout << " -- Testing 3-D histogram" << endl; + + RandomProperties result; + using functor_type = + test_histogram3d_functor<typename RandomGenerator::device_type>; + parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result); + + double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); + double mean_expect = 1.0 * num_draws / HIST_DIM1D; + double variance_expect = + 1.0 * num_draws / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D); + double covariance_expect = -1.0 * num_draws / HIST_DIM1D / HIST_DIM1D; + double mean_eps = mean_expect / (result.mean / HIST_DIM1D) - 1.0; + double variance_eps = + variance_expect / (result.variance / HIST_DIM1D) - 1.0; + double covariance_eps = + (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect; + pass_hist3d_mean = + ((-tolerance < mean_eps) && (tolerance > mean_eps)) ? 1 : 0; + pass_hist3d_var = ((-1.2 * tolerance < variance_eps) && + (1.2 * tolerance > variance_eps)) + ? 1 + : 0; + pass_hist3d_covar = + ((-tolerance < covariance_eps) && (tolerance > covariance_eps)) ? 
1 + : 0; + + cout << "Density 3D: " << mean_eps << " " << variance_eps << " " + << result.covariance / HIST_DIM1D / HIST_DIM1D << " || " << tolerance + << " " << result.min << " " << result.max << endl; + } + } +}; + +template <class RandomGenerator> +void test_random(unsigned int num_draws) { + using std::cout; + using std::endl; + typename test_random_functor<RandomGenerator, int>::type_1d density_1d("D1d"); + typename test_random_functor<RandomGenerator, int>::type_3d density_3d("D3d"); + + uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + cout << "Test Seed:" << ticks << endl; + + RandomGenerator pool(ticks); + + cout << "Test Scalar=int" << endl; + test_random_scalar<RandomGenerator, int> test_int(density_1d, density_3d, + pool, num_draws); + ASSERT_EQ(test_int.pass_mean, 1); + ASSERT_EQ(test_int.pass_var, 1); + ASSERT_EQ(test_int.pass_covar, 1); + ASSERT_EQ(test_int.pass_hist1d_mean, 1); + ASSERT_EQ(test_int.pass_hist1d_var, 1); + ASSERT_EQ(test_int.pass_hist1d_covar, 1); + ASSERT_EQ(test_int.pass_hist3d_mean, 1); + ASSERT_EQ(test_int.pass_hist3d_var, 1); + ASSERT_EQ(test_int.pass_hist3d_covar, 1); + deep_copy(density_1d, 0); + deep_copy(density_3d, 0); + + cout << "Test Scalar=unsigned int" << endl; + test_random_scalar<RandomGenerator, unsigned int> test_uint( + density_1d, density_3d, pool, num_draws); + ASSERT_EQ(test_uint.pass_mean, 1); + ASSERT_EQ(test_uint.pass_var, 1); + ASSERT_EQ(test_uint.pass_covar, 1); + ASSERT_EQ(test_uint.pass_hist1d_mean, 1); + ASSERT_EQ(test_uint.pass_hist1d_var, 1); + ASSERT_EQ(test_uint.pass_hist1d_covar, 1); + ASSERT_EQ(test_uint.pass_hist3d_mean, 1); + ASSERT_EQ(test_uint.pass_hist3d_var, 1); + ASSERT_EQ(test_uint.pass_hist3d_covar, 1); + deep_copy(density_1d, 0); + deep_copy(density_3d, 0); + + cout << "Test Scalar=int64_t" << endl; + test_random_scalar<RandomGenerator, int64_t> test_int64( + density_1d, density_3d, pool, num_draws); + ASSERT_EQ(test_int64.pass_mean, 1); + 
ASSERT_EQ(test_int64.pass_var, 1); + ASSERT_EQ(test_int64.pass_covar, 1); + ASSERT_EQ(test_int64.pass_hist1d_mean, 1); + ASSERT_EQ(test_int64.pass_hist1d_var, 1); + ASSERT_EQ(test_int64.pass_hist1d_covar, 1); + ASSERT_EQ(test_int64.pass_hist3d_mean, 1); + ASSERT_EQ(test_int64.pass_hist3d_var, 1); + ASSERT_EQ(test_int64.pass_hist3d_covar, 1); + deep_copy(density_1d, 0); + deep_copy(density_3d, 0); + + cout << "Test Scalar=uint64_t" << endl; + test_random_scalar<RandomGenerator, uint64_t> test_uint64( + density_1d, density_3d, pool, num_draws); + ASSERT_EQ(test_uint64.pass_mean, 1); + ASSERT_EQ(test_uint64.pass_var, 1); + ASSERT_EQ(test_uint64.pass_covar, 1); + ASSERT_EQ(test_uint64.pass_hist1d_mean, 1); + ASSERT_EQ(test_uint64.pass_hist1d_var, 1); + ASSERT_EQ(test_uint64.pass_hist1d_covar, 1); + ASSERT_EQ(test_uint64.pass_hist3d_mean, 1); + ASSERT_EQ(test_uint64.pass_hist3d_var, 1); + ASSERT_EQ(test_uint64.pass_hist3d_covar, 1); + deep_copy(density_1d, 0); + deep_copy(density_3d, 0); + + cout << "Test Scalar=float" << endl; + test_random_scalar<RandomGenerator, float> test_float(density_1d, density_3d, + pool, num_draws); + ASSERT_EQ(test_float.pass_mean, 1); + ASSERT_EQ(test_float.pass_var, 1); + ASSERT_EQ(test_float.pass_covar, 1); + ASSERT_EQ(test_float.pass_hist1d_mean, 1); + ASSERT_EQ(test_float.pass_hist1d_var, 1); + ASSERT_EQ(test_float.pass_hist1d_covar, 1); + ASSERT_EQ(test_float.pass_hist3d_mean, 1); + ASSERT_EQ(test_float.pass_hist3d_var, 1); + ASSERT_EQ(test_float.pass_hist3d_covar, 1); + deep_copy(density_1d, 0); + deep_copy(density_3d, 0); + + cout << "Test Scalar=double" << endl; + test_random_scalar<RandomGenerator, double> test_double( + density_1d, density_3d, pool, num_draws); + ASSERT_EQ(test_double.pass_mean, 1); + ASSERT_EQ(test_double.pass_var, 1); + ASSERT_EQ(test_double.pass_covar, 1); + ASSERT_EQ(test_double.pass_hist1d_mean, 1); + ASSERT_EQ(test_double.pass_hist1d_var, 1); + ASSERT_EQ(test_double.pass_hist1d_covar, 1); + 
ASSERT_EQ(test_double.pass_hist3d_mean, 1); + ASSERT_EQ(test_double.pass_hist3d_var, 1); + ASSERT_EQ(test_double.pass_hist3d_covar, 1); +} +} // namespace Impl + +template <typename ExecutionSpace> +void test_random_xorshift64() { +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_HIP) + const int num_draws = 132141141; +#else // SERIAL, HPX, OPENMP + const int num_draws = 10240000; +#endif + Impl::test_random<Kokkos::Random_XorShift64_Pool<ExecutionSpace>>(num_draws); + Impl::test_random<Kokkos::Random_XorShift64_Pool< + Kokkos::Device<ExecutionSpace, typename ExecutionSpace::memory_space>>>( + num_draws); +} + +template <typename ExecutionSpace> +void test_random_xorshift1024() { +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_HIP) + const int num_draws = 52428813; +#else // SERIAL, HPX, OPENMP + const int num_draws = 10130144; +#endif + Impl::test_random<Kokkos::Random_XorShift1024_Pool<ExecutionSpace>>( + num_draws); + Impl::test_random<Kokkos::Random_XorShift1024_Pool< + Kokkos::Device<ExecutionSpace, typename ExecutionSpace::memory_space>>>( + num_draws); +} +} // namespace Test + +#endif // KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp b/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c6d3b59ae1f12422c448a13f5f91f2ed74cc58ff --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/TestRandomCommon.hpp @@ -0,0 +1,60 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP + +#include <TestRandom.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, Random_XorShift64) { + test_random_xorshift64<TEST_EXECSPACE>(); +} +TEST(TEST_CATEGORY, Random_XorShift1024_0) { + test_random_xorshift1024<TEST_EXECSPACE>(); +} +} // namespace Test + +#endif diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a3c362ec201bae07df05867d07136e26e73204d0 --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp @@ -0,0 +1,378 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> +#include <Kokkos_DynamicView.hpp> +#include <Kokkos_Random.hpp> +#include <Kokkos_Sort.hpp> + +namespace Test { + +namespace Impl { + +template <class ExecutionSpace, class Scalar> +struct is_sorted_struct { + using value_type = unsigned int; + using execution_space = ExecutionSpace; + + Kokkos::View<Scalar*, ExecutionSpace> keys; + + is_sorted_struct(Kokkos::View<Scalar*, ExecutionSpace> keys_) : keys(keys_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, unsigned int& count) const { + if (keys(i) > keys(i + 1)) count++; + } +}; + +template <class ExecutionSpace, class Scalar> +struct sum { + using value_type = double; + using execution_space = ExecutionSpace; + + Kokkos::View<Scalar*, ExecutionSpace> keys; + + sum(Kokkos::View<Scalar*, ExecutionSpace> keys_) : keys(keys_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, double& count) const { count += keys(i); } +}; + +template <class 
ExecutionSpace, class Scalar> +struct bin3d_is_sorted_struct { + using value_type = unsigned int; + using execution_space = ExecutionSpace; + + Kokkos::View<Scalar * [3], ExecutionSpace> keys; + + int max_bins; + Scalar min; + Scalar max; + + bin3d_is_sorted_struct(Kokkos::View<Scalar * [3], ExecutionSpace> keys_, + int max_bins_, Scalar min_, Scalar max_) + : keys(keys_), max_bins(max_bins_), min(min_), max(max_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, unsigned int& count) const { + int ix1 = int((keys(i, 0) - min) / max * max_bins); + int iy1 = int((keys(i, 1) - min) / max * max_bins); + int iz1 = int((keys(i, 2) - min) / max * max_bins); + int ix2 = int((keys(i + 1, 0) - min) / max * max_bins); + int iy2 = int((keys(i + 1, 1) - min) / max * max_bins); + int iz2 = int((keys(i + 1, 2) - min) / max * max_bins); + + if (ix1 > ix2) + count++; + else if (ix1 == ix2) { + if (iy1 > iy2) + count++; + else if ((iy1 == iy2) && (iz1 > iz2)) + count++; + } + } +}; + +template <class ExecutionSpace, class Scalar> +struct sum3D { + using value_type = double; + using execution_space = ExecutionSpace; + + Kokkos::View<Scalar * [3], ExecutionSpace> keys; + + sum3D(Kokkos::View<Scalar * [3], ExecutionSpace> keys_) : keys(keys_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, double& count) const { + count += keys(i, 0); + count += keys(i, 1); + count += keys(i, 2); + } +}; + +template <class ExecutionSpace, typename KeyType> +void test_1D_sort_impl(unsigned int n, bool force_kokkos) { + using KeyViewType = Kokkos::View<KeyType*, ExecutionSpace>; + KeyViewType keys("Keys", n); + + // Test sorting array with all numbers equal + Kokkos::deep_copy(keys, KeyType(1)); + Kokkos::sort(keys, force_kokkos); + + Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931); + Kokkos::fill_random(keys, g, + Kokkos::Random_XorShift64_Pool< + ExecutionSpace>::generator_type::MAX_URAND); + + double sum_before = 0.0; + double sum_after = 0.0; + unsigned int sort_fails = 0; + + 
Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys), sum_before); + + Kokkos::sort(keys, force_kokkos); + + Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys), sum_after); + Kokkos::parallel_reduce( + n - 1, is_sorted_struct<ExecutionSpace, KeyType>(keys), sort_fails); + + double ratio = sum_before / sum_after; + double epsilon = 1e-10; + unsigned int equal_sum = + (ratio > (1.0 - epsilon)) && (ratio < (1.0 + epsilon)) ? 1 : 0; + + ASSERT_EQ(sort_fails, 0); + ASSERT_EQ(equal_sum, 1); +} + +template <class ExecutionSpace, typename KeyType> +void test_3D_sort_impl(unsigned int n) { + using KeyViewType = Kokkos::View<KeyType * [3], ExecutionSpace>; + + KeyViewType keys("Keys", n * n * n); + + Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931); + Kokkos::fill_random(keys, g, 100.0); + + double sum_before = 0.0; + double sum_after = 0.0; + unsigned int sort_fails = 0; + + Kokkos::parallel_reduce(keys.extent(0), sum3D<ExecutionSpace, KeyType>(keys), + sum_before); + + int bin_1d = 1; + while (bin_1d * bin_1d * bin_1d * 4 < (int)keys.extent(0)) bin_1d *= 2; + int bin_max[3] = {bin_1d, bin_1d, bin_1d}; + typename KeyViewType::value_type min[3] = {0, 0, 0}; + typename KeyViewType::value_type max[3] = {100, 100, 100}; + + using BinOp = Kokkos::BinOp3D<KeyViewType>; + BinOp bin_op(bin_max, min, max); + Kokkos::BinSort<KeyViewType, BinOp> Sorter(keys, bin_op, false); + Sorter.create_permute_vector(); + Sorter.template sort<KeyViewType>(keys); + + Kokkos::parallel_reduce(keys.extent(0), sum3D<ExecutionSpace, KeyType>(keys), + sum_after); + Kokkos::parallel_reduce(keys.extent(0) - 1, + bin3d_is_sorted_struct<ExecutionSpace, KeyType>( + keys, bin_1d, min[0], max[0]), + sort_fails); + + double ratio = sum_before / sum_after; + double epsilon = 1e-10; + unsigned int equal_sum = + (ratio > (1.0 - epsilon)) && (ratio < (1.0 + epsilon)) ? 
1 : 0; + + if (sort_fails) + printf("3D Sort Sum: %f %f Fails: %u\n", sum_before, sum_after, sort_fails); + + ASSERT_EQ(sort_fails, 0); + ASSERT_EQ(equal_sum, 1); +} + +//---------------------------------------------------------------------------- + +template <class ExecutionSpace, typename KeyType> +void test_dynamic_view_sort_impl(unsigned int n) { + using KeyDynamicViewType = + Kokkos::Experimental::DynamicView<KeyType*, ExecutionSpace>; + using KeyViewType = Kokkos::View<KeyType*, ExecutionSpace>; + + const size_t upper_bound = 2 * n; + const size_t min_chunk_size = 1024; + + KeyDynamicViewType keys("Keys", min_chunk_size, upper_bound); + + keys.resize_serial(n); + + KeyViewType keys_view("KeysTmp", n); + + // Test sorting array with all numbers equal + Kokkos::deep_copy(keys_view, KeyType(1)); + Kokkos::deep_copy(keys, keys_view); + Kokkos::sort(keys, 0 /* begin */, n /* end */); + + Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931); + Kokkos::fill_random(keys_view, g, + Kokkos::Random_XorShift64_Pool< + ExecutionSpace>::generator_type::MAX_URAND); + + ExecutionSpace().fence(); + Kokkos::deep_copy(keys, keys_view); + // ExecutionSpace().fence(); + + double sum_before = 0.0; + double sum_after = 0.0; + unsigned int sort_fails = 0; + + Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys_view), + sum_before); + + Kokkos::sort(keys, 0 /* begin */, n /* end */); + + ExecutionSpace().fence(); // Need this fence to prevent BusError with Cuda + Kokkos::deep_copy(keys_view, keys); + // ExecutionSpace().fence(); + + Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys_view), + sum_after); + Kokkos::parallel_reduce( + n - 1, is_sorted_struct<ExecutionSpace, KeyType>(keys_view), sort_fails); + + double ratio = sum_before / sum_after; + double epsilon = 1e-10; + unsigned int equal_sum = + (ratio > (1.0 - epsilon)) && (ratio < (1.0 + epsilon)) ? 
1 : 0; + + if (sort_fails != 0 || equal_sum != 1) { + std::cout << " N = " << n << " ; sum_before = " << sum_before + << " ; sum_after = " << sum_after << " ; ratio = " << ratio + << std::endl; + } + + ASSERT_EQ(sort_fails, 0); + ASSERT_EQ(equal_sum, 1); +} + +//---------------------------------------------------------------------------- + +template <class ExecutionSpace> +void test_issue_1160_impl() { + Kokkos::View<int*, ExecutionSpace> element_("element", 10); + Kokkos::View<double*, ExecutionSpace> x_("x", 10); + Kokkos::View<double*, ExecutionSpace> v_("y", 10); + + auto h_element = Kokkos::create_mirror_view(element_); + auto h_x = Kokkos::create_mirror_view(x_); + auto h_v = Kokkos::create_mirror_view(v_); + + h_element(0) = 9; + h_element(1) = 8; + h_element(2) = 7; + h_element(3) = 6; + h_element(4) = 5; + h_element(5) = 4; + h_element(6) = 3; + h_element(7) = 2; + h_element(8) = 1; + h_element(9) = 0; + + for (int i = 0; i < 10; ++i) { + h_v.access(i, 0) = h_x.access(i, 0) = double(h_element(i)); + } + Kokkos::deep_copy(element_, h_element); + Kokkos::deep_copy(x_, h_x); + Kokkos::deep_copy(v_, h_v); + + using KeyViewType = decltype(element_); + using BinOp = Kokkos::BinOp1D<KeyViewType>; + + int begin = 3; + int end = 8; + auto max = h_element(begin); + auto min = h_element(end - 1); + BinOp binner(end - begin, min, max); + + Kokkos::BinSort<KeyViewType, BinOp> Sorter(element_, begin, end, binner, + false); + Sorter.create_permute_vector(); + Sorter.sort(element_, begin, end); + + Sorter.sort(x_, begin, end); + Sorter.sort(v_, begin, end); + + Kokkos::deep_copy(h_element, element_); + Kokkos::deep_copy(h_x, x_); + Kokkos::deep_copy(h_v, v_); + + ASSERT_EQ(h_element(0), 9); + ASSERT_EQ(h_element(1), 8); + ASSERT_EQ(h_element(2), 7); + ASSERT_EQ(h_element(3), 2); + ASSERT_EQ(h_element(4), 3); + ASSERT_EQ(h_element(5), 4); + ASSERT_EQ(h_element(6), 5); + ASSERT_EQ(h_element(7), 6); + ASSERT_EQ(h_element(8), 1); + ASSERT_EQ(h_element(9), 0); + + for (int i = 
0; i < 10; ++i) { + ASSERT_EQ(h_element(i), int(h_x.access(i, 0))); + ASSERT_EQ(h_element(i), int(h_v.access(i, 0))); + } +} + +//---------------------------------------------------------------------------- + +template <class ExecutionSpace, typename KeyType> +void test_1D_sort(unsigned int N) { + test_1D_sort_impl<ExecutionSpace, KeyType>(N * N * N, true); + test_1D_sort_impl<ExecutionSpace, KeyType>(N * N * N, false); +} + +template <class ExecutionSpace, typename KeyType> +void test_3D_sort(unsigned int N) { + test_3D_sort_impl<ExecutionSpace, KeyType>(N); +} + +template <class ExecutionSpace, typename KeyType> +void test_dynamic_view_sort(unsigned int N) { + test_dynamic_view_sort_impl<ExecutionSpace, KeyType>(N * N); +} + +template <class ExecutionSpace> +void test_issue_1160_sort() { + test_issue_1160_impl<ExecutionSpace>(); +} + +template <class ExecutionSpace, typename KeyType> +void test_sort(unsigned int N) { + test_1D_sort<ExecutionSpace, KeyType>(N); + test_3D_sort<ExecutionSpace, KeyType>(N); + test_dynamic_view_sort<ExecutionSpace, KeyType>(N); + test_issue_1160_sort<ExecutionSpace>(); +} +} // namespace Impl +} // namespace Test +#endif /* KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP */ diff --git a/packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp b/packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp new file mode 100644 index 0000000000000000000000000000000000000000..56657b6574b865419a1f93e01a49aa2a3e648736 --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/TestSortCommon.hpp @@ -0,0 +1,55 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP + +#include <TestSort.hpp> + +namespace Test { +TEST(TEST_CATEGORY, SortUnsigned) { + Impl::test_sort<TEST_EXECSPACE, unsigned>(171); +} +} // namespace Test +#endif diff --git a/packages/kokkos/algorithms/unit_tests/UnitTestMain.cpp b/packages/kokkos/algorithms/unit_tests/UnitTestMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e245aad35fc33a595a16f711dbd4a63a0c7f8948 --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/UnitTestMain.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + ::testing::InitGoogleTest(&argc, argv); + int result = RUN_ALL_TESTS(); + Kokkos::finalize(); + return result; +} diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/appveyor.yml new file mode 100644 index 0000000000000000000000000000000000000000..e8763c0b665c4a992f74b70eab0caa915beb33dd --- /dev/null +++ b/packages/kokkos/appveyor.yml @@ -0,0 +1,6 @@ +image: + - Visual Studio 2019 +clone_folder: c:\projects\source +build_script: +- cmd: >- + cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake diff --git a/packages/kokkos/benchmarks/atomic/Makefile b/packages/kokkos/benchmarks/atomic/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..636c0ad4ab468a7611974cc2de68970b59a2f73c --- /dev/null +++ 
b/packages/kokkos/benchmarks/atomic/Makefile @@ -0,0 +1,51 @@ +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = atomic_perf.cuda +else +CXX = g++ +EXE = atomic_perf.exe +endif + +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o atomic_perf.cuda atomic_perf.exe + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/benchmarks/atomic/main.cpp b/packages/kokkos/benchmarks/atomic/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7b5caa1aee1658104a8916bec314759e3e5ba30a --- /dev/null +++ b/packages/kokkos/benchmarks/atomic/main.cpp @@ -0,0 +1,120 @@ +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <Kokkos_Random.hpp> + +template <class Scalar> +double test_atomic(int L, int N, int M, int K, int R, + Kokkos::View<const int**> offsets) { + Kokkos::View<Scalar*> output("Output", N); + Kokkos::Impl::Timer timer; + + for (int r = 0; r < R; r++) + Kokkos::parallel_for( + L, KOKKOS_LAMBDA(const int& i) { + Scalar s = 2; + for (int m = 0; m < M; m++) { + for (int k = 0; k < K; k++) s = s * s + s; + const int idx = (i + offsets(i, m)) % N; + 
Kokkos::atomic_add(&output(idx), s); + } + }); + Kokkos::fence(); + double time = timer.seconds(); + + return time; +} + +template <class Scalar> +double test_no_atomic(int L, int N, int M, int K, int R, + Kokkos::View<const int**> offsets) { + Kokkos::View<Scalar*> output("Output", N); + Kokkos::Impl::Timer timer; + for (int r = 0; r < R; r++) + Kokkos::parallel_for( + L, KOKKOS_LAMBDA(const int& i) { + Scalar s = 2; + for (int m = 0; m < M; m++) { + for (int k = 0; k < K; k++) s = s * s + s; + const int idx = (i + offsets(i, m)) % N; + output(idx) += s; + } + }); + Kokkos::fence(); + double time = timer.seconds(); + return time; +} + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + if (argc < 8) { + printf("Arguments: L N M D K R T\n"); + printf(" L: Number of iterations to run\n"); + printf(" N: Length of array to do atomics into\n"); + printf(" M: Number of atomics per iteration to do\n"); + printf(" D: Distance from index i to do atomics into (randomly)\n"); + printf(" K: Number of FMAD per atomic\n"); + printf(" R: Number of repeats of the experiments\n"); + printf(" T: Type of atomic\n"); + printf(" 1 - int\n"); + printf(" 2 - long\n"); + printf(" 3 - float\n"); + printf(" 4 - double\n"); + printf(" 5 - complex<double>\n"); + printf("Example Input GPU:\n"); + printf(" Histogram : 1000000 1000 1 1000 1 10 1\n"); + printf(" MD Force : 100000 100000 100 1000 20 10 4\n"); + printf(" Matrix Assembly : 100000 1000000 50 1000 20 10 4\n"); + Kokkos::finalize(); + return 0; + } + + int L = std::stoi(argv[1]); + int N = std::stoi(argv[2]); + int M = std::stoi(argv[3]); + int D = std::stoi(argv[4]); + int K = std::stoi(argv[5]); + int R = std::stoi(argv[6]); + int type = std::stoi(argv[7]); + + Kokkos::View<int**> offsets("Offsets", L, M); + Kokkos::Random_XorShift64_Pool<> pool(12371); + Kokkos::fill_random(offsets, pool, D); + double time = 0; + if (type == 1) time = test_atomic<int>(L, N, M, K, R, offsets); + if (type == 2) time = 
test_atomic<long>(L, N, M, K, R, offsets); + if (type == 3) time = test_atomic<float>(L, N, M, K, R, offsets); + if (type == 4) time = test_atomic<double>(L, N, M, K, R, offsets); + if (type == 5) + time = test_atomic<Kokkos::complex<double> >(L, N, M, K, R, offsets); + + double time2 = 1; + if (type == 1) time2 = test_no_atomic<int>(L, N, M, K, R, offsets); + if (type == 2) time2 = test_no_atomic<long>(L, N, M, K, R, offsets); + if (type == 3) time2 = test_no_atomic<float>(L, N, M, K, R, offsets); + if (type == 4) time2 = test_no_atomic<double>(L, N, M, K, R, offsets); + if (type == 5) + time2 = test_no_atomic<Kokkos::complex<double> >(L, N, M, K, R, offsets); + + int size = 0; + if (type == 1) size = sizeof(int); + if (type == 2) size = sizeof(long); + if (type == 3) size = sizeof(float); + if (type == 4) size = sizeof(double); + if (type == 5) size = sizeof(Kokkos::complex<double>); + + printf("%i\n", size); + printf( + "Time: %s %i %i %i %i %i %i (t_atomic: %e t_nonatomic: %e ratio: %lf " + ")( GUpdates/s: %lf GB/s: %lf )\n", + (type == 1) + ? "int" + : ((type == 2) + ? "long" + : ((type == 3) ? "float" + : ((type == 4) ? 
"double" : "complex"))), + L, N, M, D, K, R, time, time2, time / time2, 1.e-9 * L * R * M / time, + 1.0 * L * R * M * 2 * size / time / 1024 / 1024 / 1024); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash b/packages/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash new file mode 100755 index 0000000000000000000000000000000000000000..0b885293e27ad4810ecc89e2f0ffdc4cc61b2f2f --- /dev/null +++ b/packages/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash @@ -0,0 +1,84 @@ +#!/bin/bash + +# ---- Default Settings ----- + +# Paths +KOKKOS_PATH=${PWD}/kokkos +KOKKOS_KERNELS_PATH=${PWD}/kokkos-kernels +MINIMD_PATH=${PWD}/miniMD/kokkos +MINIFE_PATH=${PWD}/miniFE/kokkos + +# Kokkos Configure Options +KOKKOS_DEVICES=OpenMP +KOKKOS_ARCH=SNB + +# Compiler Options +CXX=mpicxx +OPT_FLAG="-O3" + +while [[ $# > 0 ]] +do + key="$1" + + case $key in + --kokkos-path*) + KOKKOS_PATH="${key#*=}" + ;; + --kokkos-kernels-path*) + KOKKOS_KERNELS_PATH="${key#*=}" + ;; + --minimd-path*) + MINIMD_PATH="${key#*=}" + ;; + --minife-path*) + MINIFE_PATH="${key#*=}" + ;; + --device-list*) + KOKKOS_DEVICES="${key#*=}" + ;; + --arch*) + KOKKOS_ARCH="--arch=${key#*=}" + ;; + --opt-flag*) + OPT_FLAG="${key#*=}" + ;; + --compiler*) + CXX="${key#*=}" + ;; + --with-cuda-options*) + KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" + ;; + --help*) + PRINT_HELP=True + ;; + *) + # args, just append + ARGS="$ARGS $1" + ;; + esac + + shift +done + +mkdir build + +# Build BytesAndFlops +mkdir build/bytes_and_flops +cd build/bytes_and_flops +make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH}\ + CXXFLAGS=${OPT_FLAG} -f ${KOKKOS_PATH}/benchmarks/bytes_and_flops/Makefile -j 16 +cd ../.. 
+ +mkdir build/miniMD +cd build/miniMD +make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH} \ + CXXFLAGS=${OPT_FLAG} -f ${MINIMD_PATH}/Makefile -j 16 +cd ../../ + +mkdir build/miniFE +cd build/miniFE +make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH} \ + CXXFLAGS=${OPT_FLAG} -f ${MINIFE_PATH}/src/Makefile -j 16 +cd ../../ + + diff --git a/packages/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash b/packages/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash new file mode 100755 index 0000000000000000000000000000000000000000..9b52a36d89ac107297c9c0501027c946b0cdf55a --- /dev/null +++ b/packages/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash @@ -0,0 +1,37 @@ +#!/bin/bash + +# Kokkos +if [ ! -d "kokkos" ]; then + git clone https://github.com/kokkos/kokkos +fi +cd kokkos +git checkout develop +git pull +cd .. + +# KokkosKernels +if [ ! -d "kokkos-kernels" ]; then +git clone https://github.com/kokkos/kokkos-kernels +fi +cd kokkos-kernels +git pull +cd .. + +# MiniMD +if [ ! -d "miniMD" ]; then + git clone https://github.com/mantevo/miniMD +fi +cd miniMD +git pull +cd .. + +# MiniFE +if [ ! -d "miniFE" ]; then + git clone https://github.com/mantevo/miniFE +fi +cd miniFE +git pull +cd .. 
+ + + diff --git a/packages/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash b/packages/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash new file mode 100755 index 0000000000000000000000000000000000000000..6afa05f5fcfbdd1ff3a55b529edd8d52d92bf2d8 --- /dev/null +++ b/packages/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash @@ -0,0 +1,14 @@ +#!/bin/bash +SCRIPT_PATH=$1 +KOKKOS_DEVICES=$2 +KOKKOS_ARCH=$3 +COMPILER=$4 +if [[ $# < 4 ]]; then + echo "Usage: ./run_benchmark.bash PATH_TO_SCRIPTS KOKKOS_DEVICES KOKKOS_ARCH COMPILER" +else + +${SCRIPT_PATH}/checkout_repos.bash +${SCRIPT_PATH}/build_code.bash --arch=${KOKKOS_ARCH} --device-list=${KOKKOS_DEVICES} --compiler=${COMPILER} +${SCRIPT_PATH}/run_tests.bash + +fi \ No newline at end of file diff --git a/packages/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash b/packages/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash new file mode 100755 index 0000000000000000000000000000000000000000..4fcac3df9fa394216b327ef11bf3a14eaf0123eb --- /dev/null +++ b/packages/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash @@ -0,0 +1,44 @@ +#!/bin/bash + +# BytesAndFlops +cd build/bytes_and_flops + +USE_CUDA=`grep "_CUDA" KokkosCore_config.h | wc -l` + +if [[ ${USE_CUDA} > 0 ]]; then + BAF_EXE=bytes_and_flops.cuda + TEAM_SIZE=256 +else + BAF_EXE=bytes_and_flops.exe + TEAM_SIZE=1 +fi + +BAF_PERF_1=`./${BAF_EXE} 2 100000 1024 1 1 1 1 ${TEAM_SIZE} 6000 | awk '{print $12/174.5}'` +BAF_PERF_2=`./${BAF_EXE} 2 100000 1024 16 1 8 64 ${TEAM_SIZE} 6000 | awk '{print $14/1142.65}'` + +echo "BytesAndFlops: ${BAF_PERF_1} ${BAF_PERF_2}" +cd ../.. 
+ + +# MiniMD +cd build/miniMD +cp ../../miniMD/kokkos/Cu_u6.eam ./ +MD_PERF_1=`./miniMD --half_neigh 0 -s 60 --ntypes 1 -t ${OMP_NUM_THREADS} -i ../../miniMD/kokkos/in.eam.miniMD | grep PERF_SUMMARY | awk '{print $10/21163341}'` +MD_PERF_2=`./miniMD --half_neigh 0 -s 20 --ntypes 1 -t ${OMP_NUM_THREADS} -i ../../miniMD/kokkos/in.eam.miniMD | grep PERF_SUMMARY | awk '{print $10/13393417}'` + +echo "MiniMD: ${MD_PERF_1} ${MD_PERF_2}" +cd ../.. + +# MiniFE +cd build/miniFE +rm *.yaml +./miniFE.x -nx 100 &> /dev/null +FE_PERF_1=`grep "CG Mflop" *.yaml | awk '{print $4/14174}'` +rm *.yaml +./miniFE.x -nx 50 &> /dev/null +FE_PERF_2=`grep "CG Mflop" *.yaml | awk '{print $4/11897}'` +cd ../.. +echo "MiniFE: ${FE_PERF_1} ${FE_PERF_2}" + +PERF_RESULT=`echo "${BAF_PERF_1} ${BAF_PERF_2} ${MD_PERF_1} ${MD_PERF_2} ${FE_PERF_1} ${FE_PERF_2}" | awk '{print ($1+$2+$3+$4+$5+$6)/6}'` +echo "Total Result: " ${PERF_RESULT} diff --git a/packages/kokkos/benchmarks/bytes_and_flops/Makefile b/packages/kokkos/benchmarks/bytes_and_flops/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..1aa4edddcdbcffd1439c48127aa314d985e34c8b --- /dev/null +++ b/packages/kokkos/benchmarks/bytes_and_flops/Makefile @@ -0,0 +1,51 @@ +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = bytes_and_flops.cuda +else +CXX = g++ +EXE = bytes_and_flops.exe +endif + +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp new file mode 100644 index 0000000000000000000000000000000000000000..62d7ef4a4cf387191c6d0276c4ea360c289d4de5 --- /dev/null +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp @@ -0,0 +1,93 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> + +template <class Scalar, int Unroll, int Stride> +struct Run { + static void run(int N, int K, int R, int F, int T, int S); +}; + +template <class Scalar, int Stride> +struct RunStride { + static void run_1(int N, int K, int R, int F, int T, int S); + static void run_2(int N, int K, int R, int F, int T, int S); + static void run_3(int N, int K, int R, int F, int T, int S); + static void run_4(int N, int K, int R, int F, int T, int S); + static void run_5(int N, int K, int R, int F, int T, int S); + static void run_6(int N, int K, int R, int F, int T, int S); + static void run_7(int N, int K, int R, int F, int T, int S); + static void run_8(int N, int K, int R, int F, int T, int S); + static void run(int N, int K, int R, int U, int F, int T, int S); +}; + +#define STRIDE 1 +#include <bench_stride.hpp> +#undef STRIDE +#define STRIDE 2 +#include <bench_stride.hpp> +#undef STRIDE +#define STRIDE 4 +#include <bench_stride.hpp> +#undef STRIDE +#define STRIDE 8 +#include <bench_stride.hpp> +#undef STRIDE +#define STRIDE 16 +#include <bench_stride.hpp> +#undef STRIDE +#define STRIDE 32 +#include <bench_stride.hpp> +#undef STRIDE + +template <class Scalar> +void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S) { + if (D == 1) RunStride<Scalar, 1>::run(N, K, R, U, F, T, S); + if (D == 2) RunStride<Scalar, 2>::run(N, K, R, U, F, T, S); + if (D == 4) RunStride<Scalar, 4>::run(N, K, R, U, F, T, S); + if (D == 8) RunStride<Scalar, 8>::run(N, K, R, U, F, T, S); + if (D == 16) RunStride<Scalar, 16>::run(N, K, R, U, F, T, S); + if (D == 32) RunStride<Scalar, 32>::run(N, K, R, U, F, T, S); +} diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..64817fe9dc4b5d1efcad168d9b2e0915da6d492a --- /dev/null +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp @@ -0,0 +1,123 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#define UNROLL 1 +#include <bench_unroll_stride.hpp> +#undef UNROLL +#define UNROLL 2 +#include <bench_unroll_stride.hpp> +#undef UNROLL +#define UNROLL 3 +#include <bench_unroll_stride.hpp> +#undef UNROLL +#define UNROLL 4 +#include <bench_unroll_stride.hpp> +#undef UNROLL +#define UNROLL 5 +#include <bench_unroll_stride.hpp> +#undef UNROLL +#define UNROLL 6 +#include <bench_unroll_stride.hpp> +#undef UNROLL +#define UNROLL 7 +#include <bench_unroll_stride.hpp> +#undef UNROLL +#define UNROLL 8 +#include <bench_unroll_stride.hpp> +#undef UNROLL + +template <class Scalar> +struct RunStride<Scalar, STRIDE> { + static void run_1(int N, int K, int R, int F, int T, int S) { + Run<Scalar, 1, STRIDE>::run(N, K, R, F, T, S); + } + static void run_2(int N, int K, int R, int F, int T, int S) { + Run<Scalar, 2, STRIDE>::run(N, K, R, F, T, S); + } + static void run_3(int N, int K, int R, int F, int T, int S) { + Run<Scalar, 3, STRIDE>::run(N, K, R, F, T, S); + } + static void run_4(int N, int K, int R, int F, int T, int S) { + Run<Scalar, 4, STRIDE>::run(N, K, R, F, T, S); + } + static void run_5(int N, int K, int R, int F, int T, int S) { + Run<Scalar, 5, STRIDE>::run(N, K, R, F, T, S); + } + static void run_6(int N, int K, int R, int F, int T, int S) { + Run<Scalar, 6, STRIDE>::run(N, K, R, F, T, S); + } + static void run_7(int N, int K, int R, int F, int T, int S) { + Run<Scalar, 7, STRIDE>::run(N, K, R, F, T, S); + } + static void run_8(int N, int K, int R, int F, int T, int S) { + Run<Scalar, 8, STRIDE>::run(N, K, R, F, T, S); + } + + static void run(int N, int K, int R, int U, int F, int T, int S) { + if (U == 1) { + run_1(N, K, R, F, T, S); + } + if (U == 2) { + run_2(N, K, R, F, T, S); + } + if (U == 3) { + run_3(N, K, R, F, T, S); + } + if (U == 4) { + run_4(N, K, R, F, T, S); + } + if (U == 5) { + 
run_5(N, K, R, F, T, S); + } + if (U == 6) { + run_6(N, K, R, F, T, S); + } + if (U == 7) { + run_7(N, K, R, F, T, S); + } + if (U == 8) { + run_8(N, K, R, F, T, S); + } + } +}; diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp new file mode 100644 index 0000000000000000000000000000000000000000..00ce635a489f677ff43b05c782856dcdfa1cafa9 --- /dev/null +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -0,0 +1,151 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +template <class Scalar> +struct Run<Scalar, UNROLL, STRIDE> { + static void run(int N, int K, int R, int F, int T, int S) { + Kokkos::View<Scalar* * [STRIDE], Kokkos::LayoutRight> A("A", N, K); + Kokkos::View<Scalar* * [STRIDE], Kokkos::LayoutRight> B("B", N, K); + Kokkos::View<Scalar* * [STRIDE], Kokkos::LayoutRight> C("C", N, K); + + Kokkos::deep_copy(A, Scalar(1.5)); + Kokkos::deep_copy(B, Scalar(2.5)); + Kokkos::deep_copy(C, Scalar(3.5)); + + Kokkos::Timer timer; + Kokkos::parallel_for( + "BenchmarkKernel", + Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) { + const int n = team.league_rank(); + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, K), [&](const int& i) { + Scalar a1 = A(n, i, 0); + const Scalar b = B(n, i, 0); +#if (UNROLL > 1) + Scalar a2 = a1 * 1.3; +#endif +#if (UNROLL > 2) + Scalar a3 = a2 * 1.1; +#endif +#if (UNROLL > 3) + Scalar a4 = a3 * 1.1; +#endif +#if (UNROLL > 4) + Scalar a5 = a4 * 1.3; +#endif +#if (UNROLL > 5) + Scalar a6 = a5 * 1.1; +#endif +#if (UNROLL > 6) + Scalar a7 = a6 * 1.1; +#endif +#if (UNROLL > 7) + Scalar a8 = a7 * 1.1; +#endif + + for (int f = 0; f < F; f++) { + a1 += b * a1; +#if (UNROLL > 1) + a2 += b * a2; +#endif 
+#if (UNROLL > 2) + a3 += b * a3; +#endif +#if (UNROLL > 3) + a4 += b * a4; +#endif +#if (UNROLL > 4) + a5 += b * a5; +#endif +#if (UNROLL > 5) + a6 += b * a6; +#endif +#if (UNROLL > 6) + a7 += b * a7; +#endif +#if (UNROLL > 7) + a8 += b * a8; +#endif + } +#if (UNROLL == 1) + C(n, i, 0) = a1; +#endif +#if (UNROLL == 2) + C(n, i, 0) = a1 + a2; +#endif +#if (UNROLL == 3) + C(n, i, 0) = a1 + a2 + a3; +#endif +#if (UNROLL == 4) + C(n, i, 0) = a1 + a2 + a3 + a4; +#endif +#if (UNROLL == 5) + C(n, i, 0) = a1 + a2 + a3 + a4 + a5; +#endif +#if (UNROLL == 6) + C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6; +#endif +#if (UNROLL == 7) + C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7; +#endif +#if (UNROLL == 8) + C(n, i, 0) = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8; +#endif + }); + } + }); + Kokkos::fence(); + double seconds = timer.seconds(); + + double bytes = 1.0 * N * K * R * 3 * sizeof(Scalar); + double flops = 1.0 * N * K * R * (F * 2 * UNROLL + 2 * (UNROLL - 1)); + printf( + "NKRUFTS: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: " + "%lf\n", + N, K, R, UNROLL, F, T, S, seconds, + 1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds); + } +}; diff --git a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6da2407a08b7afb981ffb8c3a970b2df7d55f951 --- /dev/null +++ b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -0,0 +1,107 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <bench.hpp> +#include <cstdlib> + +int main(int argc, char* argv[]) { + Kokkos::initialize(); + + if (argc < 10) { + printf("Arguments: N K R D U F T S\n"); + printf(" P: Precision (1==float, 2==double)\n"); + printf(" N,K: dimensions of the 2D array to allocate\n"); + printf(" R: how often to loop through the K dimension with each team\n"); + printf(" D: distance between loaded elements (stride)\n"); + printf(" U: how many independent flops to do per load\n"); + printf( + " F: how many times to repeat the U unrolled operations before " + "reading next element\n"); + printf(" T: team size\n"); + printf( + " S: shared memory per team (used to control occupancy on GPUs)\n"); + printf("Example Input GPU:\n"); + printf(" Bandwidth Bound : 2 100000 1024 1 1 1 1 256 6000\n"); + printf(" Cache Bound : 2 100000 1024 64 1 1 1 512 20000\n"); + printf(" Compute Bound : 2 100000 1024 1 1 8 64 256 6000\n"); + printf(" Load Slots Used : 2 20000 256 32 16 1 1 256 6000\n"); + printf(" Inefficient Load: 2 20000 256 32 2 1 1 256 20000\n"); + Kokkos::finalize(); + return 0; + } + + int P = std::stoi(argv[1]); + int N = std::stoi(argv[2]); + int K = std::stoi(argv[3]); + int R = std::stoi(argv[4]); + int D = std::stoi(argv[5]); + int U = std::stoi(argv[6]); + int F = std::stoi(argv[7]); + int T = std::stoi(argv[8]); + int S = std::stoi(argv[9]); + + if (U > 8) { + printf("U must be 1-8\n"); + return 0; + } + if ((D != 1) && (D != 2) && (D != 4) && (D != 8) && (D != 16) && (D != 32)) { + printf("D must be one of 1,2,4,8,16,32\n"); + return 0; + } + if ((P != 1) && (P != 2)) { + printf("P must be one of 1,2\n"); + return 0; + } + + if (P == 1) { + run_stride_unroll<float>(N, K, R, D, U, F, T, S); + } + if (P == 2) { + run_stride_unroll<double>(N, K, R, D, U, F, T, S); + } + + 
Kokkos::finalize(); +} diff --git a/packages/kokkos/benchmarks/gather/Makefile b/packages/kokkos/benchmarks/gather/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..6827995bed5e5f5e896febe1989d2b9f0b8d040e --- /dev/null +++ b/packages/kokkos/benchmarks/gather/Makefile @@ -0,0 +1,51 @@ +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = gather.cuda +else +CXX = g++ +EXE = gather.exe +endif + +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o gather.cuda gather.exe + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/benchmarks/gather/gather.hpp b/packages/kokkos/benchmarks/gather/gather.hpp new file mode 100644 index 0000000000000000000000000000000000000000..239614184ba13aad35fe54190ce5eaf507c61fde --- /dev/null +++ b/packages/kokkos/benchmarks/gather/gather.hpp @@ -0,0 +1,85 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +template <class Scalar, int UNROLL> +struct RunGather { + static void run(int N, int K, int D, int R, int F); +}; + +#define UNROLL 1 +#include <gather_unroll.hpp> +#undef UNROLL +#define UNROLL 2 +#include <gather_unroll.hpp> +#undef UNROLL +#define UNROLL 3 +#include <gather_unroll.hpp> +#undef UNROLL +#define UNROLL 4 +#include <gather_unroll.hpp> +#undef UNROLL +#define UNROLL 5 +#include <gather_unroll.hpp> +#undef UNROLL +#define UNROLL 6 +#include <gather_unroll.hpp> +#undef UNROLL +#define UNROLL 7 +#include <gather_unroll.hpp> +#undef UNROLL +#define UNROLL 8 +#include <gather_unroll.hpp> +#undef UNROLL + +template <class Scalar> +void run_gather_test(int N, int K, int D, int R, int U, int F) { + if (U == 1) RunGather<Scalar, 1>::run(N, K, D, R, F); + if (U == 2) RunGather<Scalar, 2>::run(N, K, D, R, F); + if (U == 3) RunGather<Scalar, 3>::run(N, K, D, R, F); + if (U == 4) RunGather<Scalar, 4>::run(N, K, D, R, F); + if (U == 5) RunGather<Scalar, 5>::run(N, K, D, R, F); + if (U == 6) RunGather<Scalar, 6>::run(N, K, D, R, F); + if (U == 7) RunGather<Scalar, 7>::run(N, K, D, R, F); + if (U == 8) RunGather<Scalar, 8>::run(N, K, D, R, F); +} diff --git a/packages/kokkos/benchmarks/gather/gather_unroll.hpp b/packages/kokkos/benchmarks/gather/gather_unroll.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4dc046f99c33125f86d7674f61b38b77caff99c2 --- /dev/null +++ b/packages/kokkos/benchmarks/gather/gather_unroll.hpp @@ -0,0 +1,173 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <Kokkos_Random.hpp> + +template <class Scalar> +struct RunGather<Scalar, UNROLL> { + static void run(int N, int K, int D, int R, int F) { + Kokkos::View<int**> connectivity("Connectivity", N, K); + Kokkos::View<Scalar*> A_in("Input", N); + Kokkos::View<Scalar*> B_in("Input", N); + Kokkos::View<Scalar*> C("Output", N); + + Kokkos::Random_XorShift64_Pool<> rand_pool(12313); + + Kokkos::deep_copy(A_in, 1.5); + Kokkos::deep_copy(B_in, 2.0); + + Kokkos::View<const Scalar*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > A( + A_in); + Kokkos::View<const Scalar*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > B( + B_in); + + Kokkos::parallel_for( + "InitKernel", N, KOKKOS_LAMBDA(const int& i) { + auto rand_gen = rand_pool.get_state(); + for (int jj = 0; jj < K; jj++) { + connectivity(i, jj) = (rand_gen.rand(D) + i - D / 2 + N) % N; + } + rand_pool.free_state(rand_gen); + }); + Kokkos::fence(); + + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + "BenchmarkKernel", N, KOKKOS_LAMBDA(const int& i) { + Scalar c = Scalar(0.0); + for (int jj = 0; jj < K; jj++) { + const int j = connectivity(i, jj); + Scalar a1 = A(j); + const Scalar b = B(j); +#if (UNROLL > 1) + Scalar a2 = a1 * Scalar(1.3); +#endif +#if (UNROLL > 2) + Scalar a3 = a2 * Scalar(1.1); +#endif +#if (UNROLL > 3) + Scalar a4 = a3 * Scalar(1.1); +#endif +#if (UNROLL > 4) + Scalar a5 = a4 * Scalar(1.3); +#endif +#if (UNROLL > 5) + Scalar a6 = a5 * Scalar(1.1); +#endif +#if (UNROLL > 6) + Scalar a7 = a6 * Scalar(1.1); +#endif +#if (UNROLL > 7) + Scalar a8 = a7 * Scalar(1.1); +#endif + + for (int f = 0; f < F; f++) { + a1 += b * a1; +#if (UNROLL > 1) + a2 += b * a2; +#endif +#if (UNROLL > 2) + a3 += b * a3; +#endif +#if (UNROLL > 3) + a4 += b * a4; +#endif +#if (UNROLL > 4) + a5 += b * a5; +#endif +#if (UNROLL > 5) + a6 += b * a6; 
+#endif +#if (UNROLL > 6) + a7 += b * a7; +#endif +#if (UNROLL > 7) + a8 += b * a8; +#endif + } +#if (UNROLL == 1) + c += a1; +#endif +#if (UNROLL == 2) + c += a1 + a2; +#endif +#if (UNROLL == 3) + c += a1 + a2 + a3; +#endif +#if (UNROLL == 4) + c += a1 + a2 + a3 + a4; +#endif +#if (UNROLL == 5) + c += a1 + a2 + a3 + a4 + a5; +#endif +#if (UNROLL == 6) + c += a1 + a2 + a3 + a4 + a5 + a6; +#endif +#if (UNROLL == 7) + c += a1 + a2 + a3 + a4 + a5 + a6 + a7; +#endif +#if (UNROLL == 8) + c += a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8; +#endif + } + C(i) = c; + }); + Kokkos::fence(); + } + double seconds = timer.seconds(); + + double bytes = 1.0 * N * K * R * (2 * sizeof(Scalar) + sizeof(int)) + + 1.0 * N * R * sizeof(Scalar); + double flops = 1.0 * N * K * R * (F * 2 * UNROLL + 2 * (UNROLL - 1)); + double gather_ops = 1.0 * N * K * R * 2; + printf( + "SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: " + "%lf GGather/s: %lf\n", + sizeof(Scalar) / 4, N, K, D, R, UNROLL, F, seconds, + 1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds, + 1.e-9 * gather_ops / seconds); + } +}; diff --git a/packages/kokkos/benchmarks/gather/main.cpp b/packages/kokkos/benchmarks/gather/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5f10e4dcc1aa509c191d3c7a6486114b3c0b7de9 --- /dev/null +++ b/packages/kokkos/benchmarks/gather/main.cpp @@ -0,0 +1,101 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <gather.hpp> +#include <cstdlib> + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + + if (argc < 8) { + printf("Arguments: S N K D\n"); + printf( + " S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n"); + printf(" N: Number of entities\n"); + printf(" K: Number of things to gather per entity\n"); + printf(" D: Max distance of gathered things of an entity\n"); + printf(" R: how often to loop through the K dimension with each team\n"); + printf(" U: how many independent flops to do per load\n"); + printf( + " F: how many times to repeat the U unrolled operations before " + "reading next element\n"); + printf("Example Input GPU:\n"); + printf(" Bandwidth Bound : 2 10000000 1 1 10 1 1\n"); + printf(" Cache Bound : 2 10000000 64 1 10 1 1\n"); + printf(" Cache Gather : 2 10000000 64 256 10 1 1\n"); + printf(" Global Gather : 2 100000000 16 100000000 1 1 1\n"); + printf(" Typical MD : 2 100000 32 512 1000 8 2\n"); + Kokkos::finalize(); + return 0; + } + + int S = std::stoi(argv[1]); + int N = std::stoi(argv[2]); + int K = std::stoi(argv[3]); + int D = std::stoi(argv[4]); + int R = std::stoi(argv[5]); + int U = std::stoi(argv[6]); + int F = std::stoi(argv[7]); + + if ((S != 1) && (S != 2) && (S != 4)) { + printf("S must be one of 1,2,4\n"); + return 0; + } + if (N < D) { + printf("N must be larger or equal to D\n"); + return 0; + } + if (S == 1) { + run_gather_test<float>(N, K, D, R, U, F); + } + if (S == 2) { + run_gather_test<double>(N, K, D, R, U, F); + } + if (S == 4) { + run_gather_test<Kokkos::complex<double> >(N, K, D, R, U, F); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos/benchmarks/gups/Makefile b/packages/kokkos/benchmarks/gups/Makefile new file mode 100644 index 
0000000000000000000000000000000000000000..2a90621d8ca20af96b991ff525641d06bf831ce5 --- /dev/null +++ b/packages/kokkos/benchmarks/gups/Makefile @@ -0,0 +1,51 @@ +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = gups.cuda +else +CXX = g++ +EXE = gups.exe +endif + +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o gups.cuda gups.exe + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/benchmarks/gups/gups-kokkos.cpp b/packages/kokkos/benchmarks/gups/gups-kokkos.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a3ad23800f8f20d7a66eaad4c509938fc23c778 --- /dev/null +++ b/packages/kokkos/benchmarks/gups/gups-kokkos.cpp @@ -0,0 +1,201 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// ************************************************************************ +//@HEADER +*/ + +#include "Kokkos_Core.hpp" +#include <cstdio> +#include <cstdlib> +#include <cmath> + +#include <sys/time.h> + +#define HLINE "-------------------------------------------------------------\n" + +#if defined(KOKKOS_ENABLE_CUDA) +using GUPSHostArray = Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror; +using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::CudaSpace>; +#else +using GUPSHostArray = Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror; +using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::HostSpace>; +#endif + +using GUPSIndex = int; + +double now() { + struct timeval now; + gettimeofday(&now, nullptr); + + return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6); +} + +void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices, + const int64_t dataCount) { + for (GUPSIndex i = 0; i < indices.extent(0); ++i) { + indices[i] = lrand48() % dataCount; + } + + Kokkos::deep_copy(dev_indices, indices); +} + +void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data, + const int64_t datum, const bool performAtomics) { + if (performAtomics) { + Kokkos::parallel_for( + "bench-gups-atomic", indices.extent(0), + KOKKOS_LAMBDA(const GUPSIndex i) { + Kokkos::atomic_fetch_xor(&data[indices[i]], datum); + }); + } else { + Kokkos::parallel_for( + "bench-gups-non-atomic", indices.extent(0), + KOKKOS_LAMBDA(const GUPSIndex i) { data[indices[i]] ^= datum; }); + } + + Kokkos::fence(); +} + +int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount, + const int repeats, const bool useAtomics) { + printf("Reports fastest timing per kernel\n"); + printf("Creating Views...\n"); + + printf("Memory Sizes:\n"); + printf("- Elements: %15" PRIu64 " (%12.4f MB)\n", + static_cast<uint64_t>(dataCount), + 1.0e-6 * ((double)dataCount * (double)sizeof(int64_t))); + printf("- Indices: %15" PRIu64 " (%12.4f MB)\n", + 
static_cast<uint64_t>(indicesCount), + 1.0e-6 * ((double)indicesCount * (double)sizeof(int64_t))); + printf(" - Atomics: %15s\n", (useAtomics ? "Yes" : "No")); + printf("Benchmark kernels will be performed for %d iterations.\n", repeats); + + printf(HLINE); + + GUPSDeviceArray dev_indices("indices", indicesCount); + GUPSDeviceArray dev_data("data", dataCount); + int64_t datum = -1; + + GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices); + GUPSHostArray data = Kokkos::create_mirror_view(dev_data); + + double gupsTime = 0.0; + + printf("Initializing Views...\n"); + +#if defined(KOKKOS_HAVE_OPENMP) + Kokkos::parallel_for( + "init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount), +#else + Kokkos::parallel_for( + "init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount), +#endif + KOKKOS_LAMBDA(const int i) { data[i] = 10101010101; }); + +#if defined(KOKKOS_HAVE_OPENMP) + Kokkos::parallel_for( + "init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount), +#else + Kokkos::parallel_for( + "init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount), +#endif + KOKKOS_LAMBDA(const int i) { indices[i] = 0; }); + + Kokkos::deep_copy(dev_data, data); + Kokkos::deep_copy(dev_indices, indices); + double start; + + printf("Starting benchmarking...\n"); + + for (GUPSIndex k = 0; k < repeats; ++k) { + randomize_indices(indices, dev_indices, data.extent(0)); + + start = now(); + run_gups(dev_indices, dev_data, datum, useAtomics); + gupsTime += now() - start; + } + + Kokkos::deep_copy(indices, dev_indices); + Kokkos::deep_copy(data, dev_data); + + printf(HLINE); + printf( + "GUP/s Random: %18.6f\n", + (1.0e-9 * ((double)repeats) * (double)dev_indices.extent(0)) / gupsTime); + printf(HLINE); + + return 0; +} + +int main(int argc, char* argv[]) { + printf(HLINE); + printf("Kokkos GUPS Benchmark\n"); + printf(HLINE); + + srand48(1010101); + + Kokkos::initialize(argc, argv); + + int64_t indices = 8192; + int64_t data = 33554432; + int64_t 
repeats = 10; + bool useAtomics = false; + + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "--indices") == 0) { + indices = std::atoll(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "--data") == 0) { + data = std::atoll(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "--repeats") == 0) { + repeats = std::atoll(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "--atomics") == 0) { + useAtomics = true; + } + } + + const int rc = run_benchmark(indices, data, repeats, useAtomics); + + Kokkos::finalize(); + + return rc; +} diff --git a/packages/kokkos/benchmarks/policy_performance/Makefile b/packages/kokkos/benchmarks/policy_performance/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f50aea720ef173483d0006c5c551317e08b8a03e --- /dev/null +++ b/packages/kokkos/benchmarks/policy_performance/Makefile @@ -0,0 +1,51 @@ +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = policy_perf.cuda +else +CXX = g++ +EXE = policy_perf.exe +endif + +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o policy_perf.cuda policy_perf.exe + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/benchmarks/policy_performance/main.cpp b/packages/kokkos/benchmarks/policy_performance/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..da49cdb019a86eeb193555fdc383df800b31bcfe --- /dev/null +++ b/packages/kokkos/benchmarks/policy_performance/main.cpp @@ -0,0 +1,219 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include "policy_perf_test.hpp" + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + + if (argc < 10) { + printf(" Ten arguments are needed to run this program:\n"); + printf( + " (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, " + "(5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, " + "(9)schedule, (10)test_type\n"); + printf(" team_range: number of teams (league_size)\n"); + printf(" thread_range: range for nested TeamThreadRange parallel_*\n"); + printf(" vector_range: range for nested ThreadVectorRange parallel_*\n"); + printf(" outer_repeat: number of repeats for outer parallel_* call\n"); + printf( + " thread_repeat: number of repeats for TeamThreadRange parallel_* " + "call\n"); + printf( + " vector_repeat: number of repeats for ThreadVectorRange parallel_* " + "call\n"); + printf(" team_size: number of team members (team_size)\n"); + printf(" vector_size: desired vectorization (if possible)\n"); + printf(" schedule: 1 == Static 2 == Dynamic\n"); + printf( + " test_type: 3-digit code XYZ for testing (nested) parallel_*\n"); + printf( + " code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in " + "{0,1,2}\n"); + printf(" TeamPolicy:\n"); + printf( + " X: 0 = none (never used, makes no sense); 1 = " + "parallel_for; 2 = parallel_reduce\n"); + printf( + " Y: 0 = none; 1 = parallel_for; 2 = " + "parallel_reduce\n"); + printf( + " Z: 0 = none; 1 = parallel_for; 2 = " + "parallel_reduce\n"); + printf(" RangePolicy:\n"); + printf( + " X: 3 = parallel_for; 4 = parallel_reduce; 5 = " + "parallel_scan\n"); + printf(" Y: 0 = none\n"); + printf(" Z: 0 = none\n"); + printf(" Example Input:\n"); + printf(" 100000 32 32 100 100 100 8 1 1 100\n"); + Kokkos::finalize(); + return 0; + } + + int team_range = std::stoi(argv[1]); + int thread_range = std::stoi(argv[2]); + int 
vector_range = std::stoi(argv[3]); + + int outer_repeat = std::stoi(argv[4]); + int thread_repeat = std::stoi(argv[5]); + int vector_repeat = std::stoi(argv[6]); + + int team_size = std::stoi(argv[7]); + int vector_size = std::stoi(argv[8]); + int schedule = std::stoi(argv[9]); + int test_type = std::stoi(argv[10]); + + int disable_verbose_output = 0; + if (argc > 11) { + disable_verbose_output = std::stoi(argv[11]); + } + + if (schedule != 1 && schedule != 2) { + printf("schedule: %d\n", schedule); + printf("Options for schedule are: 1 == Static 2 == Dynamic\n"); + Kokkos::finalize(); + return -1; + } + + if (test_type != 100 && test_type != 110 && test_type != 111 && + test_type != 112 && test_type != 120 && test_type != 121 && + test_type != 122 && test_type != 200 && test_type != 210 && + test_type != 211 && test_type != 212 && test_type != 220 && + test_type != 221 && test_type != 222 && test_type != 300 && + test_type != 400 && test_type != 500) { + printf("Incorrect test_type option\n"); + Kokkos::finalize(); + return -2; + } + + double result = 0.0; + + Kokkos::parallel_reduce( + "parallel_reduce warmup", Kokkos::TeamPolicy<>(10, 1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, + double& lval) { lval += 1; }, + result); + + using view_type_1d = Kokkos::View<double*, Kokkos::LayoutRight>; + using view_type_2d = Kokkos::View<double**, Kokkos::LayoutRight>; + using view_type_3d = Kokkos::View<double***, Kokkos::LayoutRight>; + + // Allocate view without initializing + // Call a 'warmup' test with 1 repeat - this will initialize the corresponding + // view appropriately for test and should obey first-touch etc Second call to + // test is the one we actually care about and time + view_type_1d v_1(Kokkos::view_alloc(Kokkos::WithoutInitializing, "v_1"), + team_range * team_size); + view_type_2d v_2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "v_2"), + team_range * team_size, thread_range); + view_type_3d 
v_3(Kokkos::view_alloc(Kokkos::WithoutInitializing, "v_3"), + team_range * team_size, thread_range, vector_range); + + double result_computed = 0.0; + double result_expect = 0.0; + double time = 0.0; + + if (schedule == 1) { + if (test_type != 500) { + // warmup - no repeat of loops + test_policy<Kokkos::Schedule<Kokkos::Static>, int>( + team_range, thread_range, vector_range, 1, 1, 1, team_size, + vector_size, test_type, v_1, v_2, v_3, result_computed, result_expect, + time); + test_policy<Kokkos::Schedule<Kokkos::Static>, int>( + team_range, thread_range, vector_range, outer_repeat, thread_repeat, + vector_repeat, team_size, vector_size, test_type, v_1, v_2, v_3, + result_computed, result_expect, time); + } else { + // parallel_scan: initialize 1d view for parallel_scan + test_policy<Kokkos::Schedule<Kokkos::Static>, int>( + team_range, thread_range, vector_range, 1, 1, 1, team_size, + vector_size, 100, v_1, v_2, v_3, result_computed, result_expect, + time); + test_policy<Kokkos::Schedule<Kokkos::Static>, int>( + team_range, thread_range, vector_range, outer_repeat, thread_repeat, + vector_repeat, team_size, vector_size, test_type, v_1, v_2, v_3, + result_computed, result_expect, time); + } + } + if (schedule == 2) { + if (test_type != 500) { + // warmup - no repeat of loops + test_policy<Kokkos::Schedule<Kokkos::Dynamic>, int>( + team_range, thread_range, vector_range, 1, 1, 1, team_size, + vector_size, test_type, v_1, v_2, v_3, result_computed, result_expect, + time); + test_policy<Kokkos::Schedule<Kokkos::Dynamic>, int>( + team_range, thread_range, vector_range, outer_repeat, thread_repeat, + vector_repeat, team_size, vector_size, test_type, v_1, v_2, v_3, + result_computed, result_expect, time); + } else { + // parallel_scan: initialize 1d view for parallel_scan + test_policy<Kokkos::Schedule<Kokkos::Static>, int>( + team_range, thread_range, vector_range, 1, 1, 1, team_size, + vector_size, 100, v_1, v_2, v_3, result_computed, result_expect, + time); + 
test_policy<Kokkos::Schedule<Kokkos::Static>, int>( + team_range, thread_range, vector_range, outer_repeat, thread_repeat, + vector_repeat, team_size, vector_size, test_type, v_1, v_2, v_3, + result_computed, result_expect, time); + } + } + + if (disable_verbose_output == 0) { + printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i %3i %e %e %lf\n", team_range, + thread_range, vector_range, outer_repeat, thread_repeat, + vector_repeat, team_size, vector_size, schedule, test_type, + result_computed, result_expect, time); + } else { + printf("%lf\n", time); + } + + Kokkos::finalize(); + + return 0; +} diff --git a/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp b/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8e6cd7447dbbdde87911d88cf4b0d0c6f9ca3a75 --- /dev/null +++ b/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp @@ -0,0 +1,435 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +template <class ViewType> +struct ParallelScanFunctor { + using value_type = double; + ViewType v; + + ParallelScanFunctor(const ViewType& v_) : v(v_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int idx, value_type& val, const bool& final) const { + // inclusive scan + val += v(idx); + if (final) { + v(idx) = val; + } + } +}; + +template <class ScheduleType, class IndexType, class ViewType1, class ViewType2, + class ViewType3> +void test_policy(int team_range, int thread_range, int vector_range, + int outer_repeat, int thread_repeat, int inner_repeat, + int team_size, int vector_size, int test_type, ViewType1& v1, + ViewType2& v2, ViewType3& v3, double& result, + double& result_expect, double& time) { + using t_policy = Kokkos::TeamPolicy<ScheduleType, IndexType>; + using t_team = typename t_policy::member_type; + Kokkos::Timer 
timer; + + for (int orep = 0; orep < outer_repeat; orep++) { + if (test_type == 100) { + Kokkos::parallel_for( + "100 outer for", t_policy(team_range, team_size), + KOKKOS_LAMBDA(const t_team& team) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + v1(idx) = idx; + // prevent compiler optimizing loop away + }); + } + + if (test_type == 110) { + Kokkos::parallel_for( + "110 outer for", t_policy(team_range, team_size), + KOKKOS_LAMBDA(const t_team& team) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + for (int tr = 0; tr < thread_repeat; ++tr) { + // Each team launches a parallel_for; thread_range is partitioned + // among team members + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, thread_range), + [&](const int t) { + v2(idx, t) = t; + // prevent compiler optimizing loop away + }); + } + }); + } + if (test_type == 111) { + Kokkos::parallel_for( + "111 outer for", t_policy(team_range, team_size, vector_size), + KOKKOS_LAMBDA(const t_team& team) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + for (int tr = 0; tr < thread_repeat; ++tr) { + // Each team launches a parallel_for; thread_range is partitioned + // among team members + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, thread_range), + [&](const int t) { + for (int vr = 0; vr < inner_repeat; ++vr) + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, vector_range), + [&](const int vi) { + v3(idx, t, vi) = vi; + // prevent compiler optimizing loop away + }); + }); + } + }); + } + if (test_type == 112) { + Kokkos::parallel_for( + "112 outer for", t_policy(team_range, team_size, vector_size), + KOKKOS_LAMBDA(const t_team& team) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + for (int tr = 0; tr < thread_repeat; ++tr) { + // Each team launches a parallel_for; thread_range is partitioned + // among team members + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, thread_range), + [&](const int 
t) { + double vector_result = 0.0; + for (int vr = 0; vr < inner_repeat; ++vr) { + vector_result = 0.0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, vector_range), + [&](const int vi, double& vval) { vval += 1; }, + vector_result); + } + v2(idx, t) = vector_result; + // prevent compiler optimizing loop away + }); + } + }); + } + if (test_type == 120) { + Kokkos::parallel_for( + "120 outer for", t_policy(team_range, team_size), + KOKKOS_LAMBDA(const t_team& team) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + double team_result = 0.0; + for (int tr = 0; tr < thread_repeat; ++tr) { + team_result = 0.0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, thread_range), + [&](const int t, double& lval) { lval += 1; }, team_result); + } + v1(idx) = team_result; + // prevent compiler optimizing loop away + }); + } + if (test_type == 121) { + Kokkos::parallel_for( + "121 outer for", t_policy(team_range, team_size, vector_size), + KOKKOS_LAMBDA(const t_team& team) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + double team_result = 0.0; + for (int tr = 0; tr < thread_repeat; ++tr) { + team_result = 0.0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, thread_range), + [&](const int t, double& lval) { + lval += 1; + for (int vr = 0; vr < inner_repeat; ++vr) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, vector_range), + [&](const int vi) { + v3(idx, t, vi) = vi; + // prevent compiler optimizing loop away + }); + } + }, + team_result); + } + v3(idx, 0, 0) = team_result; + // prevent compiler optimizing loop away + }); + } + if (test_type == 122) { + Kokkos::parallel_for( + "122 outer for", t_policy(team_range, team_size, vector_size), + KOKKOS_LAMBDA(const t_team& team) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + double team_result = 0.0; + for (int tr = 0; tr < thread_repeat; ++tr) { + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 
thread_range), + [&](const int t, double& lval) { + double vector_result = 0.0; + for (int vr = 0; vr < inner_repeat; ++vr) { + vector_result = 0.0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, vector_range), + [&](const int vi, double& vval) { vval += 1; }, + vector_result); + lval += vector_result; + } + }, + team_result); + } + v1(idx) = team_result; + // prevent compiler optimizing loop away + }); + } + if (test_type == 200) { + Kokkos::parallel_reduce( + "200 outer reduce", t_policy(team_range, team_size), + KOKKOS_LAMBDA(const t_team& team, double& lval) { + lval += team.team_size() * team.league_rank() + team.team_rank(); + }, + result); + result_expect = + 0.5 * (team_range * team_size) * (team_range * team_size - 1); + // sum ( seq( [0, team_range*team_size) ) + } + if (test_type == 210) { + Kokkos::parallel_reduce( + "210 outer reduce", t_policy(team_range, team_size), + KOKKOS_LAMBDA(const t_team& team, double& lval) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + double thread_for = 1.0; + for (int tr = 0; tr < thread_repeat; tr++) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, thread_range), + [&](const int t) { + v2(idx, t) = t; + // prevent compiler optimizing loop away + }); + } + lval += (team.team_size() * team.league_rank() + team.team_rank() + + thread_for); + }, + result); + result_expect = + 0.5 * (team_range * team_size) * (team_range * team_size - 1) + + (team_range * team_size); + // sum ( seq( [0, team_range*team_size) + 1 per team_member (total of + // team_range*team_size) ) + } + if (test_type == 211) { + Kokkos::parallel_reduce( + "211 outer reduce", t_policy(team_range, team_size, vector_size), + KOKKOS_LAMBDA(const t_team& team, double& lval) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + double thread_for = 1.0; + for (int tr = 0; tr < thread_repeat; tr++) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, thread_range), + [&](const int t) { + for 
(int vr = 0; vr < inner_repeat; ++vr) + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, vector_range), + [&](const int vi) { + v3(idx, t, vi) = vi; + // prevent compiler optimizing loop away + }); + }); + } + lval += idx + thread_for; + }, + result); + result_expect = + 0.5 * (team_range * team_size) * (team_range * team_size - 1) + + (team_range * team_size); + // sum ( seq( [0, team_range*team_size) + 1 per team_member (total of + // team_range*team_size) ) + } + if (test_type == 212) { + Kokkos::parallel_reduce( + "212 outer reduce", t_policy(team_range, team_size, vector_size), + KOKKOS_LAMBDA(const t_team& team, double& lval) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + double vector_result = 0.0; + for (int tr = 0; tr < thread_repeat; tr++) { + // This parallel_for is executed by each team; the thread_range is + // partitioned among the team members + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, thread_range), + [&](const int t) { + v2(idx, t) = t; + // prevent compiler optimizing loop away + for (int vr = 0; vr < inner_repeat; ++vr) { + vector_result = 0.0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, vector_range), + [&](const int vi, double& vval) { vval += vi; }, + vector_result); + } + }); + } + lval += idx + vector_result; + }, + result); + result_expect = + 0.5 * (team_range * team_size) * (team_range * team_size - 1) + + (0.5 * vector_range * (vector_range - 1) * team_range * team_size); + // sum ( seq( [0, team_range*team_size) + sum( seq( [0, vector_range) ) + // per team_member (total of team_range*team_size) ) + } + if (test_type == 220) { + Kokkos::parallel_reduce( + "220 outer reduce", t_policy(team_range, team_size), + KOKKOS_LAMBDA(const t_team& team, double& lval) { + double team_result = 0.0; + for (int tr = 0; tr < thread_repeat; tr++) { + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, thread_range), + [&](const int t, double& tval) { tval += t; }, team_result); + } + 
lval += team_result * team.league_rank(); // constant * league_rank + }, + result); + result_expect = 0.5 * (team_range) * (team_range - 1) * team_size * 0.5 * + (thread_range) * (thread_range - 1); + // sum ( seq( [0, team_range) * constant ); constant = sum( seq( [0, + // thread_range) )*team_size (1 per member, result for each team) + } + if (test_type == 221) { + Kokkos::parallel_reduce( + "221 outer reduce", t_policy(team_range, team_size, vector_size), + KOKKOS_LAMBDA(const t_team& team, double& lval) { + long idx = team.league_rank() * team.team_size() + team.team_rank(); + double team_result = 0; + for (int tr = 0; tr < thread_repeat; tr++) { + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, thread_range), + [&](const int t, double& tval) { + double vector_for = 1.0; + for (int vr = 0; vr < inner_repeat; ++vr) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, vector_range), + [&](const int vi) { + v3(idx, t, vi) = vi; + // prevent compiler optimizing loop away + }); + } + tval += t + vector_for; + }, + team_result); + } + lval += team_result * team.league_rank(); + }, + result); + result_expect = + 0.5 * (team_range) * (team_range - 1) * team_size * + (0.5 * (thread_range) * (thread_range - 1) + thread_range); + // sum ( seq( [0, team_range) * constant ) + 1 per member per team; + // constant = sum( seq( [0, thread_range) )*team_size (1 per member, + // result for each team) + } + if (test_type == 222) { + Kokkos::parallel_reduce( + "222 outer reduce", t_policy(team_range, team_size, vector_size), + KOKKOS_LAMBDA(const t_team& team, double& lval) { + double team_result = 0.0; + for (int tr = 0; tr < thread_repeat; tr++) { + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, thread_range), + [&](const int t, double& tval) { + double vector_result = 0.0; + for (int vr = 0; vr < inner_repeat; ++vr) { + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, vector_range), + [&](const int vi, double& vval) { vval += vi; }, + 
vector_result); + } + tval += t + vector_result; + }, + team_result); + } + lval += team_result * team.league_rank(); + }, + result); + result_expect = + 0.5 * (team_range) * (team_range - 1) * team_size * + (0.5 * (thread_range) * (thread_range - 1) + + thread_range * 0.5 * (vector_range) * (vector_range - 1)); + // sum ( seq( [0, team_range) * constant ) + 1 + sum( seq([0,vector_range) + // ) per member per team; constant = sum( seq( [0, thread_range) + // )*team_size (1 per member, result for each team) + } + + // parallel_for RangePolicy: range = team_size*team_range + if (test_type == 300) { + Kokkos::parallel_for( + "300 outer for", team_size * team_range, + KOKKOS_LAMBDA(const int idx) { + v1(idx) = idx; + // prevent compiler from optimizing away the loop + }); + } + // parallel_reduce RangePolicy: range = team_size*team_range + if (test_type == 400) { + Kokkos::parallel_reduce( + "400 outer reduce", team_size * team_range, + KOKKOS_LAMBDA(const int idx, double& val) { val += idx; }, result); + result_expect = + 0.5 * (team_size * team_range) * (team_size * team_range - 1); + } + // parallel_scan RangePolicy: range = team_size*team_range + if (test_type == 500) { + Kokkos::parallel_scan("500 outer scan", team_size * team_range, + ParallelScanFunctor<ViewType1>(v1) +#if 0 + // This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation + KOKKOS_LAMBDA (const int idx, double& val, const bool& final) { + // inclusive scan + val += v1(idx); + if ( final ) { + v1(idx) = val; + } + } +#endif + ); + // result = v1( team_size*team_range - 1 ); // won't work with Cuda - need + // to copy result back to host to print result_expect = + // 0.5*(team_size*team_range)*(team_size*team_range-1); + } + + } // end outer for loop + + time = timer.seconds(); +} // end test_policy diff --git a/packages/kokkos/benchmarks/policy_performance/script_basic_testing.sh b/packages/kokkos/benchmarks/policy_performance/script_basic_testing.sh new file mode 100755 index 
0000000000000000000000000000000000000000..e621fffbd435bcbdedfc3244250a0330f83bb928 --- /dev/null +++ b/packages/kokkos/benchmarks/policy_performance/script_basic_testing.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Script to check policy_perf_test code works with each possible combo of options + +echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies" + +EXECUTABLE=policy_performance + +TEAMRANGE=1000 +THREADRANGE=4 +VECTORRANGE=32 +TEAMSIZE=4 +VECTORSIZE=1 +OREPEAT=1 +MREPEAT=1 +IREPEAT=1 +SCHEDULE=1 + +SUFFIX=host +if [ -e $EXECUTABLE.$SUFFIX ] +then +SCHEDULE=1 +echo "Host tests Static schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done + +SCHEDULE=2 +echo "Host tests Dynamic schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done +fi + +SUFFIX=cuda +if [ -e $EXECUTABLE.$SUFFIX ] +then +SCHEDULE=1 +echo "Cuda tests Static schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done + +SCHEDULE=2 +echo "Cuda tests Dynamic schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done +fi diff --git a/packages/kokkos/benchmarks/policy_performance/script_sample_usage.sh b/packages/kokkos/benchmarks/policy_performance/script_sample_usage.sh new file mode 100755 index 
0000000000000000000000000000000000000000..1c2db56648ccfa74e9ab95c2bf381434f0132e7b --- /dev/null +++ b/packages/kokkos/benchmarks/policy_performance/script_sample_usage.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +# Sample script for benchmarking policy performance + +# Suggested environment variables to export prior to executing script: +# KNL: +# OMP_NUM_THREADS=256 KMP_AFFINITY=compact +# Power: +# OMP_NUM_THREADS=64 OMP_PROC_BIND=true + +# Constants and Variables: +# Vary: TEAMSIZE, and THREADRANGE +# for TEAMSIZE in {1,2,4,5,8}; do +# for THREADRANGE in {32,41,1000}; do +# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE +# System specific: Adjust REPEAT values to architecture tests are run on + +# Tests +# Static SCHEDULE = 1 +# Tier 1: parallel_for + RangePolicy 300 +# Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500 +# Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY +# Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY +# Dynamic SCHEDULE = 2 +# Tier 5: parallel_for + RangePolicy 300 +# Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500 +# Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY +# Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY + +# Results grouped by: +# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE + +EXECUTABLE=policy_performance + +# Default defined values +TEAMRANGE=1000 +THREADRANGE=1 +VECTORRANGE=32 +TEAMSIZE=1 +VECTORSIZE=1 +OREPEAT=1 +MREPEAT=1 +IREPEAT=1 +SCHEDULE=1 + +# Host tests +SUFFIX=host +if [ -e $EXECUTABLE.$SUFFIX ]; then +echo "Host" + +for SCHEDULE in {1,2}; do + +# Tier 1 and 2, 5 and 6 +for CODE in {300,400,500}; do + for TEAMSIZE in {1,2,4,5,8}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done +done + +# Tier 3, 7 +for CODE in {100,110,111,112,120,121,122}; do + for TEAMSIZE in {1,2,4,5,8}; do + for 
THREADRANGE in {32,41,1000}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +# Tier 4, 8 +for CODE in {200,210,211,212,220,221,222}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +done # end SCHEDULE + +fi # end host + + +# Cuda tests +SUFFIX=cuda +# TEAMRANGE=10000, TEAMSIZE=8 too large +# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large +if [ -e $EXECUTABLE.$SUFFIX ]; then +echo "Cuda" + +for SCHEDULE in {1,2}; do + +# Reset defaults +TEAMRANGE=1000 +THREADRANGE=1 +VECTORRANGE=32 +TEAMSIZE=1 +VECTORSIZE=1 + +# Tier 1 and 2, 5 and 6 +for CODE in {300,400,500}; do + for TEAMSIZE in {1,2,4,5,8}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done +done + +# Tier 3, 7 +for CODE in {100,110,111,112,120,121,122}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +# Tier 4, 8 +for CODE in {200,210,211,212,220,221,222}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +done # end SCHEDULE + +fi #end cuda diff --git a/packages/kokkos/benchmarks/stream/Makefile b/packages/kokkos/benchmarks/stream/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..47a13838a47d7fbdb67e29aab26c89fb53318a33 --- /dev/null +++ b/packages/kokkos/benchmarks/stream/Makefile @@ -0,0 +1,51 @@ +KOKKOS_DEVICES=Cuda 
+KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Volta70" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = stream.cuda +else +CXX = g++ +EXE = stream.exe +endif + +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o stream.cuda stream.exe + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/benchmarks/stream/stream-kokkos.cpp b/packages/kokkos/benchmarks/stream/stream-kokkos.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e7ef67e0805c9c4424fbf1781423cf907dca3eec --- /dev/null +++ b/packages/kokkos/benchmarks/stream/stream-kokkos.cpp @@ -0,0 +1,271 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// ************************************************************************ +//@HEADER +*/ + +#include "Kokkos_Core.hpp" +#include <cstdio> +#include <cstdlib> +#include <cmath> + +#include <sys/time.h> + +#define STREAM_ARRAY_SIZE 100000000 +#define STREAM_NTIMES 20 + +#define HLINE "-------------------------------------------------------------\n" + +#if defined(KOKKOS_ENABLE_CUDA) +using StreamHostArray = Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror; +using StreamDeviceArray = Kokkos::View<double*, Kokkos::CudaSpace>; +#else +using StreamHostArray = Kokkos::View<double*, Kokkos::HostSpace>::HostMirror; +using StreamDeviceArray = Kokkos::View<double*, Kokkos::HostSpace>; +#endif + +using StreamIndex = int; + +double now() { + struct timeval now; + gettimeofday(&now, nullptr); + + return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6); +} + +void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b, + StreamDeviceArray& c) { + Kokkos::parallel_for( + "copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; }); + + Kokkos::fence(); +} + +void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, + StreamDeviceArray& c, const double scalar) { + Kokkos::parallel_for( + "copy", a.extent(0), + KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; }); + + Kokkos::fence(); +} + +void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, + StreamDeviceArray& c) { + Kokkos::parallel_for( + "add", a.extent(0), + KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; }); + + Kokkos::fence(); +} + +void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, + StreamDeviceArray& c, const double scalar) { + Kokkos::parallel_for( + "triad", a.extent(0), + KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; }); + + Kokkos::fence(); +} + +int perform_validation(StreamHostArray& a, StreamHostArray& b, + StreamHostArray& c, const StreamIndex arraySize, + const double scalar) { + double ai = 1.0; + double bi = 2.0; + 
double ci = 0.0; + + for (StreamIndex i = 0; i < arraySize; ++i) { + ci = ai; + bi = scalar * ci; + ci = ai + bi; + ai = bi + scalar * ci; + }; + + double aError = 0.0; + double bError = 0.0; + double cError = 0.0; + + for (StreamIndex i = 0; i < arraySize; ++i) { + aError = std::abs(a[i] - ai); + bError = std::abs(b[i] - bi); + cError = std::abs(c[i] - ci); + } + + double aAvgError = aError / (double)arraySize; + double bAvgError = bError / (double)arraySize; + double cAvgError = cError / (double)arraySize; + + const double epsilon = 1.0e-13; + int errorCount = 0; + + if (std::abs(aAvgError / ai) > epsilon) { + fprintf(stderr, "Error: validation check on View a failed.\n"); + errorCount++; + } + + if (std::abs(bAvgError / bi) > epsilon) { + fprintf(stderr, "Error: validation check on View b failed.\n"); + errorCount++; + } + + if (std::abs(cAvgError / ci) > epsilon) { + fprintf(stderr, "Error: validation check on View c failed.\n"); + errorCount++; + } + + if (errorCount == 0) { + printf("All solutions checked and verified.\n"); + } + + return errorCount; +} + +int run_benchmark() { + printf("Reports fastest timing per kernel\n"); + printf("Creating Views...\n"); + + printf("Memory Sizes:\n"); + printf("- Array Size: %" PRIu64 "\n", + static_cast<uint64_t>(STREAM_ARRAY_SIZE)); + printf("- Per Array: %12.2f MB\n", + 1.0e-6 * (double)STREAM_ARRAY_SIZE * (double)sizeof(double)); + printf("- Total: %12.2f MB\n", + 3.0e-6 * (double)STREAM_ARRAY_SIZE * (double)sizeof(double)); + + printf("Benchmark kernels will be performed for %d iterations.\n", + STREAM_NTIMES); + + printf(HLINE); + + StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE); + StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE); + StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE); + + StreamHostArray a = Kokkos::create_mirror_view(dev_a); + StreamHostArray b = Kokkos::create_mirror_view(dev_b); + StreamHostArray c = Kokkos::create_mirror_view(dev_c); + + const double scalar = 3.0; + + double copyTime = 
std::numeric_limits<double>::max(); + double scaleTime = std::numeric_limits<double>::max(); + double addTime = std::numeric_limits<double>::max(); + double triadTime = std::numeric_limits<double>::max(); + + printf("Initializing Views...\n"); + +#if defined(KOKKOS_HAVE_OPENMP) + Kokkos::parallel_for( + "init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE), +#else + Kokkos::parallel_for( + "init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE), +#endif + KOKKOS_LAMBDA(const int i) { + a[i] = 1.0; + b[i] = 2.0; + c[i] = 0.0; + }); + + // Copy contents of a (from the host) to the dev_a (device) + Kokkos::deep_copy(dev_a, a); + Kokkos::deep_copy(dev_b, b); + Kokkos::deep_copy(dev_c, c); + + double start; + + printf("Starting benchmarking...\n"); + + for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) { + start = now(); + perform_copy(dev_a, dev_b, dev_c); + copyTime = std::min(copyTime, (now() - start)); + + start = now(); + perform_scale(dev_a, dev_b, dev_c, scalar); + scaleTime = std::min(scaleTime, (now() - start)); + + start = now(); + perform_add(dev_a, dev_b, dev_c); + addTime = std::min(addTime, (now() - start)); + + start = now(); + perform_triad(dev_a, dev_b, dev_c, scalar); + triadTime = std::min(triadTime, (now() - start)); + } + + Kokkos::deep_copy(a, dev_a); + Kokkos::deep_copy(b, dev_b); + Kokkos::deep_copy(c, dev_c); + + printf("Performing validation...\n"); + int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar); + + printf(HLINE); + + printf("Copy %11.2f MB/s\n", + (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) / + copyTime); + printf("Scale %11.2f MB/s\n", + (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) / + scaleTime); + printf("Add %11.2f MB/s\n", + (1.0e-06 * 3.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) / + addTime); + printf("Triad %11.2f MB/s\n", + (1.0e-06 * 3.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) / + triadTime); + + printf(HLINE); + + return 
rc; +} + +int main(int argc, char* argv[]) { + printf(HLINE); + printf("Kokkos STREAM Benchmark\n"); + printf(HLINE); + + Kokkos::initialize(argc, argv); + const int rc = run_benchmark(); + Kokkos::finalize(); + + return rc; +} diff --git a/packages/kokkos/bin/hpcbind b/packages/kokkos/bin/hpcbind new file mode 100755 index 0000000000000000000000000000000000000000..6af091a7d8b60766cddae67c6076b5df1f8ad12f --- /dev/null +++ b/packages/kokkos/bin/hpcbind @@ -0,0 +1,648 @@ +#!/usr/bin/env bash + +################################################################################ +# Check if hwloc commands exist +################################################################################ +declare -i HPCBIND_HAS_HWLOC=1 +type hwloc-bind >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-distrib >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-ls >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-calc >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-ps >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! 
$?)) + +if [[ ${HPCBIND_HAS_HWLOC} -eq 0 ]]; then + echo "hwloc not found, no process binding will occur" +fi + +# Get parent cpuset +HPCBIND_HWLOC_PARENT_CPUSET="" +if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + HPCBIND_HWLOC_VERSION="$(hwloc-ls --version | cut -d ' ' -f 2)" + MY_PID="$BASHPID" + HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)" +fi + +################################################################################ +# Check if nvidia-smi exist +################################################################################ +declare -i HPCBIND_HAS_NVIDIA=0 +type nvidia-smi >/dev/null 2>&1 +HPCBIND_HAS_NVIDIA=$((!$?)) + + +################################################################################ +# Get visible gpu +################################################################################ +declare -i NUM_GPUS=0 +HPCBIND_VISIBLE_GPUS="" +if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then + NUM_GPUS=$(nvidia-smi -L | wc -l); + HPCBIND_HAS_NVIDIA=$((!$?)) + if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then + GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )" + HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}} + fi +fi + +declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) + + +################################################################################ +# Get queue id +# supports sbatch, bsub, aprun +################################################################################ +HPCBIND_QUEUE_NAME="" +declare -i HPCBIND_QUEUE_RANK=0 +declare -i HPCBIND_QUEUE_SIZE=0 +declare -i HPCBIND_QUEUE_MAPPING=0 + +if [[ ! -z "${PMI_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mpich" + HPCBIND_QUEUE_RANK=${PMI_RANK} + HPCBIND_QUEUE_SIZE=${PMI_SIZE} +elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="openmpi" + HPCBIND_QUEUE_RANK=${OMPI_COMM_WORLD_RANK} + HPCBIND_QUEUE_SIZE=${OMPI_COMM_WORLD_SIZE} +elif [[ ! 
-z "${MV2_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mvapich2" + HPCBIND_QUEUE_RANK=${MV2_COMM_WORLD_RANK} + HPCBIND_QUEUE_SIZE=${MV2_COMM_WORLD_SIZE} +elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="slurm" + HPCBIND_QUEUE_RANK=${SLURM_PROCID} + HPCBIND_QUEUE_SIZE=${SLURM_NPROCS} +elif [[ ! -z "${ALPS_APP_PE}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="aprun" + HPCBIND_QUEUE_RANK=${ALPS_APP_PE} +elif [[ ! -z "${LBS_JOBINDEX}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="bsub" + HPCBIND_QUEUE_RANK=${LBS_JOBINDEX} +fi + +################################################################################ +# Show help +################################################################################ +function show_help { + local cmd=$(basename "$0") + echo "Usage: ${cmd} <options> -- command ..." + echo " Set the process mask, OMP environment variables and CUDA environment" + echo " variables to sane values if possible. Uses hwloc and nvidia-smi if" + echo " available. Will preserve the current process binding, so it is safe" + echo " to use with a queuing system or mpiexec." 
+ echo "" + echo "Options:" + echo " --no-hwloc-bind Disable binding" + echo " --proc-bind=<LOC> Set the initial process mask for the script" + echo " LOC can be any valid location argument for" + echo " hwloc-calc Default: all" + echo " --whole-system ${cmd} will ignore the its parent process binding" + echo " --distribute=N Distribute the current cpuset into N partitions" + echo " --distribute-partition=I" + echo " Use the i'th partition (zero based)" + echo " --visible-gpus=<L> Comma separated list of gpu ids" + echo " Default: CUDA_VISIBLE_DEVICES or all gpus in" + echo " sequential order" + echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition" + echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" + echo " --openmp=M.m Set env variables for the given OpenMP version" + echo " Default: 4.0" + echo " --openmp-ratio=N/D Ratio of the cpuset to use for OpenMP" + echo " Default: 1" + echo " --openmp-places=<Op> Op=threads|cores|sockets. Default: threads" + echo " --openmp-num-threads=N" + echo " Override logic for selecting OMP_NUM_THREADS" + echo " --openmp-proc-bind=<OP>" + echo " Override logic for selecting OMP_PROC_BIND" + echo " --openmp-nested Set OMP_NESTED to true" + echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" + echo " --output-prefix=<P> Save the output to files of the form" + echo " P.hpcbind.N, P.stdout.N and P.stderr.N where P is " + echo " the prefix and N is the rank (no spaces)" + echo " --output-mode=<Op> How console output should be handled." + echo " Options are all, rank0, and none. 
Default: rank0" + echo " --lstopo Show bindings in lstopo" + echo " --save-topology=<Xml> Save the topology to the given xml file" + echo " --load-topology=<Xml> Load a previously saved topology from an xml file" + echo " -v|--verbose Print bindings and relevant environment variables" + echo " -h|--help Show this message" + echo "" + echo "Sample Usage:" + echo "" + echo " Split the current process cpuset into 4 and use the 3rd partition" + echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..." + echo "" + echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus" + echo " and save the output to rank specific files" + echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\" + echo " --distribute=4 -v --output-prefix=output -- command ..." + echo "" + echo " Bind the process to all even cores" + echo " ${cmd} --proc-bind=core:even -v -- command ..." + echo "" + echo " Bind the the even cores of socket 0 and the odd cores of socket 1" + echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..." + echo "" + echo " Skip GPU 0 when mapping visible devices" + echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..." 
+ echo "" + echo " Display the current bindings" + echo " ${cmd} --proc-bind=numa:0 -- command" + echo "" + echo " Display the current bindings using lstopo" + echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo" + echo "" +} + + +################################################################################ +# Parse command line arguments +################################################################################ +# Show help if no command line arguments given +if [[ "$#" -eq 0 ]]; then + show_help + exit 0 +fi + +declare -a UNKNOWN_ARGS=() +declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC} +declare -i HPCBIND_DISTRIBUTE=1 +declare -i HPCBIND_PARTITION=-1 +HPCBIND_PROC_BIND="all" +HPCBIND_OPENMP_VERSION=4.0 +declare -i HPCBIND_OPENMP_RATIO_NUMERATOR=1 +declare -i HPCBIND_OPENMP_RATIO_DENOMINATOR=1 +HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads} +declare -i HPCBIND_OPENMP_PROC_BIND=1 +HPCBIND_OPENMP_FORCE_NUM_THREADS="" +HPCBIND_OPENMP_FORCE_PROC_BIND="" +declare -i HPCBIND_OPENMP_NESTED=0 +declare -i HPCBIND_VERBOSE=0 + +declare -i HPCBIND_LSTOPO=0 + +HPCBIND_OUTPUT_PREFIX="" +HPCBIND_OUTPUT_MODE="rank0" + +HPCBIND_OUTPUT_TOPOLOGY="" +HPCBIND_INPUT_TOPOLOGY="" + +declare -i HPCBIND_HAS_COMMAND=0 + +for i in "$@"; do + case "$i" in + # number of partitions to create + --no-hwloc-bind) + HPCBIND_ENABLE_HWLOC_BIND=0 + shift + ;; + --proc-bind=*) + HPCBIND_PROC_BIND="${i#*=}" + shift + ;; + --whole-system) + HPCBIND_HWLOC_PARENT_CPUSET="" + shift + ;; + --distribute=*) + HPCBIND_DISTRIBUTE="${i#*=}" + if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then + HPCBIND_DISTRIBUTE=1 + fi + shift + ;; + # which partition to use + --distribute-partition=*) + HPCBIND_PARTITION="${i#*=}" + shift + ;; + --visible-gpus=*) + HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ') + shift + ;; + --ignore-queue) + HPCBIND_QUEUE_MAPPING=0 + shift + ;; + --no-gpu-mapping) + HPCBIND_ENABLE_GPU_MAPPING=0 + shift + ;; + --openmp=*) + HPCBIND_OPENMP_VERSION="${i#*=}" + shift + ;; + 
--openmp-ratio=*) + IFS=/ read HPCBIND_OPENMP_RATIO_NUMERATOR HPCBIND_OPENMP_RATIO_DENOMINATOR <<< "${i#*=}" + if [[ ${HPCBIND_OPENMP_RATIO_NUMERATOR} -le 0 ]]; then + HPCBIND_OPENMP_RATIO_NUMERATOR=1 + fi + if [[ ${HPCBIND_OPENMP_RATIO_DENOMINATOR} -le 0 ]]; then + HPCBIND_OPENMP_RATIO_DENOMINATOR=1 + fi + if [[ ${HPCBIND_OPENMP_RATIO_NUMERATOR} -gt ${HPCBIND_OPENMP_RATIO_DENOMINATOR} ]]; then + HPCBIND_OPENMP_RATIO_NUMERATOR=1 + HPCBIND_OPENMP_RATIO_DENOMINATOR=1 + fi + shift + ;; + --openmp-places=*) + HPCBIND_OPENMP_PLACES="${i#*=}" + shift + ;; + --no-openmp-proc-bind) + HPCBIND_OPENMP_PROC_BIND=0 + shift + ;; + --force-openmp-proc-bind=*) + HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}" + shift + ;; + --force-openmp-num-threads=*) + HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}" + shift + ;; + --no-openmp-nested) + HPCBIND_OPENMP_NESTED=0 + shift + ;; + --openmp-nested) + HPCBIND_OPENMP_NESTED=1 + shift + ;; + --output-prefix=*) + HPCBIND_OUTPUT_PREFIX="${i#*=}" + shift + ;; + --save-topology=*) + HPCBIND_OUTPUT_TOPOLOGY="${i#*=}" + shift + ;; + --load-topology=*) + HPCBIND_INPUT_TOPOLOGY="${i#*=}" + shift + ;; + --output-mode=*) + HPCBIND_OUTPUT_MODE="${i#*=}" + #convert to lower case + HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}" + shift + ;; + --lstopo) + HPCBIND_VERBOSE=1 + HPCBIND_LSTOPO=1 + shift + ;; + -v|--verbose) + HPCBIND_VERBOSE=1 + shift + ;; + -h|--help) + show_help + exit 0 + ;; + # ignore remaining arguments + --) + HPCBIND_HAS_COMMAND=1 + shift + break + ;; + # unknown option + *) + UNKNOWN_ARGS+=("$i") + shift + ;; + esac +done + +################################################################################ +# Check output mode +################################################################################ +declare -i HPCBIND_TEE=0 + +if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then + HPCBIND_TEE=0 +elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then + HPCBIND_TEE=1 +elif [[ ${HPCBIND_QUEUE_RANK} -eq 0 ]]; then + #default to rank0 printing to 
screen + HPCBIND_TEE=1 +fi + +# Save the topology to the given xml file +if [[ "${HPCBIND_OUTPUT_TOPOLOGY}" != "" ]]; then + if [[ ${HPCBIND_QUEUE_RANK} -eq 0 ]]; then + lstopo-no-graphics "${HPCBIND_OUTPUT_TOPOLOGY}" + else + lstopo-no-graphics >/dev/null 2>&1 + fi +fi + +# Load the topology from the given xml file +if [[ "${HPCBIND_INPUT_TOPOLOGY}" != "" ]]; then + if [ -f ${HPCBIND_INPUT_TOPOLOGY} ]; then + export HWLOC_XMLFILE="${HPCBIND_INPUT_TOPOLOGY}" + export HWLOC_THISSYSTEM=1 + fi +fi + +if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then + HPCBIND_LOG=/dev/null + HPCBIND_ERR=/dev/null + HPCBIND_OUT=/dev/null +else + if [[ ${HPCBIND_QUEUE_SIZE} -le 0 ]]; then + HPCBIND_QUEUE_SIZE=1 + fi + HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}" + HPCBIND_STR_QUEUE_RANK=$(printf %0*d ${#HPCBIND_STR_QUEUE_SIZE} ${HPCBIND_QUEUE_RANK}) + + HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_STR_QUEUE_RANK}" + HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_STR_QUEUE_RANK}" + HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_STR_QUEUE_RANK}" + > ${HPCBIND_LOG} +fi + + +################################################################################ +# Check unknown arguments +################################################################################ +if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then + echo "HPCBIND Unknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG}) + exit 1 +fi + +################################################################################ +# Check that visible gpus are valid +################################################################################ +HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS}) +if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then + for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do + if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} || + ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then + echo "HPCBIND Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG}) + 
HPCBIND_VISIBLE_GPUS[$i]=0; + fi + done + NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]} +fi + + +################################################################################ +#choose the correct partition +################################################################################ +if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then + HPCBIND_PARTITION=${HPCBIND_QUEUE_RANK} +elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then + HPCBIND_PARTITION=0 +fi + +if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then + HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE)) +fi + +################################################################################ +# Find cpuset and num threads +################################################################################ +HPCBIND_HWLOC_CPUSET="" +declare -i HPCBIND_NUM_PUS=0 + +if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then + BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]}) + else + BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]}) + fi + + if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then + CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) + HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}" + else + HPCBIND_HWLOC_CPUSET="${BINDING}" + fi + HPCBIND_NUM_PUS=$(hwloc-calc -q -N pu ${HPCBIND_HWLOC_CPUSET} ) + if [ $? -ne 0 ]; then + HPCBIND_NUM_PUS=1 + fi + HPCBIND_NUM_CORES=$(hwloc-calc -q -N core ${HPCBIND_HWLOC_CPUSET} ) + if [ $? -ne 0 ]; then + HPCBIND_NUM_CORES=1 + fi + HPCBIND_NUM_NUMAS=$(hwloc-calc -q -N numa ${HPCBIND_HWLOC_CPUSET} ) + if [ $? -ne 0 ]; then + HPCBIND_NUM_NUMAS=1 + fi + HPCBIND_NUM_SOCKETS=$(hwloc-calc -q -N socket ${HPCBIND_HWLOC_CPUSET} ) + if [ $? 
-ne 0 ]; then + HPCBIND_NUM_SOCKETS=1 + fi +else + HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) + HPCBIND_NUM_CORES=${HPCBIND_NUM_PUS} + HPCBIND_NUM_NUMAS=1 + HPCBIND_NUM_SOCKETS=1 +fi + + +if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} != "" ]]; then + HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS} +else + declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_RATIO_NUMERATOR / HPCBIND_OPENMP_RATIO_DENOMINATOR)) + + if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then + HPCBIND_OPENMP_NUM_THREADS=1 + elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then + HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS} + fi +fi + +################################################################################ +# Set OpenMP environment variables +################################################################################ + +# set OMP_NUM_THREADS +if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then + export OMP_NUM_THREADS="${HPCBIND_OPENMP_NUM_THREADS},1" +else + export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS} +fi + +# set OMP_PROC_BIND and OMP_PLACES +if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then + if [[ "${HPCBIND_OPENMP_FORCE_PROC_BIND}" == "" ]]; then + #default proc bind logic + if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then + export OMP_PLACES="${HPCBIND_OPENMP_PLACES}" + if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then + export OMP_PROC_BIND="spread,spread" + else + export OMP_PROC_BIND="spread" + fi + else + export OMP_PROC_BIND="true" + unset OMP_PLACES + fi + else + #force proc bind + export OMP_PLACES="${HPCBIND_OPENMP_PLACES}" + export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}" + fi +else + # no openmp proc bind + unset OMP_PLACES + unset OMP_PROC_BIND +fi + +# set up hot teams (intel specific) +if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then + export OMP_NESTED="true" + export OMP_MAX_ACTIVE_LEVELS=2 + export KMP_HOT_TEAMS=1 + export KMP_HOT_TEAMS_MAX_LEVEL=2 
+else + export OMP_NESTED="false" +fi + +# set OMP_NESTED + +################################################################################ +# Set CUDA environment variables +################################################################################ + +if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then + if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then + declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" + else + declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) + declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" + fi +fi + +################################################################################ +# Set hpcbind environment variables +################################################################################ +export HPCBIND_HWLOC_VERSION=${HPCBIND_HWLOC_VERSION} +export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} +export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} +export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} +export HPCBIND_NUM_CORES=${HPCBIND_NUM_CORES} +export HPCBIND_NUM_NUMAS=${HPCBIND_NUM_NUMAS} +export HPCBIND_NUM_SOCKETS=${HPCBIND_NUM_SOCKETS} +export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}" +export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} +export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} +export HPCBIND_OPENMP_RATIO="${HPCBIND_OPENMP_RATIO_NUMERATOR}/${HPCBIND_OPENMP_RATIO_DENOMINATOR}" +if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then + export HPCBIND_HWLOC_PARENT_CPUSET="all" +else + export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}" +fi +export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}" +export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} +export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') +export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}" +if [[ "${HPCBIND_QUEUE_NAME}" 
!= "" ]]; then + export HPCBIND_QUEUE_RANK=${HPCBIND_QUEUE_RANK} + export HPCBIND_QUEUE_SIZE=${HPCBIND_QUEUE_SIZE} + export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}" + export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING} +fi + + +################################################################################ +# Print verbose +################################################################################ + +TMP_ENV=$(env | sort) +if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then + echo "[HOST]" >> ${HPCBIND_LOG} + hostname -s >> ${HPCBIND_LOG} + echo "[HPCBIND]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG} + echo "[HWLOC]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^HWLOC_" >> ${HPCBIND_LOG} + echo "[CUDA]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG} + echo "[OPENMP]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG} + echo "[GOMP] (gcc, g++, and gfortran)" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^GOMP_" >> ${HPCBIND_LOG} + echo "[KMP] (icc, icpc, and ifort)" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^KMP_" >> ${HPCBIND_LOG} + echo "[XLSMPOPTS] (xlc, xlc++, and xlf)" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^XLSMPOPTS" >> ${HPCBIND_LOG} + + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" >> ${HPCBIND_LOG} + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" >> ${HPCBIND_LOG} + else + echo "Unable to show bindings, hwloc not available." 
>> ${HPCBIND_LOG} + fi +else + echo "[HOST]" > >(tee -a ${HPCBIND_LOG}) + hostname -s > >(tee -a ${HPCBIND_LOG}) + echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG}) + echo "[HWLOC]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^HWLOC_" > >(tee -a ${HPCBIND_LOG}) + echo "[CUDA]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG}) + echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG}) + echo "[GOMP] (gcc, g++, and gfortran)" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^GOMP_" > >(tee -a ${HPCBIND_LOG}) + echo "[KMP] (icc, icpc, and ifort)" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^KMP_" > >(tee -a ${HPCBIND_LOG}) + echo "[XLSMPOPTS] (xlc, xlc++, and xlf)" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^XLSMPOPTS" > >(tee -a ${HPCBIND_LOG}) + + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG}) + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --no-io --no-bridges > >(tee -a ${HPCBIND_LOG}) + else + echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG}) + fi +fi + +################################################################################ +# Run command +################################################################################ + +# must be the last executed command so that the return value is correct +if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! 
-z ${DISPLAY} ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0 +elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then + # clear output files + > ${HPCBIND_ERR} + > ${HPCBIND_OUT} + if [[ ${HPCBIND_TEE} -eq 0 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} + else + eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} + fi + else + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + else + eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + fi + fi +fi diff --git a/packages/kokkos/bin/kokkos_launch_compiler b/packages/kokkos/bin/kokkos_launch_compiler new file mode 100755 index 0000000000000000000000000000000000000000..d929d24f1dca42fc277940ffb27f54d374e89cd1 --- /dev/null +++ b/packages/kokkos/bin/kokkos_launch_compiler @@ -0,0 +1,121 @@ +#!/bin/bash -e +# +# This script allows CMAKE_CXX_COMPILER to be a standard +# C++ compiler and Kokkos sets RULE_LAUNCH_COMPILE and +# RULE_LAUNCH_LINK in CMake so that all compiler and link +# commands are prefixed with this script followed by the +# C++ compiler. Thus if $1 == $2 then we know the command +# was intended for the C++ compiler and we discard both +# $1 and $2 and redirect the command to NVCC_WRAPPER. +# If $1 != $2 then we know that the command was not intended +# for the C++ compiler and we just discard $1 and launch +# the original command. Examples of when $2 will not equal +# $1 are 'ar', 'cmake', etc. during the linking phase +# + +# emit a message about the underlying command executed +: ${DEBUG:=0} +: ${KOKKOS_DEBUG_LAUNCH_COMPILER:=${DEBUG}} + +debug-message() +{ + if [ "${KOKKOS_DEBUG_LAUNCH_COMPILER}" -ne 0 ]; then + echo -e "##### $(basename ${BASH_SOURCE[0]}) executing: \"$@\"... 
#####" + fi +} + +# check the arguments for the KOKKOS_DEPENDENCE compiler definition +KOKKOS_DEPENDENCE=0 +for i in ${@} +do + if [ -n "$(echo ${i} | grep 'KOKKOS_DEPENDENCE$')" ]; then + KOKKOS_DEPENDENCE=1 + break + fi +done + +# if Kokkos compiler is not passed, someone is probably trying to invoke it directly
if [ -z "${1}" ]; then + echo -e "\n${BASH_SOURCE[0]} was invoked without the Kokkos compiler as the first argument." + echo "This script is not intended to be directly invoked by any mechanism other" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n" + exit 1 +fi + +# if C++ compiler is not passed, someone is probably trying to invoke it directly +if [ -z "${2}" ]; then + echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the second argument." + echo "This script is not intended to be directly invoked by any mechanism other" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n" + exit 1 +fi + +# if there aren't two args, this isn't necessarily invalid, just a bit strange +if [ -z "${3}" ]; then exit 0; fi + +# store the Kokkos compiler +KOKKOS_COMPILER=${1} + +# remove the Kokkos compiler from the arguments +shift + +# store the expected C++ compiler +CXX_COMPILER=${1} + +# remove the expected C++ compiler from the arguments +shift + +# NOTE: in below, ${KOKKOS_COMPILER} is usually nvcc_wrapper +# +# after the above shifts, $1 is now the exe for the compile or link command, e.g. +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o +# becomes: +# kokkos_launch_compiler gcc -c file.c -o file.o +# We check to see if the executable is the C++ compiler and if it is not, then +# just execute the command. 
+# +# Summary: +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o +# results in this command being executed: +# gcc -c file.c -o file.o +# and +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o +# results in this command being executed: +# ${KOKKOS_COMPILER} -c file.cpp -o file.o +if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then + debug-message $@ + # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} + eval $@ +else + # the executable is the C++ compiler, so we need to re-direct to ${KOKKOS_COMPILER} + if [ ! -f "${KOKKOS_COMPILER}" ]; then + echo -e "\nError: the compiler redirect for Kokkos was not found at ${KOKKOS_COMPILER}\n" + exit 1 + fi + + # find the nvcc_wrapper from the same build/install + NVCC_WRAPPER="$(dirname ${BASH_SOURCE[0]})/nvcc_wrapper" + if [ "${KOKKOS_COMPILER}" = "${NVCC_WRAPPER}" ]; then + # this should only be valid in the install tree -- it will be set to CMAKE_CXX_COMPILER used using Kokkos installation + if [ -z $(echo "@NVCC_WRAPPER_DEFAULT_COMPILER@" | grep 'NVCC_WRAPPER_DEFAULT_COMPILER') ]; then + : ${NVCC_WRAPPER_DEFAULT_COMPILER:="@NVCC_WRAPPER_DEFAULT_COMPILER@"} + fi + + # set default nvcc wrapper compiler if not specified + : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}} + export NVCC_WRAPPER_DEFAULT_COMPILER + + # nvcc_wrapper calling itself will cause an infinitely long build + if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then + echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. 
Terminating to avoid infinite loop!\n" + exit 1 + fi + fi + + # discard the compiler from the command + shift + + debug-message ${KOKKOS_COMPILER} $@ + # execute ${KOKKOS_COMPILER} (again, usually nvcc_wrapper) + ${KOKKOS_COMPILER} $@ +fi diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper new file mode 100755 index 0000000000000000000000000000000000000000..5556e888e34b2f7c2dd18bdb6f47071abde0574b --- /dev/null +++ b/packages/kokkos/bin/nvcc_wrapper @@ -0,0 +1,520 @@ +#!/bin/bash +# +# This shell script (nvcc_wrapper) wraps both the host compiler and +# NVCC, if you are building legacy C or C++ code with CUDA enabled. +# The script remedies some differences between the interface of NVCC +# and that of the host compiler, in particular for linking. +# It also means that a legacy code doesn't need separate .cu files; +# it can just use .cpp files. +# +# Default settings: change those according to your machine. For +# example, you may have two different wrappers with either icpc +# or g++ as their back-end compiler. The defaults can be overwritten +# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc). + +default_arch="sm_35" +#default_arch="sm_50" + +# +# The default C++ compiler. +# +host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"} + +# Default to whatever is in the path +nvcc_compiler=nvcc +if [ ! 
-z $CUDA_ROOT ]; then + nvcc_compiler="$CUDA_ROOT/bin/nvcc" +fi + +#host_compiler="icpc" +#host_compiler="/usr/local/gcc/4.8.3/bin/g++" +#host_compiler="/usr/local/gcc/4.9.1/bin/g++" + +# +# Internal variables +# + +# C++ files +cpp_files="" + +# Host compiler arguments +xcompiler_args="" + +# Cuda (NVCC) only arguments +cuda_args="" + +# Arguments for both NVCC and Host compiler +shared_args="" + +# Argument -c +compile_arg="" + +# Argument -o <obj> +output_arg="" + +# Linker arguments +xlinker_args="" + +# Object files passable to NVCC +object_files="" + +# Link objects for the host linker only +object_files_xlinker="" + +# Shared libraries with version numbers are not handled correctly by NVCC +shared_versioned_libraries_host="" +shared_versioned_libraries="" + +# Does the User set the architecture +arch_set=0 + +# Does the user overwrite the host compiler +ccbin_set=0 + +#Error code of compilation +error_code=0 + +# Do a dry run without actually compiling +dry_run=0 + +# Skip NVCC compilation and use host compiler directly +host_only=0 +host_only_args="" + +# Just run version on host compiler +get_host_version=0 + +# Enable workaround for CUDA 6.5 for pragma ident +replace_pragma_ident=0 + +# Mark first host compiler argument +first_xcompiler_arg=1 + +# Allow for setting temp dir without setting TMPDIR in parent (see https://docs.olcf.ornl.gov/systems/summit_user_guide.html#setting-tmpdir-causes-jsm-jsrun-errors-job-state-flip-flop) +if [[ ! 
-z ${NVCC_WRAPPER_TMPDIR+x} ]]; then + temp_dir=${NVCC_WRAPPER_TMPDIR} +else + temp_dir=${TMPDIR:-/tmp} +fi + +# optimization flag added as a command-line argument +optimization_flag="" + +# std standard flag added as a command-line argument +std_flag="" + +# Run nvcc a second time to generate dependencies if needed +depfile_separate=0 +depfile_output_arg="" +depfile_target_arg="" + +# Option to remove duplicate libraries and object files +remove_duplicate_link_files=0 + +function warn_std_flag() { + echo "nvcc_wrapper - *warning* you have set multiple standard flags (-std=c++1* or --std=c++1*), only the last is used because nvcc can only accept a single std setting" +} + +#echo "Arguments: $# $@" + +while [ $# -gt 0 ] +do + case $1 in + #show the executed command + --show|--nvcc-wrapper-show) + dry_run=1 + ;; + #run host compilation only + --host-only) + host_only=1 + ;; + #get the host version only + --host-version) + get_host_version=1 + ;; + #replace '#pragma ident' with '#ident' this is needed to compile OpenMPI due to a configure script bug and a non standardized behaviour of pragma with macros + --replace-pragma-ident) + replace_pragma_ident=1 + ;; + #remove duplicate link files + --remove-duplicate-link-files) + remove_duplicate_link_files=1 + ;; + #handle source files to be compiled as cuda files + *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu) + cpp_files="$cpp_files $1" + ;; + # Ensure we only have one optimization flag because NVCC doesn't allow multiple + -O*) + if [ -n "$optimization_flag" ]; then + echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the last is used because nvcc can only accept a single optimization setting." 
+ shared_args=${shared_args/ $optimization_flag/} + fi + if [ "$1" = "-O" ]; then + optimization_flag="-O2" + else + optimization_flag=$1 + fi + shared_args="$shared_args $optimization_flag" + ;; + #Handle shared args (valid for both nvcc and the host compiler) + -D*) + unescape_commas=`echo "$1" | sed -e 's/\\\,/,/g'` + arg=`printf "%q" $unescape_commas` + shared_args="$shared_args $arg" + ;; + -I*|-L*|-l*|-g|--help|--version|-E|-M|-shared|-w) + shared_args="$shared_args $1" + ;; + #Handle compilation argument + -c) + compile_arg="$1" + ;; + #Handle output argument + -o) + output_arg="$output_arg $1 $2" + shift + ;; + # Handle depfile arguments. We map them to a separate call to nvcc. + -MD|-MMD) + depfile_separate=1 + host_only_args="$host_only_args $1" + ;; + -MF) + depfile_output_arg="-o $2" + host_only_args="$host_only_args $1 $2" + shift + ;; + -MT) + depfile_target_arg="$1 $2" + host_only_args="$host_only_args $1 $2" + shift + ;; + #Handle known nvcc args + --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + cuda_args="$cuda_args $1" + ;; + #Handle more known nvcc args + --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets) + cuda_args="$cuda_args $1" + ;; + #Handle known nvcc args that have an argument + -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include) + cuda_args="$cuda_args $1 $2" + shift + ;; + -rdc=*|-maxrregcount*|--maxrregcount*) + cuda_args="$cuda_args $1" + ;; + #Handle unsupported standard flags + --std=c++1y|-std=c++1y|--std=gnu++1y|-std=gnu++1y|--std=c++1z|-std=c++1z|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a) + fallback_std_flag="-std=c++14" + # this is hopefully just occurring in a downstream project during CMake feature tests + # we really have no choice here but to accept the 
flag and change to an accepted C++ standard + echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++17 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration." + if [ -n "$std_flag" ]; then + warn_std_flag + shared_args=${shared_args/ $std_flag/} + fi + std_flag=$fallback_std_flag + shared_args="$shared_args $std_flag" + ;; + -std=gnu*) + corrected_std_flag=${1/gnu/c} + echo "nvcc_wrapper has been given GNU extension standard flag $1 - reverting flag to $corrected_std_flag" + if [ -n "$std_flag" ]; then + warn_std_flag + shared_args=${shared_args/ $std_flag/} + fi + std_flag=$corrected_std_flag + shared_args="$shared_args $std_flag" + ;; + --std=c++17|-std=c++17) + if [ -n "$std_flag" ]; then + warn_std_flag + shared_args=${shared_args/ $std_flag/} + fi + # NVCC only has C++17 from version 11 on + cuda_main_version=$([[ $(${nvcc_compiler} --version) =~ V([0-9]+) ]] && echo ${BASH_REMATCH[1]}) + if [ ${cuda_main_version} -lt 11 ]; then + fallback_std_flag="-std=c++14" + # this is hopefully just occurring in a downstream project during CMake feature tests + # we really have no choice here but to accept the flag and change to an accepted C++ standard + echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++14 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration." 
+ std_flag=$fallback_std_flag + else + std_flag=$1 + fi + shared_args="$shared_args $std_flag" + ;; + --std=c++11|-std=c++11|--std=c++14|-std=c++14) + if [ -n "$std_flag" ]; then + warn_std_flag + shared_args=${shared_args/ $std_flag/} + fi + std_flag=$1 + shared_args="$shared_args $std_flag" + ;; + + #convert PGI standard flags to something nvcc can handle + --c++11|--c++14|--c++17) + if [ -n "$std_flag" ]; then + warn_std_flag + shared_args=${shared_args/ $std_flag/} + fi + std_flag="-std=${1#--}" + shared_args="$shared_args $std_flag" + ;; + + #ignore PGI forcing ISO C++-conforming code + -A) + ;; + + #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98 + -std=c++98|--std=c++98) + ;; + #strip of pedantic because it produces endless warnings about #LINE added by the preprocessor + -pedantic|-Wpedantic|-ansi) + ;; + #strip of -Woverloaded-virtual to avoid "cc1: warning: command line option ‘-Woverloaded-virtual’ is valid for C++/ObjC++ but not for C" + -Woverloaded-virtual) + ;; + #strip -Xcompiler because we add it + -Xcompiler) + if [[ $2 != "-o" ]]; then + if [ $first_xcompiler_arg -eq 1 ]; then + xcompiler_args="$2" + first_xcompiler_arg=0 + else + xcompiler_args="$xcompiler_args,$2" + fi + shift + fi + # else this we have -Xcompiler -o <filename>, in this case just drop -Xcompiler and process + # the -o flag with the filename (done above) + ;; + #strip of "-x cu" because we add that + -x) + if [[ $2 != "cu" ]]; then + if [ $first_xcompiler_arg -eq 1 ]; then + xcompiler_args="-x,$2" + first_xcompiler_arg=0 + else + xcompiler_args="$xcompiler_args,-x,$2" + fi + fi + shift + ;; + #Handle -+ (same as -x c++, specifically used for xl compilers, but mutually exclusive with -x. 
So replace it with -x c++) + -+) + if [ $first_xcompiler_arg -eq 1 ]; then + xcompiler_args="-x,c++" + first_xcompiler_arg=0 + else + xcompiler_args="$xcompiler_args,-x,c++" + fi + ;; + #Handle -ccbin (if its not set we can set it to a default value) + -ccbin) + cuda_args="$cuda_args $1 $2" + ccbin_set=1 + host_compiler=$2 + shift + ;; + + #Handle -arch argument (if its not set use a default) this is the version with = sign + -arch*|-gencode*) + cuda_args="$cuda_args $1" + arch_set=1 + ;; + #Handle -code argument (if its not set use a default) this is the version with = sign + -code*) + cuda_args="$cuda_args $1" + ;; + #Handle -arch argument (if its not set use a default) this is the version without = sign + -arch|-gencode) + cuda_args="$cuda_args $1 $2" + arch_set=1 + shift + ;; + #Handle -code argument (if its not set use a default) this is the version without = sign + -code) + cuda_args="$cuda_args $1 $2" + shift + ;; + #Handle -Xcudafe argument + -Xcudafe) + cuda_args="$cuda_args -Xcudafe $2" + shift + ;; + #Handle -Xlinker argument + -Xlinker) + xlinker_args="$xlinker_args -Xlinker $2" + shift + ;; + #Handle args that should be sent to the linker + -Wl,*) + xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}" + host_linker_args="$host_linker_args ${1:4:${#1}}" + ;; + #Handle object files: -x cu applies to all input files, so give them to linker, except if only linking + *.a|*.so|*.o|*.obj) + object_files="$object_files $1" + object_files_xlinker="$object_files_xlinker -Xlinker $1" + ;; + #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking + @*|*.dylib) + object_files="$object_files -Xlinker $1" + object_files_xlinker="$object_files_xlinker -Xlinker $1" + ;; + #Handle shared libraries with *.so.* names which nvcc can't do. 
+ *.so.*) + shared_versioned_libraries_host="$shared_versioned_libraries_host $1" + shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1" + ;; + #All other args are sent to the host compiler + *) + if [ $first_xcompiler_arg -eq 1 ]; then + xcompiler_args=$1 + first_xcompiler_arg=0 + else + xcompiler_args="$xcompiler_args,$1" + fi + ;; + esac + + shift +done + +# Only print host compiler version +if [ $get_host_version -eq 1 ]; then + $host_compiler --version + exit +fi + +#Remove duplicate object files +if [ $remove_duplicate_link_files -eq 1 ]; then +for obj in $object_files +do + object_files_reverse="$obj $object_files_reverse" +done + +object_files_reverse_clean="" +for obj in $object_files_reverse +do + exists=false + for obj2 in $object_files_reverse_clean + do + if [ "$obj" == "$obj2" ] + then + exists=true + echo "Exists: $obj" + fi + done + if [ "$exists" == "false" ] + then + object_files_reverse_clean="$object_files_reverse_clean $obj" + fi +done + +object_files="" +for obj in $object_files_reverse_clean +do + object_files="$obj $object_files" +done +fi + +#Add default host compiler if necessary +if [ $ccbin_set -ne 1 ]; then + cuda_args="$cuda_args -ccbin $host_compiler" +fi + +#Add architecture command +if [ $arch_set -ne 1 ]; then + cuda_args="$cuda_args -arch=$default_arch" +fi + +#Compose compilation command +nvcc_command="$nvcc_compiler $cuda_args $shared_args $xlinker_args $shared_versioned_libraries" +if [ $first_xcompiler_arg -eq 0 ]; then + nvcc_command="$nvcc_command -Xcompiler $xcompiler_args" +fi + +#Replace all commas in xcompiler_args with a space for the host only command +xcompiler_args=${xcompiler_args//,/" "} + +#Compose host only command +host_command="$host_compiler $shared_args $host_only_args $compile_arg $output_arg $xcompiler_args $host_linker_args $shared_versioned_libraries_host" + +#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING' +if [ $replace_pragma_ident 
-eq 1 ]; then + cpp_files2="" + for file in $cpp_files + do + var=`grep pragma ${file} | grep ident | grep "#"` + if [ "${#var}" -gt 0 ] + then + sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' $file > $temp_dir/nvcc_wrapper_tmp_$file + cpp_files2="$cpp_files2 $temp_dir/nvcc_wrapper_tmp_$file" + else + cpp_files2="$cpp_files2 $file" + fi + done + cpp_files=$cpp_files2 + #echo $cpp_files +fi + +if [ "$cpp_files" ]; then + nvcc_command="$nvcc_command $object_files_xlinker -x cu $cpp_files" +else + nvcc_command="$nvcc_command $object_files" +fi + +if [ "$cpp_files" ]; then + host_command="$host_command $object_files $cpp_files" +else + host_command="$host_command $object_files" +fi + +if [ $depfile_separate -eq 1 ]; then + # run nvcc a second time to generate dependencies (without compiling) + nvcc_depfile_command="$nvcc_command -M $depfile_target_arg $depfile_output_arg" +else + nvcc_depfile_command="" +fi + +nvcc_command="$nvcc_command $compile_arg $output_arg" + +#Print command for dryrun +if [ $dry_run -eq 1 ]; then + if [ $host_only -eq 1 ]; then + echo $host_command + elif [ -n "$nvcc_depfile_command" ]; then + echo $nvcc_command "&&" $nvcc_depfile_command + else + echo $nvcc_command + fi + exit 0 +fi + +#Run compilation command +if [ $host_only -eq 1 ]; then + if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then + echo "$host_command" + fi + $host_command +elif [ -n "$nvcc_depfile_command" ]; then + if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then + echo "$nvcc_command && $nvcc_depfile_command" + fi + $nvcc_command && $nvcc_depfile_command +else + if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then + echo "$nvcc_command" + fi + $nvcc_command +fi +error_code=$? 
+
+#Report error code
+exit $error_code
diff --git a/packages/kokkos/bin/runtest b/packages/kokkos/bin/runtest
new file mode 100755
index 0000000000000000000000000000000000000000..92411fe5badf5398b3e2cee325161f225d98f33a
--- /dev/null
+++ b/packages/kokkos/bin/runtest
@@ -0,0 +1,165 @@
+#!/usr/bin/env bash
+
+function get_path() {
+  cd "$(dirname "$0")"
+  cd ..
+  echo "$(pwd -P)"
+}
+
+KOKKOS_PATH="$(get_path "$0")"
+
+function show_help() {
+  local cmd=$(basename "$0")
+  echo "Usage: ${cmd} <options> "
+  echo "  Build and run the tests"
+  echo ""
+  echo "Options:"
+  echo "  -j=N|--make-j=N        Build the tests in parallel"
+  echo "  -c|--clean             Clean build and regenerate make files"
+  echo "  --clean-on-pass        Clean build when runtest passes"
+  echo "  --output-prefix=<pre>  Prefix of log files  Default: runtest"
+  echo "  --build-only           Only build the tests"
+  echo "  -v|--verbose           Tee STDOUT and STDERR to screen and files"
+  echo "  -h|--help              Show this message"
+  echo ""
+  ${KOKKOS_PATH}/generate_makefile.bash --help
+  return 0
+}
+
+
+declare -a GENERATE_ARGS=()
+declare -i VERBOSE=0
+declare -i CLEAN=0
+declare -i CLEAN_ON_PASS=0
+declare -i BUILD_ONLY=0
+OUTPUT="runtest"
+
+declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}
+
+for i in "$@"; do  # quote "$@": preserve arguments containing spaces/globs
+  case $i in
+    -j=*|--make-j=*)
+      MAKE_J=${i#*=}
+      shift
+      ;;
+    -c|--clean)
+      CLEAN=1
+      shift
+      ;;
+    --clean-on-pass)
+      CLEAN_ON_PASS=1
+      shift
+      ;;
+    --output-prefix=*)
+      OUTPUT=${i#*=}
+      shift
+      ;;
+    --build-only)
+      BUILD_ONLY=1
+      shift
+      ;;
+    -v|--verbose)
+      VERBOSE=1
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    *)
+      GENERATE_ARGS+=("$i")
+      shift
+      ;;
+  esac
+done
+
+if [[ "$(pwd -P)" == "${KOKKOS_PATH}" ]]; then  # quote RHS: compare literally, not as a glob pattern
+  echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
+  exit 1
+fi
+
+# Some makefile dependencies are incorrect, so clean needs to force
+# a new call to generate_makefiles.bash
+if [[ ${CLEAN} -eq 1 ]]; then
+  START=${SECONDS}
+  echo "Cleaning"
+  /bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
+  END=${SECONDS}
+  echo "  $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+fi
+
+declare -i START=${SECONDS}
+echo "Generating Makefile"
+echo "  ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[@]}"
+
+if [[ ${VERBOSE} -eq 0 ]]; then
+  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > ${OUTPUT}.out 2> >(tee ${OUTPUT}.err >&2)
+else
+  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee ${OUTPUT}.out) 2> >(tee ${OUTPUT}.err >&2)
+fi
+declare -i RESULT=$?
+declare -i END=${SECONDS}
+if [[ ${RESULT} -eq 0 ]]; then
+  echo "  PASS: $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+else
+  cat ${OUTPUT}.out | grep "FAIL"
+  cat ${OUTPUT}.err | grep "FAIL"
+  echo "  FAIL: $((END-START)) seconds"
+  exit 1
+fi
+
+START=${SECONDS}
+echo "Building"
+if [[ ${VERBOSE} -eq 0 ]]; then
+  make --keep-going -j ${MAKE_J} build-test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
+else
+  make --keep-going -j ${MAKE_J} build-test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
+fi
+RESULT=$?
+END=${SECONDS}
+if [[ ${RESULT} -eq 0 ]]; then
+  echo "  PASS: $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+else
+  cat ${OUTPUT}.out | grep -E "[[:space:]]error:[[:space:]]"
+  cat ${OUTPUT}.err | grep -E "[[:space:]]error:[[:space:]]"
+  echo "  FAIL: $((END-START)) seconds"
+  exit 1
+fi
+
+if [[ ${BUILD_ONLY} -eq 0 ]]; then
+  START=${SECONDS}
+  echo "Testing"
+  if [[ ${VERBOSE} -eq 0 ]]; then
+    make --keep-going test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
+  else
+    make --keep-going test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
+  fi
+  RESULT=$?
+  END=${SECONDS}
+  if [[ ${RESULT} -eq 0 ]]; then
+    echo "  PASS: $((END-START)) seconds"
+    if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then
+      make clean
+    fi
+  else
+    cat ${OUTPUT}.out | grep "FAIL"
+    cat ${OUTPUT}.err | grep "FAIL"
+    echo "  FAIL: $((END-START)) seconds"
+    exit 1
+  fi
+fi
+
+exit ${RESULT}
+
diff --git a/packages/kokkos/cmake/CTestConfig.cmake.in b/packages/kokkos/cmake/CTestConfig.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..1f82c0d64d15e0a4fb346cfb7227be9cd41e5f17
--- /dev/null
+++ b/packages/kokkos/cmake/CTestConfig.cmake.in
@@ -0,0 +1,91 @@
+#----------------------------------------------------------------------------------------#
+#
+#   CTestConfig.cmake template for Kokkos
+#
+#----------------------------------------------------------------------------------------#
+
+#
+#   dash-board related
+#
+set(CTEST_PROJECT_NAME "Kokkos")
+set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC")
+set(CTEST_DROP_METHOD "https")
+set(CTEST_DROP_SITE "cdash.nersc.gov")
+set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}")
+set(CTEST_CDASH_VERSION "1.6")
+set(CTEST_CDASH_QUERY_VERSION TRUE)
+set(CTEST_SUBMIT_RETRY_COUNT "1")
+set(CTEST_SUBMIT_RETRY_DELAY "30")
+
+#
+#   configure/build related
+#
+set(CTEST_BUILD_NAME "@BUILD_NAME@")
+set(CTEST_MODEL "@MODEL@")
+set(CTEST_SITE "@SITE@")
+set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@")
+set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@")
+set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@")
+
+#
+#   configure/build related
+#
+set(CTEST_UPDATE_TYPE "git")
+set(CTEST_UPDATE_VERSION_ONLY ON)
+# set(CTEST_GENERATOR "")
+# set(CTEST_GENERATOR_PLATFORM "")
+
+#
+#   testing related
+#
+set(CTEST_TIMEOUT "7200")
+set(CTEST_TEST_TIMEOUT "7200")
+set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100")
+set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100")
+set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576")
+
+#
+#   coverage related
+#
+set(CTEST_CUSTOM_COVERAGE_EXCLUDE
".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*") + +# +# commands +# +if(NOT "@CHECKOUT_COMMAND@" STREQUAL "") + set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@") +endif() +set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@") +set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@") +set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@") +if(NOT WIN32) + set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@") +endif() +set(CTEST_COVERAGE_COMMAND "gcov") +set(CTEST_MEMORYCHECK_COMMAND "valgrind") +set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@") + +# +# various configs +# +set(APPEND_VALUE @APPEND@) +if(APPEND_VALUE) + set(APPEND_CTEST APPEND) +endif() + +macro(SET_TEST_PROP VAR) + if(NOT "${ARGS}" STREQUAL "") + set(${VAR}_CTEST ${VAR} ${ARGN}) + endif() +endmacro() + +set_test_prop(START @START@) +set_test_prop(END @END@) +set_test_prop(STRIDE @STRIDE@) +set_test_prop(INCLUDE @INCLUDE@) +set_test_prop(EXCLUDE @EXCLUDE@) +set_test_prop(INCLUDE_LABEL @INCLUDE_LABEL@) +set_test_prop(EXCLUDE_LABEL @EXCLUDE_LABEL@) +set_test_prop(PARALLEL_LEVEL @PARALLEL_LEVEL@) +set_test_prop(STOP_TIME @STOP_TIME@) +set_test_prop(COVERAGE_LABELS @LABELS@) diff --git a/packages/kokkos/cmake/Dependencies.cmake b/packages/kokkos/cmake/Dependencies.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c0be9f56411311a38a0f43a9b07b1109a0135bd8 --- /dev/null +++ b/packages/kokkos/cmake/Dependencies.cmake @@ -0,0 +1,9 @@ +TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( + SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS + #SubPackageName Directory Class Req/Opt + # + # New Kokkos subpackages: + Core core PS REQUIRED + Containers containers PS OPTIONAL + Algorithms algorithms PS OPTIONAL + ) diff --git a/packages/kokkos/cmake/KokkosCI.cmake b/packages/kokkos/cmake/KokkosCI.cmake new file mode 100644 index 
0000000000000000000000000000000000000000..e8c9af37ad544a93a62f498e9a903696321a1c75 --- /dev/null +++ b/packages/kokkos/cmake/KokkosCI.cmake @@ -0,0 +1,350 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +message(STATUS "") + +get_cmake_property(_cached_vars CACHE_VARIABLES) +set(KOKKOS_CMAKE_ARGS) +set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT" + "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE") +list(SORT _cached_vars) +foreach(_var ${_cached_vars}) + if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES) + list(APPEND KOKKOS_CMAKE_ARGS ${_var}) + if("${_var}" STREQUAL "CMAKE_BUILD_TYPE") + set(BUILD_TYPE "${CMAKE_BUILD_TYPE}") + endif() + endif() +endforeach() + + +#----------------------------------------------------------------------------------------# +# +# Macros and variables +# +#----------------------------------------------------------------------------------------# + +macro(CHECK_REQUIRED VAR) + if(NOT DEFINED ${VAR}) + message(FATAL_ERROR "Error! 
Variable '${VAR}' must be defined") + endif() +endmacro() + +# require the build name variable +CHECK_REQUIRED(BUILD_NAME) + +# uses all args +macro(SET_DEFAULT VAR) + if(NOT DEFINED ${VAR}) + set(${VAR} ${ARGN}) + endif() + # remove these ctest configuration variables from the defines + # passed to the Kokkos configuration + if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) + list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") + endif() +endmacro() + +# uses first arg -- useful for selecting via priority from multiple +# potentially defined variables, e.g.: +# +# set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME}) +# +macro(SET_DEFAULT_ARG1 VAR) + if(NOT DEFINED ${VAR}) + foreach(_ARG ${ARGN}) + if(NOT "${_ARG}" STREQUAL "") + set(${VAR} ${_ARG}) + break() + endif() + endforeach() + endif() + # remove these ctest configuration variables from the defines + # passed to the Kokkos configuration + if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) + list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") + endif() +endmacro() + +# determine the default working directory +if(NOT "$ENV{WORKSPACE}" STREQUAL "") + set(WORKING_DIR "$ENV{WORKSPACE}") +else() + get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY) +endif() + +# determine the hostname +execute_process(COMMAND hostname + OUTPUT_VARIABLE HOSTNAME + OUTPUT_STRIP_TRAILING_WHITESPACE) + +SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}") + +# get the number of processors +include(ProcessorCount) +ProcessorCount(NUM_PROCESSORS) + +# find git +find_package(Git QUIET) +if(NOT GIT_EXECUTABLE) + unset(GIT_EXECUTABLE CACHE) + unset(GIT_EXECUTABLE) +endif() + +function(EXECUTE_GIT_COMMAND VAR) + set(${VAR} "" PARENT_SCOPE) + execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN} + OUTPUT_VARIABLE VAL + RESULT_VARIABLE RET + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + ERROR_QUIET) + string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}") + set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE) + if(RET EQUAL 0) + 
set(${VAR} "${VAL}" PARENT_SCOPE) + endif() +endfunction() + +# just gets the git branch name if available +function(GET_GIT_BRANCH_NAME VAR) + execute_git_command(GIT_BRANCH branch --show-current) + set(_INVALID "%D" "HEAD") + if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) + execute_git_command(GIT_BRANCH show -s --format=%D) + if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) + execute_git_command(GIT_BRANCH --describe all) + endif() + endif() + # + if(GIT_BRANCH) + string(REPLACE " " ";" _DESC "${GIT_BRANCH}") + # just set it to last one via loop instead of wonky cmake index manip + foreach(_ITR ${_DESC}) + set(GIT_BRANCH "${_ITR}") + endforeach() + set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE) + message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}") + endif() +endfunction() + +# just gets the git branch name if available +function(GET_GIT_AUTHOR_NAME VAR) + execute_git_command(GIT_AUTHOR show -s --format=%an) + if(GIT_AUTHOR) + string(LENGTH "${GIT_AUTHOR}" STRLEN) + # if the build name gets too long, this can cause submission errors + if(STRLEN GREATER 24) + # remove middle initial + string(REGEX REPLACE " [A-Z]\. " " " GIT_AUTHOR "${GIT_AUTHOR}") + # get first and sur name + string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}") + string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}") + if(S_NAME) + set(GIT_AUTHOR "${S_NAME}") + elseif(F_NAME) + set(GIT_AUTHOR "${F_NAME}") + endif() + endif() + # remove any spaces, quotes, periods, etc. 
+ string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}") + set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE) + message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}") + endif() +endfunction() + +# get the name of the branch +GET_GIT_BRANCH_NAME(GIT_BRANCH) +# get the name of the author +GET_GIT_AUTHOR_NAME(GIT_AUTHOR) +# author, prefer git method for consistency +SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR}) +# SLUG == owner_name/repo_name +SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG}) +# branch name +SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH}) +# pull request number +SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM}) +# get the event type, e.g. push, pull_request, api, cron, etc. +SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE}) + +if("${BRANCH}" STREQUAL "") + message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'") + message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=<name>") +endif() + +#----------------------------------------------------------------------------------------# +# +# Set default values if not provided on command-line +# +#----------------------------------------------------------------------------------------# + +SET_DEFAULT(SOURCE_DIR "${WORKING_DIR}") # source directory +SET_DEFAULT(BINARY_DIR "${WORKING_DIR}/build") # build directory +SET_DEFAULT(BUILD_TYPE "${CMAKE_BUILD_TYPE}") # Release, Debug, etc. 
+SET_DEFAULT(MODEL "Continuous") # Continuous, Nightly, or Experimental +SET_DEFAULT(JOBS 1) # number of parallel ctests +SET_DEFAULT(CTEST_COMMAND "${CMAKE_CTEST_COMMAND}") # just in case +SET_DEFAULT(CTEST_ARGS "-V --output-on-failure") # extra arguments when ctest is called +SET_DEFAULT(GIT_EXECUTABLE "git") # ctest_update +SET_DEFAULT(TARGET "all") # build target +SET_DEFAULT_ARG1(SITE "$ENV{SITE}" + "${HOSTNAME}") # update site +SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}" + "${NUM_PROCESSORS}") # number of parallel compile jobs +# +# The variable below correspond to ctest arguments, i.e. START,END,STRIDE are +# '-I START,END,STRIDE' +# +SET_DEFAULT(START "") +SET_DEFAULT(END "") +SET_DEFAULT(STRIDE "") +SET_DEFAULT(INCLUDE "") +SET_DEFAULT(EXCLUDE "") +SET_DEFAULT(INCLUDE_LABEL "") +SET_DEFAULT(EXCLUDE_LABEL "") +SET_DEFAULT(PARALLEL_LEVEL "") +SET_DEFAULT(STOP_TIME "") +SET_DEFAULT(LABELS "") +SET_DEFAULT(NOTES "") + +# default static build tag for Nightly +set(BUILD_TAG "${BRANCH}") + +if(NOT BUILD_TYPE) + # default for kokkos if not specified + set(BUILD_TYPE "RelWithDebInfo") +endif() + +# generate dynamic name if continuous or experimental model +if(NOT "${MODEL}" STREQUAL "Nightly") + if(EVENT_TYPE AND PULL_REQUEST_NUM) + # e.g. pull_request/123 + if(AUTHOR) + set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}") + else() + set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}") + endif() + elseif(SLUG) + # e.g. 
owner_name/repo_name + set(BUILD_TAG "${SLUG}") + elseif(AUTHOR) + set(BUILD_TAG "${AUTHOR}/${BRANCH}") + endif() + if(EVENT_TYPE AND NOT PULL_REQUEST_NUM) + set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}") + endif() +endif() + +# unnecessary +string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}") +string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}") + +message(STATUS "BUILD_TAG: ${BUILD_TAG}") + +set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]") + +# colons in build name create extra (empty) entries in CDash +string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}") +# unnecessary info +string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}") +# consistency +string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}") +string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}") +# miscellaneous from missing fields +string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}") +string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}") + +# check binary directory +if(EXISTS ${BINARY_DIR}) + if(NOT IS_DIRECTORY "${BINARY_DIR}") + message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!") + endif() + file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*") + if(NOT "${BINARY_DIR_FILES}" STREQUAL "") + message(FATAL_ERROR "Error! 
'${BINARY_DIR}' already exists and is not empty!") + endif() +endif() + +get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH) +get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH) + +#----------------------------------------------------------------------------------------# +# +# Generate the CTestConfig.cmake +# +#----------------------------------------------------------------------------------------# + +set(CONFIG_ARGS) +foreach(_ARG ${KOKKOS_CMAKE_ARGS}) + if(NOT "${${_ARG}}" STREQUAL "") + get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE) + if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED") + if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF") + set(_ARG_TYPE "BOOL") + elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}") + set(_ARG_TYPE "FILEPATH") + elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}") + set(_ARG_TYPE "PATH") + elseif(NOT "${${_ARG}}" STREQUAL "") + set(_ARG_TYPE "STRING") + endif() + endif() + set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n") + endif() +endforeach() + +file(WRITE ${BINARY_REALDIR}/initial-cache.cmake +" +set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\") +${CONFIG_ARGS} +") + +file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO) +message(STATUS "Initial cache:\n${_CACHE_INFO}") + +# initialize the cache +set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake") + + +# generate the CTestConfig.cmake +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in + ${BINARY_REALDIR}/CTestConfig.cmake + @ONLY) + +# copy/generate the dashboard script +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in + ${BINARY_REALDIR}/KokkosCTest.cmake + @ONLY) + +# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake +execute_process( + COMMAND ${CMAKE_COMMAND} -E touch CTestCustom.cmake + WORKING_DIRECTORY ${BINARY_REALDIR} + ) + +#----------------------------------------------------------------------------------------# +# +# 
Execute CTest +# +#----------------------------------------------------------------------------------------# + +message(STATUS "") +message(STATUS "BUILD_NAME: ${BUILD_NAME}") +message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...") +message(STATUS "") + +# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV" +string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}") + +execute_process( + COMMAND ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS} + RESULT_VARIABLE RET + WORKING_DIRECTORY ${BINARY_REALDIR} + ) + +# ensure that any non-zero result variable gets propagated +if(NOT RET EQUAL 0) + message(FATAL_ERROR "CTest return non-zero exit code: ${RET}") +endif() diff --git a/packages/kokkos/cmake/KokkosCTest.cmake.in b/packages/kokkos/cmake/KokkosCTest.cmake.in new file mode 100644 index 0000000000000000000000000000000000000000..b6917f3cc1897aa6b1f0876560bb08c0c87b4c3a --- /dev/null +++ b/packages/kokkos/cmake/KokkosCTest.cmake.in @@ -0,0 +1,261 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") +endif() + +include(ProcessorCount) +ProcessorCount(CTEST_PROCESSOR_COUNT) + +cmake_policy(SET CMP0009 NEW) +cmake_policy(SET CMP0011 NEW) + +# ---------------------------------------------------------------------------- # +# -- Commands +# ---------------------------------------------------------------------------- # +find_program(CTEST_CMAKE_COMMAND NAMES cmake) +find_program(CTEST_UNAME_COMMAND NAMES uname) + +find_program(CTEST_BZR_COMMAND NAMES bzr) +find_program(CTEST_CVS_COMMAND NAMES cvs) +find_program(CTEST_GIT_COMMAND NAMES git) +find_program(CTEST_HG_COMMAND NAMES hg) +find_program(CTEST_P4_COMMAND NAMES p4) +find_program(CTEST_SVN_COMMAND NAMES svn) + +find_program(VALGRIND_COMMAND NAMES valgrind) +find_program(GCOV_COMMAND NAMES gcov) +find_program(LCOV_COMMAND NAMES 
llvm-cov) +find_program(MEMORYCHECK_COMMAND NAMES valgrind ) + +set(MEMORYCHECK_TYPE Valgrind) +# set(MEMORYCHECK_TYPE Purify) +# set(MEMORYCHECK_TYPE BoundsChecker) +# set(MEMORYCHECK_TYPE ThreadSanitizer) +# set(MEMORYCHECK_TYPE AddressSanitizer) +# set(MEMORYCHECK_TYPE LeakSanitizer) +# set(MEMORYCHECK_TYPE MemorySanitizer) +# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer) +set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full") + +# ---------------------------------------------------------------------------- # +# -- Settings +# ---------------------------------------------------------------------------- # +## -- Process timeout in seconds +set(CTEST_TIMEOUT "7200") +## -- Set output to English +set(ENV{LC_MESSAGES} "en_EN" ) + + +# ---------------------------------------------------------------------------- # +# -- Copy ctest configuration file +# ---------------------------------------------------------------------------- # +macro(COPY_CTEST_CONFIG_FILES) + + foreach(_FILE CTestConfig.cmake CTestCustom.cmake) + + # if current directory is not binary or source directory + if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND + NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") + + # if file exists in current directory + if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} + ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) + endif() + + # if source and binary differ + elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") + + # if file exists in source directory but not in binary directory + if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND + NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE}) + configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE} + ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) + endif() + + endif() + endforeach() + +endmacro() + +ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}") + +message(STATUS "CTEST_MODEL: ${CTEST_MODEL}") + 
+#-------------------------------------------------------------------------#
+#   Start
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...")
+message(STATUS "")
+
+ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST}
+    ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY})
+
+
+#-------------------------------------------------------------------------#
+#   Config
+#
+copy_ctest_config_files()
+ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}")
+
+
+#-------------------------------------------------------------------------#
+#   Update
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...")
+message(STATUS "")
+
+ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}"
+    RETURN_VALUE up_ret)
+
+
+#-------------------------------------------------------------------------#
+#   Configure
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...")
+message(STATUS "")
+
+ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}"
+    SOURCE ${CTEST_SOURCE_DIRECTORY}
+    ${APPEND_CTEST}
+    OPTIONS "${CTEST_CONFIGURE_OPTIONS}"
+    RETURN_VALUE config_ret)
+
+
+#-------------------------------------------------------------------------#
+# Echo the configure log so configuration failures are visible in CI output
+#
+file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log")
+# should only have one but loop just for safety
+foreach(_LOG ${_configure_log})
+    file(READ ${_LOG} _LOG_MESSAGE)
+    message(STATUS "Configure Log: ${_LOG}")
+    message(STATUS "\n${_LOG_MESSAGE}\n")
+endforeach()
+
+
+#-------------------------------------------------------------------------#
+#   Build
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...")
+message(STATUS "")
+
+ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}"
+    ${APPEND_CTEST}
+    RETURN_VALUE build_ret)
+
+
+#-------------------------------------------------------------------------#
+# Echo the build log so compile errors are visible in CI output
+#
+file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log")
+# should only have one but loop just for safety
+foreach(_LOG ${_build_log})
+    file(READ ${_LOG} _LOG_MESSAGE)
+    message(STATUS "Build Log: ${_LOG}")
+    message(STATUS "\n${_LOG_MESSAGE}\n")
+endforeach()
+
+
+#-------------------------------------------------------------------------#
+#   Test
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...")
+message(STATUS "")
+
+ctest_test(RETURN_VALUE test_ret
+    ${APPEND_CTEST}
+    ${START_CTEST}
+    ${END_CTEST}
+    ${STRIDE_CTEST}
+    ${INCLUDE_CTEST}
+    ${EXCLUDE_CTEST}
+    ${INCLUDE_LABEL_CTEST}
+    ${EXCLUDE_LABEL_CTEST}
+    ${PARALLEL_LEVEL_CTEST}
+    ${STOP_TIME_CTEST}
+    SCHEDULE_RANDOM OFF)
+
+
+#-------------------------------------------------------------------------#
+#   Coverage
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...")
+message(STATUS "")
+
+execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS}
+    WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY}
+    ERROR_QUIET)
+
+ctest_coverage(${APPEND_CTEST}
+    ${CTEST_COVERAGE_LABELS}
+    RETURN_VALUE cov_ret)
+
+
+#-------------------------------------------------------------------------#
+#   MemCheck
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...")
+message(STATUS "")
+
+ctest_memcheck(RETURN_VALUE mem_ret
+    ${APPEND_CTEST}
+    ${START_CTEST}
+    ${END_CTEST}
+    ${STRIDE_CTEST}
+    ${INCLUDE_CTEST}
+    ${EXCLUDE_CTEST}
+    ${INCLUDE_LABEL_CTEST}
+    ${EXCLUDE_LABEL_CTEST}
+    ${PARALLEL_LEVEL_CTEST})
+
+
+#-------------------------------------------------------------------------#
+#   Submit
+#
+message(STATUS "")
+message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...")
+message(STATUS "")
+
+file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake")
+foreach(_FILE
${NOTE_FILES})
+    message(STATUS "Including CTest notes files: \"${_FILE}\"...")
+    include("${_FILE}")
+endforeach()
+
+# capture submit error so it doesn't fail because of a submission error
+ctest_submit(RETURN_VALUE submit_ret
+    RETRY_COUNT 2
+    RETRY_DELAY 10
+    CAPTURE_CMAKE_ERROR submit_err)
+
+#-------------------------------------------------------------------------#
+#   Finish
+#
+message(STATUS "")
+# NOTE(review): ${STAGES} is never set anywhere in this script — TODO confirm it is meant to come from the caller, else it expands empty
+message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})")
+message(STATUS "")
+
+#-------------------------------------------------------------------------#
+#   Non-zero exit codes for important errors
+#
+if(NOT config_ret EQUAL 0)
+    message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}")
+endif()
+
+if(NOT build_ret EQUAL 0)
+    message(FATAL_ERROR "Error during build! Exit code: ${build_ret}")
+endif()
+
+if(NOT test_ret EQUAL 0)
+    message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}")
+endif()
diff --git a/packages/kokkos/cmake/KokkosConfig.cmake.in b/packages/kokkos/cmake/KokkosConfig.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..44a8fcd9c319326399ab19146f8cf213dbb51f64
--- /dev/null
+++ b/packages/kokkos/cmake/KokkosConfig.cmake.in
@@ -0,0 +1,62 @@
+# No need for policy push/pop. CMake also manages a new entry for scripts
+# loaded by include() and find_package() commands except when invoked with
+# the NO_POLICY_SCOPE option
+# CMP0057 + NEW -> IN_LIST operator in IF(...)
+CMAKE_POLICY(SET CMP0057 NEW) + +# Compute paths +@PACKAGE_INIT@ + +#Find dependencies +INCLUDE(CMakeFindDependencyMacro) + +#This needs to go above the KokkosTargets in case +#the Kokkos targets depend in some way on the TPL imports +@KOKKOS_TPL_EXPORTS@ + +GET_FILENAME_COMPONENT(Kokkos_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +INCLUDE("${Kokkos_CMAKE_DIR}/KokkosTargets.cmake") +INCLUDE("${Kokkos_CMAKE_DIR}/KokkosConfigCommon.cmake") +UNSET(Kokkos_CMAKE_DIR) + +# check for conflicts +IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS AND + "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + MESSAGE(STATUS "'launch_compiler' implies global redirection of targets depending on Kokkos to appropriate compiler.") + MESSAGE(STATUS "'separable_compilation' implies explicitly defining where redirection occurs via 'kokkos_compilation(PROJECT|TARGET|SOURCE|DIRECTORY ...)'") + MESSAGE(FATAL_ERROR "Conflicting COMPONENTS: 'launch_compiler' and 'separable_compilation'") +ENDIF() + +IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) + # + # if find_package(Kokkos COMPONENTS launch_compiler) then rely on the + # RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK to always redirect to the + # appropriate compiler for Kokkos + # + + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos") + kokkos_compilation( + GLOBAL + CHECK_CUDA_COMPILES) + +ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + # + # if CUDA was enabled, separable compilation was not specified, and current compiler + # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. 
+ # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, + # otherwise, the original command will be executed + # + + # run test to see if CMAKE_CXX_COMPILER=nvcc_wrapper + kokkos_compiler_is_nvcc(IS_NVCC ${CMAKE_CXX_COMPILER}) + + # if not nvcc_wrapper and Kokkos_LAUNCH_COMPILER was not set to OFF + IF(NOT IS_NVCC AND (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER)) + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos") + kokkos_compilation(GLOBAL) + ENDIF() + + # be mindful of the environment, pollution is bad + UNSET(IS_NVCC) +ENDIF() diff --git a/packages/kokkos/cmake/KokkosConfigCommon.cmake.in b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in new file mode 100644 index 0000000000000000000000000000000000000000..ab93e65afe97ab9be9295312e6cd879a1aff6b27 --- /dev/null +++ b/packages/kokkos/cmake/KokkosConfigCommon.cmake.in @@ -0,0 +1,274 @@ +SET(Kokkos_DEVICES @KOKKOS_ENABLED_DEVICES@) +SET(Kokkos_OPTIONS @KOKKOS_ENABLED_OPTIONS@) +SET(Kokkos_TPLS @KOKKOS_ENABLED_TPLS@) +SET(Kokkos_ARCH @KOKKOS_ENABLED_ARCH_LIST@) +SET(Kokkos_CXX_COMPILER "@CMAKE_CXX_COMPILER@") +SET(Kokkos_CXX_COMPILER_ID "@KOKKOS_CXX_COMPILER_ID@") + +# These are needed by KokkosKernels +FOREACH(DEV ${Kokkos_DEVICES}) + SET(Kokkos_ENABLE_${DEV} ON) +ENDFOREACH() + +IF(NOT Kokkos_FIND_QUIETLY) + MESSAGE(STATUS "Enabled Kokkos devices: ${Kokkos_DEVICES}") +ENDIF() + +IF (Kokkos_ENABLE_CUDA) + # If we are building CUDA, we have tricked CMake because we declare a CXX project + # If the default C++ standard for a given compiler matches the requested + # standard, then CMake just omits the -std flag in later versions of CMake + # This breaks CUDA compilation (CUDA compiler can have a different default + # -std then the underlying host compiler by itself). 
Setting this variable + # forces CMake to always add the -std flag even if it thinks it doesn't need it + SET(CMAKE_CXX_STANDARD_DEFAULT 98 CACHE INTERNAL "" FORCE) +ENDIF() + +SET(KOKKOS_USE_CXX_EXTENSIONS @KOKKOS_USE_CXX_EXTENSIONS@) +IF (NOT DEFINED CMAKE_CXX_EXTENSIONS OR CMAKE_CXX_EXTENSIONS) + IF (NOT KOKKOS_USE_CXX_EXTENSIONS) + MESSAGE(WARNING "The installed Kokkos configuration does not support CXX extensions. Forcing -DCMAKE_CXX_EXTENSIONS=Off") + SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "" FORCE) + ENDIF() +ENDIF() + +include(FindPackageHandleStandardArgs) + +# This function makes sure that Kokkos was built with the requested backends +# and target architectures and generates a fatal error if it was not. +# +# kokkos_check( +# [DEVICES <devices>...] # Set of backends (e.g. "OpenMP" and/or "Cuda") +# [ARCH <archs>...] # Target architectures (e.g. "Power9" and/or "Volta70") +# [OPTIONS <options>...] # Optional settings (e.g. "TUNING") +# [TPLS <tpls>...] # Third party libraries +# [RETURN_VALUE <result>] # Set a variable that indicates the result of the +# # check instead of a fatal error +# ) +function(kokkos_check) + set(ALLOWED_ARGS DEVICES ARCH OPTIONS TPLS) + cmake_parse_arguments(KOKKOS_CHECK "" "RETURN_VALUE" "${ALLOWED_ARGS}" ${ARGN}) + foreach(_arg ${KOKKOS_CHECK_UNPARSED_ARGUMENTS}) + message(SEND_ERROR "Argument '${_arg}' passed to kokkos_check() was not recognized") + endforeach() + # Get the list of keywords that were actually passed to the function. + set(REQUESTED_ARGS) + foreach(arg ${ALLOWED_ARGS}) + if(KOKKOS_CHECK_${arg}) + list(APPEND REQUESTED_ARGS ${arg}) + endif() + endforeach() + set(KOKKOS_CHECK_SUCCESS TRUE) + foreach(arg ${REQUESTED_ARGS}) + # Define variables named after the required arguments that are provided by + # the Kokkos install. 
+ foreach(requested ${KOKKOS_CHECK_${arg}}) + foreach(provided ${Kokkos_${arg}}) + STRING(TOUPPER ${requested} REQUESTED_UC) + STRING(TOUPPER ${provided} PROVIDED_UC) + if(PROVIDED_UC STREQUAL REQUESTED_UC) + string(REPLACE ";" " " ${requested} "${KOKKOS_CHECK_${arg}}") + endif() + endforeach() + endforeach() + # Somewhat divert the CMake function below from its original purpose and + # use it to check that there are variables defined for all required + # arguments. Success or failure messages will be displayed but we are + # responsible for signaling failure and skip the build system generation. + if (KOKKOS_CHECK_RETURN_VALUE) + set(Kokkos_${arg}_FIND_QUIETLY ON) + endif() + find_package_handle_standard_args("Kokkos_${arg}" DEFAULT_MSG + ${KOKKOS_CHECK_${arg}}) + if(NOT Kokkos_${arg}_FOUND) + set(KOKKOS_CHECK_SUCCESS FALSE) + endif() + endforeach() + if(NOT KOKKOS_CHECK_SUCCESS AND NOT KOKKOS_CHECK_RETURN_VALUE) + message(FATAL_ERROR "Kokkos does NOT provide all backends and/or architectures requested") + else() + set(${KOKKOS_CHECK_RETURN_VALUE} ${KOKKOS_CHECK_SUCCESS} PARENT_SCOPE) + endif() +endfunction() + +# A test to check whether a downstream project set the C++ compiler to NVCC or not +# this is called only when Kokkos was installed with Kokkos_ENABLE_CUDA=ON +FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER) + # Check if the compiler is nvcc (which really means nvcc_wrapper). 
+ EXECUTE_PROCESS(COMMAND ${COMPILER} ${ARGN} --version + OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RET) + # something went wrong + IF(RET GREATER 0) + SET(${VAR} false PARENT_SCOPE) + ELSE() + STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + SET(${VAR} true PARENT_SCOPE) + ELSE() + SET(${VAR} false PARENT_SCOPE) + ENDIF() + ENDIF() +ENDFUNCTION() + +# this function checks whether the current CXX compiler supports building CUDA +FUNCTION(kokkos_cxx_compiler_cuda_test _VAR _COMPILER) + + FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu +" +#include <cuda.h> +#include <cstdlib> + +__global__ +void kernel(int sz, double* data) +{ + int _beg = blockIdx.x * blockDim.x + threadIdx.x; + for(int i = _beg; i < sz; ++i) + data[i] += static_cast<double>(i); +} + +int main() +{ + double* data = NULL; + int blocks = 64; + int grids = 64; + int ret = cudaMalloc(&data, blocks * grids * sizeof(double)); + if(ret != cudaSuccess) + return EXIT_FAILURE; + kernel<<<grids, blocks>>>(blocks * grids, data); + cudaDeviceSynchronize(); + return EXIT_SUCCESS; +} +") + + # save the command for debugging + SET(_COMMANDS "${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu") + + # use execute_process instead of try compile because we want to set custom compiler + EXECUTE_PROCESS(COMMAND ${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu + RESULT_VARIABLE _RET + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/compile_tests + TIMEOUT 15 + OUTPUT_QUIET + ERROR_QUIET) + + IF(NOT _RET EQUAL 0) + # save the command for debugging + SET(_COMMANDS "${_COMMAND}\n${_COMPILER} --cuda-gpu-arch=sm_35 
${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu") + # try the compile test again with clang arguments + EXECUTE_PROCESS(COMMAND ${_COMPILER} --cuda-gpu-arch=sm_35 -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu + RESULT_VARIABLE _RET + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/compile_tests + TIMEOUT 15 + OUTPUT_QUIET + ERROR_QUIET) + ENDIF() + + SET(${_VAR}_COMMANDS "${_COMMANDS}" PARENT_SCOPE) + SET(${_VAR} ${_RET} PARENT_SCOPE) +ENDFUNCTION() + +# this function is provided to easily select which files use the same compiler as Kokkos +# when it was installed (or nvcc_wrapper): +# +# GLOBAL --> all files +# TARGET --> all files in a target +# SOURCE --> specific source files +# DIRECTORY --> all files in directory +# PROJECT --> all files/targets in a project/subproject +# +# Use the COMPILER argument to specify a compiler, if needed. By default, it will +# set the values to ${Kokkos_CXX_COMPILER} unless Kokkos_ENABLE_CUDA=ON and +# Kokkos_CXX_COMPILER_ID is NVIDIA, then it will set it to nvcc_wrapper +# +# Use CHECK_CUDA_COMPILES to run a check when CUDA is enabled +# +FUNCTION(kokkos_compilation) + CMAKE_PARSE_ARGUMENTS(COMP + "GLOBAL;PROJECT;CHECK_CUDA_COMPILES" + "COMPILER" + "DIRECTORY;TARGET;SOURCE;COMMAND_PREFIX" + ${ARGN}) + + # if built w/o CUDA support, we want to basically make this a no-op + SET(_Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) + + # search relative first and then absolute + SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@") + + # find kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${_HINTS} + PATHS ${_HINTS} + PATH_SUFFIXES bin) + + IF(NOT Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. 
Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") + ENDIF() + + # if COMPILER was not specified, assume Kokkos_CXX_COMPILER + IF(NOT COMP_COMPILER) + SET(COMP_COMPILER ${Kokkos_CXX_COMPILER}) + IF(_Kokkos_ENABLE_CUDA AND Kokkos_CXX_COMPILER_ID STREQUAL NVIDIA) + # find nvcc_wrapper + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${_HINTS} + PATHS ${_HINTS} + PATH_SUFFIXES bin) + # fatal if we can't nvcc_wrapper + IF(NOT Kokkos_NVCC_WRAPPER) + MESSAGE(FATAL_ERROR "Kokkos could not find nvcc_wrapper. Please set '-DKokkos_NVCC_WRAPPER=/path/to/nvcc_wrapper'") + ENDIF() + SET(COMP_COMPILER ${Kokkos_NVCC_WRAPPER}) + ENDIF() + ENDIF() + + # check that the original compiler still exists! + IF(NOT EXISTS ${COMP_COMPILER}) + MESSAGE(FATAL_ERROR "Kokkos could not find original compiler: '${COMP_COMPILER}'") + ENDIF() + + # try to ensure that compiling cuda code works! + IF(_Kokkos_ENABLE_CUDA AND COMP_CHECK_CUDA_COMPILES) + + # this may fail if kokkos_compiler launcher was used during install + kokkos_cxx_compiler_cuda_test(_COMPILES_CUDA + ${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}) + + # if above failed, throw an error + IF(NOT _COMPILES_CUDA) + MESSAGE(FATAL_ERROR "kokkos_cxx_compiler_cuda_test failed! Test commands:\n${_COMPILES_CUDA_COMMANDS}") + ENDIF() + ENDIF() + + IF(COMP_COMMAND_PREFIX) + SET(_PREFIX "${COMP_COMMAND_PREFIX}") + STRING(REPLACE ";" " " _PREFIX "${COMP_COMMAND_PREFIX}") + SET(Kokkos_COMPILER_LAUNCHER "${_PREFIX} ${Kokkos_COMPILE_LAUNCHER}") + ENDIF() + + IF(COMP_GLOBAL) + # if global, don't bother setting others + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + ELSE() + FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) 
+ IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + UNSET(COMP_${_TYPE}) + ENDIF() + # set the properties if defined + IF(COMP_${_TYPE}) + # MESSAGE(STATUS "Using ${COMP_COMPILER} :: ${_TYPE} :: ${COMP_${_TYPE}}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + ENDIF() + ENDFOREACH() + ENDIF() +ENDFUNCTION() diff --git a/packages/kokkos/cmake/KokkosCore_Config_HeaderSet.in b/packages/kokkos/cmake/KokkosCore_Config_HeaderSet.in new file mode 100644 index 0000000000000000000000000000000000000000..8d1eee31b2a78754e599acc4de4b80ca32dd9d06 --- /dev/null +++ b/packages/kokkos/cmake/KokkosCore_Config_HeaderSet.in @@ -0,0 +1,49 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef @HEADER_GUARD_TAG@ +#define @HEADER_GUARD_TAG@ + +@INCLUDE_NEXT_FILE@ + +#endif diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in new file mode 100644 index 0000000000000000000000000000000000000000..fbfae3711ec14573b4c3067aea4a8625d6b2ad8c --- /dev/null +++ b/packages/kokkos/cmake/KokkosCore_config.h.in @@ -0,0 +1,103 @@ + +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error \ + "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif + +// KOKKOS_VERSION % 100 is the patch level +// KOKKOS_VERSION / 100 % 100 is the minor version +// KOKKOS_VERSION / 10000 is the major version +#cmakedefine KOKKOS_VERSION @KOKKOS_VERSION@ + +/* Execution Spaces */ +#cmakedefine KOKKOS_ENABLE_SERIAL +#cmakedefine KOKKOS_ENABLE_OPENMP +#cmakedefine KOKKOS_ENABLE_OPENMPTARGET +#cmakedefine KOKKOS_ENABLE_THREADS +#cmakedefine KOKKOS_ENABLE_CUDA +#cmakedefine KOKKOS_ENABLE_HIP +#cmakedefine KOKKOS_ENABLE_HPX +#cmakedefine KOKKOS_ENABLE_MEMKIND +#cmakedefine KOKKOS_ENABLE_LIBRT +#cmakedefine KOKKOS_ENABLE_SYCL + +#ifndef __CUDA_ARCH__ +#cmakedefine KOKKOS_ENABLE_TM +#cmakedefine KOKKOS_USE_ISA_X86_64 +#cmakedefine KOKKOS_USE_ISA_KNC +#cmakedefine KOKKOS_USE_ISA_POWERPCLE +#cmakedefine KOKKOS_USE_ISA_POWERPCBE +#endif + +/* General Settings */ +#cmakedefine KOKKOS_ENABLE_CXX14 +#cmakedefine KOKKOS_ENABLE_CXX17 +#cmakedefine KOKKOS_ENABLE_CXX20 + +#cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE +#cmakedefine KOKKOS_ENABLE_CUDA_UVM +#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA +#cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR +#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC +#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE +#cmakedefine KOKKOS_ENABLE_HPX_ASYNC_DISPATCH +#cmakedefine KOKKOS_ENABLE_DEBUG +#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK +#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK +#cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS +#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT +#cmakedefine KOKKOS_ENABLE_TUNING +#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE +#cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS +#cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK +#cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN +#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION + +/* TPL Settings */ +#cmakedefine KOKKOS_ENABLE_HWLOC +#cmakedefine KOKKOS_USE_LIBRT +#cmakedefine KOKKOS_ENABLE_HBWSPACE +#cmakedefine KOKKOS_ENABLE_LIBDL +#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND + 
+#cmakedefine KOKKOS_COMPILER_CUDA_VERSION @KOKKOS_COMPILER_CUDA_VERSION@ + +#cmakedefine KOKKOS_ARCH_SSE42 +#cmakedefine KOKKOS_ARCH_ARMV80 +#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX +#cmakedefine KOKKOS_ARCH_ARMV81 +#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2 +#cmakedefine KOKKOS_ARCH_AMD_AVX2 +#cmakedefine KOKKOS_ARCH_AVX +#cmakedefine KOKKOS_ARCH_AVX2 +#cmakedefine KOKKOS_ARCH_AVX512XEON +#cmakedefine KOKKOS_ARCH_KNC +#cmakedefine KOKKOS_ARCH_AVX512MIC +#cmakedefine KOKKOS_ARCH_POWER7 +#cmakedefine KOKKOS_ARCH_POWER8 +#cmakedefine KOKKOS_ARCH_POWER9 +#cmakedefine KOKKOS_ARCH_INTEL_GEN +#cmakedefine KOKKOS_ARCH_KEPLER +#cmakedefine KOKKOS_ARCH_KEPLER30 +#cmakedefine KOKKOS_ARCH_KEPLER32 +#cmakedefine KOKKOS_ARCH_KEPLER35 +#cmakedefine KOKKOS_ARCH_KEPLER37 +#cmakedefine KOKKOS_ARCH_MAXWELL +#cmakedefine KOKKOS_ARCH_MAXWELL50 +#cmakedefine KOKKOS_ARCH_MAXWELL52 +#cmakedefine KOKKOS_ARCH_MAXWELL53 +#cmakedefine KOKKOS_ARCH_PASCAL +#cmakedefine KOKKOS_ARCH_PASCAL60 +#cmakedefine KOKKOS_ARCH_PASCAL61 +#cmakedefine KOKKOS_ARCH_VOLTA +#cmakedefine KOKKOS_ARCH_VOLTA70 +#cmakedefine KOKKOS_ARCH_VOLTA72 +#cmakedefine KOKKOS_ARCH_TURING75 +#cmakedefine KOKKOS_ARCH_AMPERE80 +#cmakedefine KOKKOS_ARCH_AMPERE86 +#cmakedefine KOKKOS_ARCH_AMD_ZEN +#cmakedefine KOKKOS_ARCH_AMD_ZEN2 + +#cmakedefine KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF diff --git a/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in b/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in new file mode 100644 index 0000000000000000000000000000000000000000..626ef5a8ebefcaf7adcdeaa3b285f44892527dbc --- /dev/null +++ b/packages/kokkos/cmake/KokkosTrilinosConfig.cmake.in @@ -0,0 +1,17 @@ +IF (NOT TARGET Kokkos::kokkos) + # Compute the installation prefix relative to this file. 
+ get_filename_component(KOKKOS_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) + get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) + get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) + get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) + if(KOKKOS_IMPORT_PREFIX STREQUAL "/") + set(KOKKOS_IMPORT_PREFIX "") + endif() + add_library(Kokkos::kokkos INTERFACE IMPORTED) + set_target_properties(Kokkos::kokkos PROPERTIES + INTERFACE_LINK_LIBRARIES "@Kokkos_LIBRARIES@;@KOKKOS_LINK_OPTIONS@" + INTERFACE_COMPILE_FEATURES "@KOKKOS_CXX_STANDARD_FEATURE@" + INTERFACE_COMPILE_OPTIONS "@KOKKOS_ALL_COMPILE_OPTIONS@" + INTERFACE_INCLUDE_DIRECTORIES "${KOKKOS_IMPORT_PREFIX}/include" + ) +ENDIF() diff --git a/packages/kokkos/cmake/Modules/CudaToolkit.cmake b/packages/kokkos/cmake/Modules/CudaToolkit.cmake new file mode 100644 index 0000000000000000000000000000000000000000..eda5541f7c0633a868285190e9a4c39c275adf6b --- /dev/null +++ b/packages/kokkos/cmake/Modules/CudaToolkit.cmake @@ -0,0 +1,888 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +#[=======================================================================[.rst: +FindCUDAToolkit +--------------- + +This script locates the NVIDIA CUDA toolkit and the associated libraries, but +does not require the ``CUDA`` language be enabled for a given project. This +module does not search for the NVIDIA CUDA Samples. + +Search Behavior +^^^^^^^^^^^^^^^ + +Finding the CUDA Toolkit requires finding the ``nvcc`` executable, which is +searched for in the following order: + +1. If the ``CUDA`` language has been enabled we will use the directory + containing the compiler as the first search location for ``nvcc``. + +2. 
If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g., + ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it + will be searched. If both an environment variable **and** a + configuration variable are specified, the *configuration* variable takes + precedence. + + The directory specified here must be such that the executable ``nvcc`` can be + found underneath the directory specified by ``CUDAToolkit_ROOT``. If + ``CUDAToolkit_ROOT`` is specified, but no ``nvcc`` is found underneath, this + package is marked as **not** found. No subsequent search attempts are + performed. + +3. If the CUDA_PATH environment variable is defined, it will be searched. + +4. The user's path is searched for ``nvcc`` using :command:`find_program`. If + this is found, no subsequent search attempts are performed. Users are + responsible for ensuring that the first ``nvcc`` to show up in the path is + the desired path in the event that multiple CUDA Toolkits are installed. + +5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is + used. No subsequent search attempts are performed. No default symbolic link + location exists for the Windows platform. + +6. The platform specific default install locations are searched. If exactly one + candidate is found, this is used. 
The default CUDA Toolkit install locations + searched are: + + +-------------+-------------------------------------------------------------+ + | Platform | Search Pattern | + +=============+=============================================================+ + | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Other Unix | ``/usr/local/cuda-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` | + +-------------+-------------------------------------------------------------+ + + Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as + ``/usr/local/cuda-9.0`` or + ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`` + + .. note:: + + When multiple CUDA Toolkits are installed in the default location of a + system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0`` + exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this + package is marked as **not** found. + + There are too many factors involved in making an automatic decision in + the presence of multiple CUDA Toolkits being installed. In this + situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or + (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for + :command:`find_program` to find. + +Options +^^^^^^^ + +``VERSION`` + If specified, describes the version of the CUDA Toolkit to search for. + +``REQUIRED`` + If specified, configuration will error if a suitable CUDA Toolkit is not + found. + +``QUIET`` + If specified, the search for a suitable CUDA Toolkit will not produce any + messages. + +``EXACT`` + If specified, the CUDA Toolkit is considered found only if the exact + ``VERSION`` specified is recovered. + +Imported targets +^^^^^^^^^^^^^^^^ + +An :ref:`imported target <Imported targets>` named ``CUDA::toolkit`` is provided. 
+
+This module defines :prop_tgt:`IMPORTED` targets for each
+of the following libraries that are part of the CUDAToolkit:
+
+- :ref:`CUDA Runtime Library<cuda_toolkit_rt_lib>`
+- :ref:`CUDA Driver Library<cuda_toolkit_driver_lib>`
+- :ref:`cuBLAS<cuda_toolkit_cuBLAS>`
+- :ref:`cuFFT<cuda_toolkit_cuFFT>`
+- :ref:`cuRAND<cuda_toolkit_cuRAND>`
+- :ref:`cuSOLVER<cuda_toolkit_cuSOLVER>`
+- :ref:`cuSPARSE<cuda_toolkit_cuSPARSE>`
+- :ref:`cuPTI<cuda_toolkit_cupti>`
+- :ref:`NPP<cuda_toolkit_NPP>`
+- :ref:`nvBLAS<cuda_toolkit_nvBLAS>`
+- :ref:`nvGRAPH<cuda_toolkit_nvGRAPH>`
+- :ref:`nvJPEG<cuda_toolkit_nvJPEG>`
+- :ref:`nvidia-ML<cuda_toolkit_nvML>`
+- :ref:`nvRTC<cuda_toolkit_nvRTC>`
+- :ref:`nvToolsExt<cuda_toolkit_nvToolsExt>`
+- :ref:`OpenCL<cuda_toolkit_opencl>`
+- :ref:`cuLIBOS<cuda_toolkit_cuLIBOS>`
+
+.. _`cuda_toolkit_rt_lib`:
+
+CUDA Runtime Library
+""""""""""""""""""""
+
+The CUDA Runtime library (cudart) is what most applications will typically
+need to link against to make any calls such as `cudaMalloc`, and `cudaFree`.
+
+Targets Created:
+
+- ``CUDA::cudart``
+- ``CUDA::cudart_static``
+
+.. _`cuda_toolkit_driver_lib`:
+
+CUDA Driver Library
+""""""""""""""""""""
+
+The CUDA Driver library (cuda) is used by applications that use calls
+such as `cuMemAlloc`, and `cuMemFree`. This is generally used by advanced
+users.
+
+Targets Created:
+
+- ``CUDA::cuda_driver``
+
+.. _`cuda_toolkit_cuBLAS`:
+
+cuBLAS
+""""""
+
+The `cuBLAS <https://docs.nvidia.com/cuda/cublas/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cublas``
+- ``CUDA::cublas_static``
+
+.. _`cuda_toolkit_cuFFT`:
+
+cuFFT
+"""""
+
+The `cuFFT <https://docs.nvidia.com/cuda/cufft/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cufft``
+- ``CUDA::cufftw``
+- ``CUDA::cufft_static``
+- ``CUDA::cufftw_static``
+
+cuRAND
+""""""
+
+The `cuRAND <https://docs.nvidia.com/cuda/curand/index.html>`_ library.
+ +Targets Created: + +- ``CUDA::curand`` +- ``CUDA::curand_static`` + +.. _`cuda_toolkit_cuSOLVER`: + +cuSOLVER +"""""""" + +The `cuSOLVER <https://docs.nvidia.com/cuda/cusolver/index.html>`_ library. + +Targets Created: + +- ``CUDA::cusolver`` +- ``CUDA::cusolver_static`` + +.. _`cuda_toolkit_cuSPARSE`: + +cuSPARSE +"""""""" + +The `cuSPARSE <https://docs.nvidia.com/cuda/cusparse/index.html>`_ library. + +Targets Created: + +- ``CUDA::cusparse`` +- ``CUDA::cusparse_static`` + +.. _`cuda_toolkit_cupti`: + +cupti +""""" + +The `NVIDIA CUDA Profiling Tools Interface <https://developer.nvidia.com/CUPTI>`_. + +Targets Created: + +- ``CUDA::cupti`` +- ``CUDA::cupti_static`` + +.. _`cuda_toolkit_NPP`: + +NPP +""" + +The `NPP <https://docs.nvidia.com/cuda/npp/index.html>`_ libraries. + +Targets Created: + +- `nppc`: + + - ``CUDA::nppc`` + - ``CUDA::nppc_static`` + +- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h` + + - ``CUDA::nppial`` + - ``CUDA::nppial_static`` + +- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h` + + - ``CUDA::nppicc`` + - ``CUDA::nppicc_static`` + +- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h` + + - ``CUDA::nppicom`` + - ``CUDA::nppicom_static`` + +- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h` + + - ``CUDA::nppidei`` + - ``CUDA::nppidei_static`` + +- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h` + + - ``CUDA::nppif`` + - ``CUDA::nppif_static`` + +- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h` + + - ``CUDA::nppig`` + - ``CUDA::nppig_static`` + +- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h` + + - ``CUDA::nppim`` + - ``CUDA::nppim_static`` + +- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h` + + - 
``CUDA::nppist`` + - ``CUDA::nppist_static`` + +- `nppisu`: Memory support functions in `nppi_support_functions.h` + + - ``CUDA::nppisu`` + - ``CUDA::nppisu_static`` + +- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h` + + - ``CUDA::nppitc`` + - ``CUDA::nppitc_static`` + +- `npps`: + + - ``CUDA::npps`` + - ``CUDA::npps_static`` + +.. _`cuda_toolkit_nvBLAS`: + +nvBLAS +"""""" + +The `nvBLAS <https://docs.nvidia.com/cuda/nvblas/index.html>`_ libraries. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvblas`` + +.. _`cuda_toolkit_nvGRAPH`: + +nvGRAPH +""""""" + +The `nvGRAPH <https://docs.nvidia.com/cuda/nvgraph/index.html>`_ library. + +Targets Created: + +- ``CUDA::nvgraph`` +- ``CUDA::nvgraph_static`` + + +.. _`cuda_toolkit_nvJPEG`: + +nvJPEG +"""""" + +The `nvJPEG <https://docs.nvidia.com/cuda/nvjpeg/index.html>`_ library. +Introduced in CUDA 10. + +Targets Created: + +- ``CUDA::nvjpeg`` +- ``CUDA::nvjpeg_static`` + +.. _`cuda_toolkit_nvRTC`: + +nvRTC +""""" + +The `nvRTC <https://docs.nvidia.com/cuda/nvrtc/index.html>`_ (Runtime Compilation) library. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvrtc`` + +.. _`cuda_toolkit_nvml`: + +nvidia-ML +""""""""" + +The `NVIDIA Management Library <https://developer.nvidia.com/nvidia-management-library-nvml>`_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvml`` + +.. _`cuda_toolkit_nvToolsExt`: + +nvToolsExt +"""""""""" + +The `NVIDIA Tools Extension <https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm>`_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvToolsExt`` + +.. _`cuda_toolkit_opencl`: + +OpenCL +"""""" + +The `NVIDIA OpenCL Library <https://developer.nvidia.com/opencl>`_. +This is a shared library only. + +Targets Created: + +- ``CUDA::OpenCL`` + +.. 
_`cuda_toolkit_cuLIBOS`:
+
+cuLIBOS
+"""""""
+
+The cuLIBOS library is a backend thread abstraction layer library which is
+static only. The ``CUDA::cublas_static``, ``CUDA::cusparse_static``,
+``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP
+libraries all automatically have this dependency linked.
+
+Target Created:
+
+- ``CUDA::culibos``
+
+**Note**: direct usage of this target by consumers should not be necessary.
+
+.. _`cuda_toolkit_cuRAND`:
+
+
+
+Result variables
+^^^^^^^^^^^^^^^^
+
+``CUDAToolkit_FOUND``
+  A boolean specifying whether or not the CUDA Toolkit was found.
+
+``CUDAToolkit_VERSION``
+  The exact version of the CUDA Toolkit found (as reported by
+  ``nvcc --version``).
+
+``CUDAToolkit_VERSION_MAJOR``
+  The major version of the CUDA Toolkit.
+
+``CUDAToolkit_VERSION_MINOR``
+  The minor version of the CUDA Toolkit.
+
+``CUDAToolkit_VERSION_PATCH``
+  The patch version of the CUDA Toolkit.
+
+``CUDAToolkit_BIN_DIR``
+  The path to the CUDA Toolkit binary directory that contains the CUDA
+  executable ``nvcc``.
+
+``CUDAToolkit_INCLUDE_DIRS``
+  The path to the CUDA Toolkit ``include`` folder containing the header files
+  required to compile a project linking against CUDA.
+
+``CUDAToolkit_LIBRARY_DIR``
+  The path to the CUDA Toolkit library directory that contains the CUDA
+  Runtime library ``cudart``.
+
+``CUDAToolkit_TARGET_DIR``
+  The path to the CUDA Toolkit directory including the target architecture
+  when cross-compiling. When not cross-compiling this will be equivalent to
+  ``CUDAToolkit_ROOT_DIR``.
+
+``CUDAToolkit_NVCC_EXECUTABLE``
+  The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may
+  **not** be the same as
+  :variable:`CMAKE_CUDA_COMPILER <CMAKE_<LANG>_COMPILER>`. ``nvcc`` must be
+  found to determine the CUDA Toolkit version as well as determining other
+  features of the Toolkit. This variable is set for the convenience of
+  modules that depend on this one.
+ + +#]=======================================================================] + +# NOTE: much of this was simply extracted from FindCUDA.cmake. + +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# +############################################################################### + +# For NVCC we can easily deduce the SDK binary directory from the compiler path. 
+if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") + get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY) + set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "") + mark_as_advanced(CUDAToolkit_BIN_DIR) + unset(cuda_dir) +endif() + +# Try language- or user-provided path first. +if(CUDAToolkit_BIN_DIR) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${CUDAToolkit_BIN_DIR} + NO_DEFAULT_PATH + ) +endif() + +# Search using CUDAToolkit_ROOT +find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ENV CUDA_PATH + PATH_SUFFIXES bin +) + +# If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. +if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) + # Declare error messages now, print later depending on find_package args. + set(fail_base "Could not find nvcc executable in path specified by") + set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") + + if (CUDAToolkit_FIND_REQUIRED) + if (DEFINED CUDAToolkit_ROOT) + message(FATAL_ERROR ${cuda_root_fail}) + elseif (DEFINED ENV{CUDAToolkit_ROOT}) + message(FATAL_ERROR ${env_cuda_root_fail}) + endif() + else() + if (NOT CUDAToolkit_FIND_QUIETLY) + if (DEFINED CUDAToolkit_ROOT) + message(STATUS ${cuda_root_fail}) + elseif (DEFINED ENV{CUDAToolkit_ROOT}) + message(STATUS ${env_cuda_root_fail}) + endif() + endif() + set(CUDAToolkit_FOUND FALSE) + unset(fail_base) + unset(cuda_root_fail) + unset(env_cuda_root_fail) + return() + endif() +endif() + +# CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults. 
+# +# - Linux: /usr/local/cuda-X.Y +# - macOS: /Developer/NVIDIA/CUDA-X.Y +# - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y +# +# We will also search the default symlink location /usr/local/cuda first since +# if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked +# directory is the desired location. +if (NOT CUDAToolkit_NVCC_EXECUTABLE) + if (UNIX) + if (NOT APPLE) + set(platform_base "/usr/local/cuda-") + else() + set(platform_base "/Developer/NVIDIA/CUDA-") + endif() + else() + set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v") + endif() + + # Build out a descending list of possible cuda installations, e.g. + file(GLOB possible_paths "${platform_base}*") + # Iterate the glob results and create a descending list. + set(possible_versions) + foreach (p ${possible_paths}) + # Extract version number from end of string + string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) + if (IS_DIRECTORY ${p} AND p_version) + list(APPEND possible_versions ${p_version}) + endif() + endforeach() + + # Cannot use list(SORT) because that is alphabetical, we need numerical. + # NOTE: this is not an efficient sorting strategy. But even if a user had + # every possible version of CUDA installed, this wouldn't create any + # significant overhead. + set(versions) + foreach (v ${possible_versions}) + list(LENGTH versions num_versions) + # First version, nothing to compare with so just append. + if (num_versions EQUAL 0) + list(APPEND versions ${v}) + else() + # Loop through list. Insert at an index when comparison is + # VERSION_GREATER since we want a descending list. Duplicates will not + # happen since this came from a glob list of directories. 
+ set(i 0) + set(early_terminate FALSE) + while (i LESS num_versions) + list(GET versions ${i} curr) + if (v VERSION_GREATER curr) + list(INSERT versions ${i} ${v}) + set(early_terminate TRUE) + break() + endif() + math(EXPR i "${i} + 1") + endwhile() + # If it did not get inserted, place it at the end. + if (NOT early_terminate) + list(APPEND versions ${v}) + endif() + endif() + endforeach() + + # With a descending list of versions, populate possible paths to search. + set(search_paths) + foreach (v ${versions}) + list(APPEND search_paths "${platform_base}${v}") + endforeach() + + # Force the global default /usr/local/cuda to the front on Unix. + if (UNIX) + list(INSERT search_paths 0 "/usr/local/cuda") + endif() + + # Now search for nvcc again using the platform default search paths. + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${search_paths} + PATH_SUFFIXES bin + ) + + # We are done with these variables now, cleanup for caller. + unset(platform_base) + unset(possible_paths) + unset(possible_versions) + unset(versions) + unset(i) + unset(early_terminate) + unset(search_paths) + + if (NOT CUDAToolkit_NVCC_EXECUTABLE) + if (CUDAToolkit_FIND_REQUIRED) + message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") + elseif(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") + endif() + + set(CUDAToolkit_FOUND FALSE) + return() + endif() +endif() + +if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) + get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) + set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) + unset(cuda_dir) +endif() + +if(CUDAToolkit_NVCC_EXECUTABLE AND + CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) + # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value + # This if statement will always match, but is used to provide variables for MATCH 
1,2,3... + if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}") + endif() +else() + # Compute the version by invoking nvcc + execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) +endif() + + +get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + +# Handle cross compilation +if(CMAKE_CROSSCOMPILING) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + # Support for arm cross compilation + set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + # Support for aarch64 cross compilation + if (ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") + else() + set(CUDAToolkit_TARGET_NAME "aarch64-linux") + endif (ANDROID_ARCH_NAME STREQUAL "arm64") + elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") + endif() + + if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") + + # Mark that we need to pop the root search 
path changes after we have + # found all cuda libraries so that searches for our cross-compilation + # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or + # PATh + set(_CUDAToolkit_Pop_ROOT_PATH True) + endif() +else() + # Not cross compiling + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}") + # Now that we have the real ROOT_DIR, find components inside it. + list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + + # Mark that we need to pop the prefix path changes after we have + # found the cudart library. + set(_CUDAToolkit_Pop_Prefix True) +endif() + + +# Find the include/ directory +find_path(CUDAToolkit_INCLUDE_DIR + NAMES cuda_runtime.h +) + +# And find the CUDA Runtime Library libcudart +find_library(CUDA_CUDART + NAMES cudart + PATH_SUFFIXES lib64 lib/x64 +) +if (NOT CUDA_CUDART) + find_library(CUDA_CUDART + NAMES cudart + PATH_SUFFIXES lib64/stubs lib/x64/stubs + ) +endif() + +if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cudart library.") +endif() + +unset(CUDAToolkit_ROOT_DIR) +if(_CUDAToolkit_Pop_Prefix) + list(REMOVE_AT CMAKE_PREFIX_PATH -1) + unset(_CUDAToolkit_Pop_Prefix) +endif() + +#----------------------------------------------------------------------------- +# Perform version comparison and validate all required variables are set. 
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CUDAToolkit
+    REQUIRED_VARS
+      CUDAToolkit_INCLUDE_DIR
+      CUDA_CUDART
+      CUDAToolkit_NVCC_EXECUTABLE
+    VERSION_VAR
+      CUDAToolkit_VERSION
+)
+mark_as_advanced(CUDA_CUDART
+                 CUDAToolkit_INCLUDE_DIR
+                 CUDAToolkit_NVCC_EXECUTABLE
+                 )
+
+#-----------------------------------------------------------------------------
+# Construct result variables
+if(CUDAToolkit_FOUND)
+  set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR})
+  get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE)
+endif()
+
+#-----------------------------------------------------------------------------
+# Construct import targets
+if(CUDAToolkit_FOUND)
+
+  function(_CUDAToolkit_find_and_add_import_lib lib_name)
+    cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES" ${ARGN})
+
+    set(search_names ${lib_name} ${arg_ALT})
+
+    find_library(CUDA_${lib_name}_LIBRARY
+      NAMES ${search_names}
+      HINTS ${CUDAToolkit_LIBRARY_DIR}
+            ENV CUDA_PATH
+      PATH_SUFFIXES nvidia/current lib64 lib/x64 lib
+                    ${arg_EXTRA_PATH_SUFFIXES}
+    )
+    # Don't try any stub directories until we have exhausted all other
+    # search locations.
+ if(NOT CUDA_${lib_name}_LIBRARY) + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${CUDAToolkit_LIBRARY_DIR} + ENV CUDA_PATH + PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs + ) + endif() + + mark_as_advanced(CUDA_${lib_name}_LIBRARY) + + if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + add_library(CUDA::${lib_name} IMPORTED INTERFACE) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + foreach(dep ${arg_DEPS}) + if(TARGET CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + endif() + endforeach() + endif() + endfunction() + + if(NOT TARGET CUDA::toolkit) + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + endif() + + _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + + _CUDAToolkit_find_and_add_import_lib(cudart) + _CUDAToolkit_find_and_add_import_lib(cudart_static) + + # setup dependencies that are required for cudart_static when building + # on linux. These are generally only required when using the CUDA toolkit + # when CUDA language is disabled + if(NOT TARGET CUDA::cudart_static_deps + AND TARGET CUDA::cudart_static) + + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) + target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) + + if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) + find_package(Threads REQUIRED) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + endif() + + if(UNIX AND NOT APPLE) + # On Linux, you must link against librt when using the static cuda runtime. 
+    find_library(CUDAToolkit_rt_LIBRARY rt)
+    mark_as_advanced(CUDAToolkit_rt_LIBRARY)
+    if(NOT CUDAToolkit_rt_LIBRARY)
+      message(WARNING "Could not find librt library, needed by CUDA::cudart_static")
+    else()
+      target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY})
+    endif()
+  endif()
+  endif()
+
+  _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library
+  foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib})
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos)
+  endforeach()
+
+  # cuFFTW depends on cuFFT
+  _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft)
+  # The static variant must be named cufftw_static; using cufftw here would
+  # silently reuse the already-created shared target and CUDA::cufftw_static
+  # would never exist.
+  _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static)
+
+  # cuSOLVER depends on cuBLAS, and cuSPARSE
+  _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse)
+  _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos)
+
+  # nvGRAPH depends on cuRAND, and cuSOLVER.
+  _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver)
+  _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static)
+
+  # Process the majority of the NPP libraries.
+ foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + endforeach() + + _CUDAToolkit_find_and_add_import_lib(cupti + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + _CUDAToolkit_find_and_add_import_lib(cupti_static + EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/) + + _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) + + _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + + if(WIN32) + # nvtools can be installed outside the CUDA toolkit directory + # so prefer the NVTOOLSEXT_PATH windows only environment variable + # In addition on windows the most common name is nvToolsExt64_1 + find_library(CUDA_nvToolsExt_LIBRARY + NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt + PATHS ENV NVTOOLSEXT_PATH + ENV CUDA_PATH + PATH_SUFFIXES lib/x64 lib + ) + endif() + _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + + _CUDAToolkit_find_and_add_import_lib(OpenCL) +endif() + +if(_CUDAToolkit_Pop_ROOT_PATH) + list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0) + unset(_CUDAToolkit_Pop_ROOT_PATH) +endif() diff --git a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8d58d96415808499dc39d44ad3600f5f5a64368e --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -0,0 +1,37 @@ +IF (NOT CUDAToolkit_ROOT) + IF (NOT CUDA_ROOT) + SET(CUDA_ROOT $ENV{CUDA_ROOT}) + ENDIF() + IF(CUDA_ROOT) + SET(CUDAToolkit_ROOT ${CUDA_ROOT}) + ENDIF() +ENDIF() + +IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") + find_package(CUDAToolkit) +ELSE() + include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) +ENDIF() + + +IF (TARGET CUDA::cudart) + SET(FOUND_CUDART TRUE) + KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart) +ELSE() + SET(FOUND_CUDART FALSE) 
+ENDIF() + +IF (TARGET CUDA::cuda_driver) + SET(FOUND_CUDA_DRIVER TRUE) + KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) +ELSE() + SET(FOUND_CUDA_DRIVER FALSE) +ENDIF() + +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA DEFAULT_MSG FOUND_CUDART FOUND_CUDA_DRIVER) +IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) + KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE + LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart + ) +ENDIF() diff --git a/packages/kokkos/cmake/Modules/FindTPLHPX.cmake b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c8b3bc4c9b84505eceff8ba3453501f9bb5d1e01 --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLHPX.cmake @@ -0,0 +1,15 @@ + +FIND_PACKAGE(HPX REQUIRED) +#as of right now, HPX doesn't export correctly +#so let's convert it to an interface target +KOKKOS_CREATE_IMPORTED_TPL(HPX INTERFACE + LINK_LIBRARIES ${HPX_LIBRARIES} + INCLUDES ${HPX_INCLUDE_DIRS} +) +#this is a bit funky since this is a CMake target +#but HPX doesn't export itself correctly +KOKKOS_EXPORT_CMAKE_TPL(HPX) + +#I would prefer all of this gets replaced with +#KOKKOS_IMPORT_CMAKE_TPL(HPX) + diff --git a/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake b/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake new file mode 100644 index 0000000000000000000000000000000000000000..cf763b7e5bb585ed77e8dc1fb3b015566a0326f9 --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLHWLOC.cmake @@ -0,0 +1 @@ +KOKKOS_FIND_IMPORTED(HWLOC HEADER hwloc.h LIBRARY hwloc) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake new file mode 100644 index 0000000000000000000000000000000000000000..5fc6a693035cea5e05b379f5fac9d50bcaeb3f7a --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLLIBDL.cmake @@ -0,0 +1 @@ +KOKKOS_FIND_IMPORTED(LIBDL HEADER dlfcn.h LIBRARY dl) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBNUMA.cmake 
b/packages/kokkos/cmake/Modules/FindTPLLIBNUMA.cmake new file mode 100644 index 0000000000000000000000000000000000000000..811db5851b9ee359ad996a743bf8a0ac283512f6 --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLLIBNUMA.cmake @@ -0,0 +1 @@ +KOKKOS_FIND_IMPORTED(LIBNUMA HEADER numa.h LIBRARY numa) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBRT.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBRT.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e75da56b5b5324050236ee0ee4c6847452d5b3cf --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLLIBRT.cmake @@ -0,0 +1 @@ +KOKKOS_FIND_IMPORTED(LIBRT HEADER time.h LIBRARY rt) diff --git a/packages/kokkos/cmake/Modules/FindTPLMEMKIND.cmake b/packages/kokkos/cmake/Modules/FindTPLMEMKIND.cmake new file mode 100644 index 0000000000000000000000000000000000000000..20aaff22955ce9ad026c51b870bf04b7d8b0df42 --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLMEMKIND.cmake @@ -0,0 +1 @@ +KOKKOS_FIND_IMPORTED(MEMKIND HEADER memkind.h LIBRARY memkind) diff --git a/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake b/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake new file mode 100644 index 0000000000000000000000000000000000000000..a743fca0e45290cf7ad80e3b022e7f66a34947fa --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLPTHREAD.cmake @@ -0,0 +1,20 @@ + +TRY_COMPILE(KOKKOS_HAS_PTHREAD_ARG + ${KOKKOS_TOP_BUILD_DIR}/tpl_tests + ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/pthread.cpp + LINK_LIBRARIES -pthread + COMPILE_DEFINITIONS -pthread +) +# The test no longer requires C++11 +# if we did needed C++ standard support, then we should add option +# ${CMAKE_CXX${KOKKOS_CXX_STANDARD}_STANDARD_COMPILE_OPTION} + +INCLUDE(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLPTHREAD DEFAULT_MSG KOKKOS_HAS_PTHREAD_ARG) +#Only create the TPL if we succeed +IF (KOKKOS_HAS_PTHREAD_ARG) + KOKKOS_CREATE_IMPORTED_TPL(PTHREAD + INTERFACE #this is not a real library with a real location 
+ COMPILE_OPTIONS -pthread + LINK_OPTIONS -pthread) +ENDIF() diff --git a/packages/kokkos/cmake/Modules/FindTPLROCM.cmake b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake new file mode 100644 index 0000000000000000000000000000000000000000..512ad6ceb283dcd27f8db1dfb45f045f998d7875 --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -0,0 +1,11 @@ +include(FindPackageHandleStandardArgs) + +FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) + +find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY) + +kokkos_create_imported_tpl(ROCM INTERFACE + LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} + COMPILE_DEFINITIONS __HIP_ROCclr__ +) diff --git a/packages/kokkos/cmake/README.md b/packages/kokkos/cmake/README.md new file mode 100644 index 0000000000000000000000000000000000000000..385bbfcd5d5a0a66bcfb893af7953ba555830405 --- /dev/null +++ b/packages/kokkos/cmake/README.md @@ -0,0 +1,332 @@ + + +# Developing Kokkos + +This document contains a build system overview for developers with information on adding new CMake options that could influence +* Header configuration macros +* Optional features +* Third-partly libraries +* Compiler and linker flags +For build system details for users, refer to the [build instructions](../BUILD.md). + +## Build System + +Kokkos uses CMake to configure, build, and install. +Rather than being a completely straightforward use of modern CMake, +Kokkos has several extra complications, primarily due to: +* Kokkos must support linking to an installed version or in-tree builds as a subdirectory of a larger project. +* Kokkos must configure a special compiler `nvcc_wrapper` that allows `nvcc` to accept all C++ flags (which `nvcc` currently does not). +* Kokkos must work as a part of TriBITS, a CMake library providing a particular build idiom for Trilinos. 
+* Kokkos has many pre-existing users. We need to be careful about breaking previous versions or generating meaningful error messages if we do break backwards compatibility.
+
+If you are looking at the build system code wondering why certain decisions were made: we have had to balance many competing requirements and certain technical debt. Everything in the build system was done for a reason, trying to adhere as closely as possible to modern CMake best practices while meeting all pre-existing customer requirements.
+
+### Modern CMake Philosophy
+
+Modern CMake relies on understanding the principle of *building* and *using* a code project.
+What preprocessor, compiler, and linker flags do I need to *build* my project?
+What flags does a downstream project that links to me need to *use* my project?
+In CMake terms, flags that are only needed for building are `PRIVATE`.
+Only Kokkos needs these flags, not a package that depends on Kokkos.
+Flags that must be used in a downstream project are `PUBLIC`.
+Kokkos must tell other projects to use them.
+
+In Kokkos, almost everything is a public flag since Kokkos is driven by headers and Kokkos is in charge of optimizing your code to achieve performance portability!
+Include paths, C++ standard flags, architecture-specific optimizations, or OpenMP and CUDA flags are all examples of flags that Kokkos configures and adds to your project.
+
+Modern CMake now automatically propagates flags through the `target_link_libraries` command.
+Suppose you have a library `stencil` that needs to build with Kokkos.
+Consider the following CMake code:
+
+````
+find_package(Kokkos)
+add_library(stencil stencil.cpp)
+target_link_libraries(stencil Kokkos::kokkos)
+````
+
+This locates the Kokkos package, adds your library, and tells CMake to link Kokkos to your library.
+All public build flags get added automatically through the `target_link_libraries` command.
+There is nothing to do. 
You can be happily oblivious to how Kokkos was configured. +Everything should just work. + +As a Kokkos developer who wants to add new public compiler flags, how do you ensure that CMake does this properly? Modern CMake works through targets and properties. +Each target has a set of standard properties: +* `INTERFACE_COMPILE_OPTIONS` contains all the compiler options that Kokkos should add to downstream projects +* `INTERFACE_INCLUDE_DIRECTORIES` contains all the directories downstream projects must include from Kokkos +* `INTERFACE_COMPILE_DEFINITIONS` contains the list of preprocessor `-D` flags +* `INTERFACE_LINK_LIBRARIES` contains all the libraries downstream projects need to link +* `INTERFACE_COMPILE_FEATURES` essentially adds compiler flags, but with extra complications. Features names are specific to CMake. More later. + +CMake makes it easy to append to these properties using: +* `target_compile_options(kokkos PUBLIC -fmyflag)` +* `target_include_directories(kokkos PUBLIC mySpecialFolder)` +* `target_compile_definitions(kokkos PUBLIC -DmySpecialFlag=0)` +* `target_link_libraries(kokkos PUBLIC mySpecialLibrary)` +* `target_compile_features(kokkos PUBLIC mySpecialFeature)` +Note that all of these use `PUBLIC`! Almost every Kokkos flag is not private to Kokkos, but must also be used by downstream projects. + + +### Compiler Features and Compiler Options +Compiler options are flags like `-fopenmp` that do not need to be "resolved." +The flag is either on or off. +Compiler features are more fine-grained and require conflicting requests to be resolved. +Suppose I have +```` +add_library(A a.cpp) +target_compile_features(A PUBLIC cxx_std_14) +```` +then another target +```` +add_library(B b.cpp) +target_compile_features(B PUBLIC cxx_std_17) +target_link_libraries(A B) +```` +I have requested two different features. +CMake understands the requests and knows that `cxx_std_14` is a subset of `cxx_std_17`. +CMake then picks C++17 for library `B`. 
+CMake would not have been able to do feature resolution if we had directly done: +```` +target_compile_options(A PUBLIC -std=c++14) +```` + +### Adding Kokkos Options +After configuring for the first time, +CMake creates a cache of configure variables in `CMakeCache.txt`. +Reconfiguring in the folder "restarts" from those variables. +All flags passed as `-DKokkos_SOME_OPTION=X` to `cmake` become variables in the cache. +All Kokkos options begin with camel case `Kokkos_` followed by an upper case option name. + +CMake best practice is to avoid cache variables, if possible. +In essence, you want the minimal amount of state cached between configurations. +And never, ever have behavior influenced by multiple cache variables. +If you want to change the Kokkos configuration, have a single unique variable that needs to be changed. +Never require two cache variables to be changed. + +Kokkos provides a function `KOKKOS_OPTION` for defining valid cache-level variables, +proofreading them, and defining local project variables. +The most common variables are called `Kokkos_ENABLE_X`, +for which a helper function `KOKKOS_ENABLE_OPTION` is provided, e.g. +```` +KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build tests") +```` +The function checks if `-DKokkos_ENABLE_TESTS` was given, +whether it was given with the wrong case, e.g. `-DKokkos_Enable_Tests`, +and then defines a regular (non-cache) variable `KOKKOS_ENABLE_TESTS` to `ON` or `OFF` +depending on the given default and whether the option was specified. + +### Defining Kokkos Config Macros + +Sometimes you may want to add `#define Kokkos_X` macros to the config header. +This is straightforward with CMake. +Suppose you want to define an optional macro `KOKKOS_SUPER_SCIENCE`. +Simply go into `KokkosCore_config.h.in` and add +```` +#cmakedefine KOKKOS_SUPER_SCIENCE +```` +I can either add +```` +KOKKOS_OPTION(SUPER_SCIENCE ON "Whether to do some super science") +```` +to directly set the variable as a command-line `-D` option. 
+Alternatively, based on other logic, I could add to a `CMakeLists.txt`
+````
+SET(KOKKOS_SUPER_SCIENCE ON)
+````
+If not set as a command-line option (cache variable), you must make sure the variable is visible in the top-level scope.
+If set in a function, you would need:
+````
+SET(KOKKOS_SUPER_SCIENCE ON PARENT_SCOPE)
+````
+
+### Third-Party Libraries
+In much the same way that compiler flags transitively propagate to dependent projects,
+modern CMake allows us to propagate dependent libraries.
+If Kokkos depends on, e.g. `hwloc` the downstream project will also need to link `hwloc`.
+There are three stages in adding a new third-party library (TPL):
+* Finding: find the desired library on the system and verify the installation is correct
+* Importing: create a CMake target, if necessary, that is compatible with `target_link_libraries`. This is mostly relevant for TPLs not installed with CMake.
+* Exporting: make the desired library visible to downstream projects
+
+TPLs are somewhat complicated by whether the library was installed with CMake or some other build system.
+If CMake, our lives are greatly simplified. We simply use `find_package` to locate the installed CMake project then call `target_link_libraries(kokkoscore PUBLIC/PRIVATE TPL)`. For libraries not installed with CMake, the process is a bit more complex.
+It is up to the Kokkos developers to "convert" the library into a CMake target as if it had been installed as a valid modern CMake target with properties.
+There are helper functions for simplifying the process of importing TPLs in Kokkos, but we walk through the process in detail to clearly illustrate the steps involved.
+
+#### TPL Search Order
+
+There are several options for where CMake could try to find a TPL.
+If there are multiple installations of the same TPL on the system,
+the search order is critical for making sure the correct TPL is found.
+There are 3 possibilities that could be used:
+
+1. Default system paths like /usr
+1. 
User-provided paths through options `<NAME>_ROOT` and `Kokkos_<NAME>_DIR`
+1. Additional paths not in the CMake default list or provided by the user that Kokkos decides to add. For example, Kokkos may query `nvcc` or `LD_LIBRARY_PATH` for where to find CUDA libraries.
+
+The following is the search order that Kokkos follows. Note: This differs from the default search order used by CMake `find_library` and `find_path`. CMake prefers default system paths over user-provided paths.
+For Kokkos (and package managers in general), it is better to prefer user-provided paths since this usually indicates a specific version we want.
+
+1. `<NAME>_ROOT` command line option
+1. `<NAME>_ROOT` environment variable
+1. `Kokkos_<NAME>_DIR` command line option
+1. Paths added by Kokkos CMake logic
+1. Default system paths (if allowed)
+
+Default system paths are allowed in two cases. First, none of the other options are given so the only place to look is system paths. Second, if explicitly given permission, configure will look in system paths.
+The rationale for this logic is that if you specify a custom location, you usually *only* want to look in that location.
+If you do not find the TPL where you expect it, you should error out rather than grab another random match.
+
+
+#### Finding TPLs
+
+If finding a TPL that is not a modern CMake project, refer to the `FindHWLOC.cmake` file in `cmake/Modules` for an example.
+You will usually need to verify expected headers with `find_path`
+````
+find_path(TPL_INCLUDE_DIR mytpl.h PATHS "${KOKKOS_MYTPL_DIR}/include")
+````
+This ensures that the library header is in the expected include directory and defines the variable `TPL_INCLUDE_DIR` with a valid path if successful.
+Similarly, you can verify a library
+````
+find_library(TPL_LIBRARY mytpl PATHS "${KOKKOS_MYTPL_DIR}/lib")
+````
+that then defines the variable `TPL_LIBRARY` with a valid path if successful. 
+CMake provides a utility for checking if the `find_path` and `find_library` calls were successful that emulates the behavior of `find_package` for a CMake target. +```` +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MYTPL DEFAULT_MSG + MYTPL_INCLUDE_DIR MYTPL_LIBRARY) +```` +If the find failed, CMake will print standard error messages explaining the failure. + +#### Importing TPLs + +The installed TPL must be adapted into a CMake target. +CMake allows libraries to be added that are built externally as follows: +```` +add_library(Kokkos::mytpl UNKNOWN IMPORTED) +```` +Importantly, we use a `Kokkos::` namespace to avoid name conflicts and identify this specifically as the version imported by Kokkos. +Because we are importing a non-CMake target, we must populate all the target properties that would have been automatically populated for a CMake target. +```` +set_target_properties(Kokkos::mytpl PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${MYTPL_INCLUDE_DIR}" + IMPORTED_LOCATION "${MYTPL_LIBRARY}" +) +```` + +#### Exporting TPLs + +Kokkos may now depend on the target `Kokkos::mytpl` as a `PUBLIC` library (remember building and using). +This means that downstream projects must also know about `Kokkos::myptl` - so Kokkos must export them. +In the `KokkosConfig.cmake.in` file, we need to add code like the following: +```` +set(MYTPL_LIBRARY @MYTPL_LIBRARY@) +set(MYTPL_INCLUDE_DIR @MYTPL_INCLUDE_DIR@) +add_library(Kokkos::mytpl UNKNOWN IMPORTED) +set_target_properties(Kokkos::mytpl PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${MYTPL_INCLUDE_DIR}" + IMPORTED_LOCATION "${MYTPL_LIBRARY}" +) +```` +If this looks familiar, that's because it is exactly the same code as above for importing the TPL. +Exporting a TPL really just means importing the TPL when Kokkos is loaded by an external project. +We will describe helper functions that simplify this process. 
+ +#### Interface TPLs + +If a TPL is just a library and set of headers, we can make a simple `IMPORTED` target. +However, a TPL is actually completely flexible and need not be limited to just headers and libraries. +TPLs can configure compiler flags, linker flags, or multiple different libraries. +For this, we use a special type of CMake target: `INTERFACE` libraries. +These libraries don't build anything. +They simply populate properties that will configure flags for dependent targets. +We consider the example: +```` +add_library(PTHREAD INTERFACE) +target_compile_options(PTHREAD PUBLIC -pthread) +```` +Kokkos uses the compiler flag `-pthread` to define compiler macros for re-entrant functions rather than treating it simply as a library with header `pthread.h` and library `-lpthread`. +Any property can be configured, e.g. +```` +target_link_libraries(MYTPL ...) +```` +In contrast to imported TPLs which require direct modification of `KokkosConfig.cmake.in`, +we can use CMake's built-in export functions: +```` +INSTALL( + TARGETS MYTPL + EXPORT KokkosTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} +) +```` +These interface targets will be automatically populated in the config file. + +#### Linking the TPL +After finishing the import process, it still remains to link the imported target as needed. +For example, +```` +target_link_libraries(kokkoscore PUBLIC Kokkos::HWLOC) +```` +The complexity of which includes, options, and libraries the TPL requires +should be encapsulated in the CMake target. + +#### TPL Helper Functions +##### KOKKOS_IMPORT_TPL +This function can be invoked as, e.g. +```` +KOKKOS_IMPORT_TPL(HWLOC) +```` +This function checks if the TPL was enabled by a `-DKokkos_ENABLE_HWLOC=On` flag. +If so, it calls `find_package(TPLHWLOC)`. +This invokes the file `FindTPLHWLOC.cmake` which should be contained in the `cmake/Modules` folder. 
+If successful, another function `KOKKOS_EXPORT_CMAKE_TPL` gets invoked. +This automatically adds all the necessary import commands to `KokkosConfig.cmake`. + +##### KOKKOS_FIND_IMPORTED +Inside a `FindTPLX.cmake` file, the simplest way to import a library is to call, e.g. +```` +KOKKOS_FIND_IMPORTED(HWLOC LIBRARY hwloc HEADER hwloc.h) +```` +This finds the location of the library and header and creates an imported target `Kokkos::HWLOC` +that can be linked against. +The library/header find can be guided with `-DHWLOC_ROOT=` or `-DKokkos_HWLOC_DIR=` during CMake configure. +These both specify the install prefix. + +##### KOKKOS_LINK_TPL +This function checks if the TPL has been enabled. +If so, it links a given library against the imported (or interface) TPL target. + +##### KOKKOS_CREATE_IMPORTED_TPL +This helper function is best understood by reading the actual code. +This function takes arguments specifying the properties and creates the actual TPL target. +The most important thing to understand for this function is whether you call this function with the optional `INTERFACE` keyword. +This tells the project to either create the target as an imported target or interface target, as discussed above. + +##### KOKKOS_EXPORT_CMAKE_TPL +Even if the TPL just loads a valid CMake target, we still must "export" it into the config file. +When Kokkos is loaded by a downstream project, this TPL must be loaded. +Calling this function simply appends text recording the location where the TPL was found +and adding a `find_dependency(...)` call that will reload the CMake target. + +### The Great TriBITS Compromise + +TriBITS was a masterpiece of CMake version 2 before the modern CMake idioms of building and using. +TriBITS greatly limited verbosity of CMake files, handled complicated dependency trees between packages, and handled automatically setting up include and linker paths for dependent libraries. 
+ +Kokkos is now used by numerous projects that don't (and won't) depend on TriBITS for their build systems. +Kokkos has to work outside of TriBITS and provide a standard CMake 3+ build system. +At the same time, Kokkos is used by numerous projects that depend on TriBITS and don't (and won't) switch to a standard CMake 3+ build system. + +Instead of calling functions `TRIBITS_X(...)`, the CMake calls wrapper functions `KOKKOS_X(...)`. +If TriBITS is available (as in Trilinos), `KOKKOS_X` will just be a thin wrapper around `TRIBITS_X`. +If TriBITS is not available, Kokkos maps `KOKKOS_X` calls to native CMake that complies with CMake 3 idioms. +For the time being, this seems the most sensible way to handle the competing requirements of a standalone modern CMake and TriBITS build system. + +##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) + +[](https://opensource.org/licenses/BSD-3-Clause) + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. Government retains certain rights in this software. 
diff --git a/packages/kokkos/cmake/compile_tests/clang_omp.cpp b/packages/kokkos/cmake/compile_tests/clang_omp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..60a5c522820cdb03cbde8a2cf3a796c57292b46b --- /dev/null +++ b/packages/kokkos/cmake/compile_tests/clang_omp.cpp @@ -0,0 +1,9 @@ +#include <omp.h> + +int main(int, char**) { + int thr = omp_get_num_threads(); + if (thr > 0) + return thr; + else + return 0; +} diff --git a/packages/kokkos/cmake/compile_tests/cplusplus14.cpp b/packages/kokkos/cmake/compile_tests/cplusplus14.cpp new file mode 100644 index 0000000000000000000000000000000000000000..52ec9885ec3ed5f4e7c0871f59de3d651df33efe --- /dev/null +++ b/packages/kokkos/cmake/compile_tests/cplusplus14.cpp @@ -0,0 +1,8 @@ +#include <type_traits> + +int main() { + // _t versions of type traits were added in C++14 + std::remove_cv_t<int> i = 0; + + return i; +} diff --git a/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc new file mode 100644 index 0000000000000000000000000000000000000000..a26ac5af4bf2dee2c26f1ee20c6c500fe465bf9f --- /dev/null +++ b/packages/kokkos/cmake/compile_tests/cuda_compute_capability.cc @@ -0,0 +1,83 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> + +int main() { + cudaDeviceProp device_properties; + const cudaError_t error = cudaGetDeviceProperties(&device_properties, + /*device*/ 0); + if (error != cudaSuccess) { + std::cout << "CUDA error: " << cudaGetErrorString(error) << '\n'; + return error; + } + unsigned int const compute_capability = + device_properties.major * 10 + device_properties.minor; +#ifdef SM_ONLY + std::cout << compute_capability; +#else + switch (compute_capability) { + // clang-format off + case 30: std::cout << "Set -DKokkos_ARCH_KEPLER30=ON ." << std::endl; break; + case 32: std::cout << "Set -DKokkos_ARCH_KEPLER32=ON ." 
<< std::endl; break; + case 35: std::cout << "Set -DKokkos_ARCH_KEPLER35=ON ." << std::endl; break; + case 37: std::cout << "Set -DKokkos_ARCH_KEPLER37=ON ." << std::endl; break; + case 50: std::cout << "Set -DKokkos_ARCH_MAXWELL50=ON ." << std::endl; break; + case 52: std::cout << "Set -DKokkos_ARCH_MAXWELL52=ON ." << std::endl; break; + case 53: std::cout << "Set -DKokkos_ARCH_MAXWELL53=ON ." << std::endl; break; + case 60: std::cout << "Set -DKokkos_ARCH_PASCAL60=ON ." << std::endl; break; + case 61: std::cout << "Set -DKokkos_ARCH_PASCAL61=ON ." << std::endl; break; + case 70: std::cout << "Set -DKokkos_ARCH_VOLTA70=ON ." << std::endl; break; + case 72: std::cout << "Set -DKokkos_ARCH_VOLTA72=ON ." << std::endl; break; + case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; + case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; + case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; + default: + std::cout << "Compute capability " << compute_capability + << " is not supported" << std::endl; + // clang-format on + } +#endif + return 0; +} diff --git a/packages/kokkos/cmake/compile_tests/pthread.cpp b/packages/kokkos/cmake/compile_tests/pthread.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f83bf6a5f7fe399fc4a44547792e738177facfb --- /dev/null +++ b/packages/kokkos/cmake/compile_tests/pthread.cpp @@ -0,0 +1,14 @@ +#include <pthread.h> + +void* kokkos_test(void* args) { return args; } + +int main() { + pthread_t thread; + /* Use NULL to avoid C++11. Some compilers + do not have C++11 by default. 
Forcing C++11 + in the compile tests can be done, but is unnecessary + */ + pthread_create(&thread, NULL, kokkos_test, NULL); + pthread_join(thread, NULL); + return 0; +} diff --git a/packages/kokkos/cmake/cray.cmake b/packages/kokkos/cmake/cray.cmake new file mode 100644 index 0000000000000000000000000000000000000000..08912f5130f92fec97a4bdb6abb90e860d0b9cda --- /dev/null +++ b/packages/kokkos/cmake/cray.cmake @@ -0,0 +1,9 @@ + + +function(kokkos_set_cray_flags full_standard int_standard) + STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) + STRING(TOLOWER ${int_standard} INT_LC_STANDARD) + SET(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) +endfunction() + diff --git a/packages/kokkos/cmake/deps/CUDA.cmake b/packages/kokkos/cmake/deps/CUDA.cmake new file mode 100644 index 0000000000000000000000000000000000000000..beaf4e6d6cd922e5916dd5f1e35bf43f58bee7da --- /dev/null +++ b/packages/kokkos/cmake/deps/CUDA.cmake @@ -0,0 +1,66 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1.
Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + +# Check for CUDA support + +SET(_CUDA_FAILURE OFF) + +# Have CMake find CUDA +IF(NOT _CUDA_FAILURE) + FIND_PACKAGE(CUDA 3.2) + IF (NOT CUDA_FOUND) + SET(_CUDA_FAILURE ON) + ENDIF() +ENDIF() + +IF(NOT _CUDA_FAILURE) + # if we haven't met failure + macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target) + TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY) + endmacro() + GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) + GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) + KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) +ELSE() + SET(TPL_ENABLE_CUDA OFF) +ENDIF() diff --git a/packages/kokkos/cmake/deps/CUSPARSE.cmake b/packages/kokkos/cmake/deps/CUSPARSE.cmake new file mode 100644 index 0000000000000000000000000000000000000000..073c40d8140157bb95aac681bf084da2aeadaf5d --- /dev/null +++ 
b/packages/kokkos/cmake/deps/CUSPARSE.cmake @@ -0,0 +1,51 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + +#include(${TRIBITS_DEPS_DIR}/CUDA.cmake) + +#IF (TPL_ENABLE_CUDA) +# GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) +# GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) +# GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) +# KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE) +#ENDIF() + diff --git a/packages/kokkos/cmake/deps/HWLOC.cmake b/packages/kokkos/cmake/deps/HWLOC.cmake new file mode 100644 index 0000000000000000000000000000000000000000..f8402db00a7287e3a2526c7569ebd7387941da1b --- /dev/null +++ b/packages/kokkos/cmake/deps/HWLOC.cmake @@ -0,0 +1,57 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + + +#----------------------------------------------------------------------------- +# Hardware locality detection and control library. +# +# Acquisition information: +# Date checked: November 2011 +# Checked by: H. Carter Edwards <hcedwar AT sandia.gov> +# Source: http://www.open-mpi.org/projects/hwloc/ +# Version: 1.3 +# + +KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC + REQUIRED_HEADERS hwloc.h + REQUIRED_LIBS_NAMES "hwloc" + ) diff --git a/packages/kokkos/cmake/deps/Pthread.cmake b/packages/kokkos/cmake/deps/Pthread.cmake new file mode 100644 index 0000000000000000000000000000000000000000..639e4ef6975167f717c7de114b36d6642badd67d --- /dev/null +++ b/packages/kokkos/cmake/deps/Pthread.cmake @@ -0,0 +1,70 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. 
Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + + +SET(USE_THREADS FALSE) + +IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) + # Use CMake's Thread finder since it is a bit smarter in determining + # whether pthreads is already built into the compiler and doesn't need + # a library to link. + FIND_PACKAGE(Threads) + #If Threads found a copy of pthreads make sure it is one of the cases the tribits + #tpl system cannot handle. 
+ IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + SET(USE_THREADS TRUE) + ENDIF() + ENDIF() +ENDIF() + +IF(USE_THREADS) + SET(TPL_Pthread_INCLUDE_DIRS "") + SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + SET(TPL_Pthread_LIBRARY_DIRS "") + KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(Pthread) +ELSE() + KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread + REQUIRED_HEADERS pthread.h + REQUIRED_LIBS_NAMES pthread + ) +ENDIF() diff --git a/packages/kokkos/cmake/fake_tribits.cmake b/packages/kokkos/cmake/fake_tribits.cmake new file mode 100644 index 0000000000000000000000000000000000000000..fbd6745a602caa8976958d10cf7d9b4c1fa3c471 --- /dev/null +++ b/packages/kokkos/cmake/fake_tribits.cmake @@ -0,0 +1,350 @@ +#These are tribits wrappers used by all projects in the Kokkos ecosystem + +INCLUDE(CMakeParseArguments) +INCLUDE(CTest) + +cmake_policy(SET CMP0054 NEW) + +FUNCTION(ASSERT_DEFINED VARS) + FOREACH(VAR ${VARS}) + IF(NOT DEFINED ${VAR}) + MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!") + ENDIF() + ENDFOREACH() +ENDFUNCTION() + +MACRO(KOKKOS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE ) +SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" ) +IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "") + IF(${USER_OPTION_NAME}) + GLOBAL_SET(${MACRO_DEFINE_NAME} ON) + ELSE() + GLOBAL_SET(${MACRO_DEFINE_NAME} OFF) + ENDIF() +ENDIF() +ENDMACRO() + +MACRO(GLOBAL_OVERWRITE VARNAME VALUE TYPE) + SET(${VARNAME} ${VALUE} CACHE ${TYPE} "" FORCE) +ENDMACRO() + +IF (NOT KOKKOS_HAS_TRILINOS) +MACRO(APPEND_GLOB VAR) + FILE(GLOB LOCAL_TMP_VAR ${ARGN}) + LIST(APPEND ${VAR} ${LOCAL_TMP_VAR}) +ENDMACRO() + +MACRO(GLOBAL_SET VARNAME) + SET(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) +ENDMACRO() + +MACRO(PREPEND_GLOBAL_SET VARNAME) + ASSERT_DEFINED(${VARNAME}) + GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) +ENDMACRO() + +MACRO(PREPEND_TARGET_SET 
VARNAME TARGET_NAME TYPE) + IF(TYPE STREQUAL "REQUIRED") + SET(REQUIRED TRUE) + ELSE() + SET(REQUIRED FALSE) + ENDIF() + IF(TARGET ${TARGET_NAME}) + PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME}) + ELSE() + IF(REQUIRED) + MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}") + ENDIF() + ENDIF() +ENDMACRO() +endif() + + +FUNCTION(KOKKOS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE) + if (KOKKOS_HAS_TRILINOS) + TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME_CONFIG_FILE}) + else() + # Configure the file + CONFIGURE_FILE( + ${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in + ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE} + ) + endif() +ENDFUNCTION() + +MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) + FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") + ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) + SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE) +ENDMACRO() + +FUNCTION(KOKKOS_ADD_TEST) + if (KOKKOS_HAS_TRILINOS) + CMAKE_PARSE_ARGUMENTS(TEST + "SKIP_TRIBITS" + "EXE;NAME;TOOL" + "ARGS" + ${ARGN}) + + IF(TEST_SKIP_TRIBITS) + MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") + RETURN() + ENDIF() + + IF(TEST_EXE) + SET(EXE_ROOT ${TEST_EXE}) + ELSE() + SET(EXE_ROOT ${TEST_NAME}) + ENDIF() + + TRIBITS_ADD_TEST( + ${EXE_ROOT} + NAME ${TEST_NAME} + COMM serial mpi + NUM_MPI_PROCS 1 + ARGS ${TEST_ARGS} + ${TEST_UNPARSED_ARGUMENTS} + ADDED_TESTS_NAMES_OUT ALL_TESTS_ADDED + ) + + # We will get prepended package name here + SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) + SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) + + # The function TRIBITS_ADD_TEST() has a CATEGORIES argument that defaults + # to BASIC. If a project elects to only enable tests marked as PERFORMANCE, + # the test won't actually be added and attempting to set a property on it below + # will yield an error. 
+ if(TARGET ${EXE}) + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + foreach(TEST_ADDED ${ALL_TESTS_ADDED}) + set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>") + endforeach() + endif() + endif() + else() + CMAKE_PARSE_ARGUMENTS(TEST + "WILL_FAIL;SKIP_TRIBITS" + "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" + "CATEGORIES;ARGS" + ${ARGN}) + # To match Tribits, we should always be receiving + # the root names of exes/libs + IF(TEST_EXE) + SET(EXE_ROOT ${TEST_EXE}) + ELSE() + SET(EXE_ROOT ${TEST_NAME}) + ENDIF() + # Prepend package name to the test name + # These should be the full target name + SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) + SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} + COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) + ELSE() + ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) + ENDIF() + IF(TEST_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + ENDIF() + IF(TEST_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_TOOL) + ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>") + ENDIF() + VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) + ENDIF() +ENDFUNCTION() + +FUNCTION(KOKKOS_ADD_ADVANCED_TEST) + if (KOKKOS_HAS_TRILINOS) + TRIBITS_ADD_ADVANCED_TEST(${ARGN}) + else() + # TODO Write this + endif() +ENDFUNCTION() + +MACRO(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) + 
ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME}) + TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) + TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) +ENDMACRO() + +FUNCTION(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) + if (KOKKOS_HAS_TRILINOS) + TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES(${TPL_NAME} ${ARGN}) + else() + CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" + ${ARGN}) + + SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE) + IF (PARSE_REQUIRED_LIBS_NAMES) + FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) + IF(NOT TPL_${TPL_NAME}_LIBRARIES) + SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + ENDIF() + ENDIF() + IF (PARSE_REQUIRED_HEADERS) + FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) + IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) + SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + ENDIF() + ENDIF() + IF (_${TPL_NAME}_ENABLE_SUCCESS) + KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME}) + ENDIF() + VERIFY_EMPTY(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) + endif() +ENDFUNCTION() + +MACRO(KOKKOS_TARGET_COMPILE_OPTIONS TARGET) +if(KOKKOS_HAS_TRILINOS) + TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) +else() + TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) +endif() +ENDMACRO() + + +MACRO(KOKKOS_EXCLUDE_AUTOTOOLS_FILES) + if (KOKKOS_HAS_TRILINOS) + TRIBITS_EXCLUDE_AUTOTOOLS_FILES() + else() + #do nothing + endif() +ENDMACRO() + +FUNCTION(KOKKOS_LIB_TYPE LIB RET) +GET_TARGET_PROPERTY(PROP ${LIB} TYPE) +IF (${PROP} STREQUAL "INTERFACE_LIBRARY") + SET(${RET} "INTERFACE" PARENT_SCOPE) +ELSE() + SET(${RET} "PUBLIC" PARENT_SCOPE) +ENDIF() +ENDFUNCTION() + +FUNCTION(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) +IF(KOKKOS_HAS_TRILINOS) + KOKKOS_LIB_TYPE(${TARGET} INCTYPE) + #don't trust tribits to do this correctly - but need to add package name + TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) +ELSEIF(TARGET ${TARGET}) + #the 
target actually exists - this means we are doing separate libs + #or this a test library + KOKKOS_LIB_TYPE(${TARGET} INCTYPE) + TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) +ELSE() + GET_PROPERTY(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + IF (${TARGET} IN_LIST LIBS) + SET_PROPERTY(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) + ELSE() + MESSAGE(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") + ENDIF() +ENDIF() +ENDFUNCTION() + +FUNCTION(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) +IF(KOKKOS_HAS_TRILINOS) + #do nothing +ELSE() + SET(options INTERFACE) + SET(oneValueArgs) + SET(multiValueArgs) + CMAKE_PARSE_ARGUMENTS(PARSE + "INTERFACE" + "" + "" + ${ARGN}) + SET(LINK_TYPE) + IF(PARSE_INTERFACE) + SET(LINK_TYPE INTERFACE) + ELSE() + SET(LINK_TYPE PUBLIC) + ENDIF() + TARGET_LINK_LIBRARIES(${TARGET} ${LINK_TYPE} ${DEPLIB}) + VERIFY_EMPTY(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +ENDIF() +ENDFUNCTION() + +FUNCTION(KOKKOS_ADD_TEST_LIBRARY NAME) +IF (KOKKOS_HAS_TRILINOS) + TRIBITS_ADD_LIBRARY(${NAME} ${ARGN} TESTONLY) +ELSE() + SET(oneValueArgs) + SET(multiValueArgs HEADERS SOURCES) + + CMAKE_PARSE_ARGUMENTS(PARSE + "STATIC;SHARED" + "" + "HEADERS;SOURCES;DEPLIBS" + ${ARGN}) + + SET(LIB_TYPE) + IF (PARSE_STATIC) + SET(LIB_TYPE STATIC) + ELSEIF (PARSE_SHARED) + SET(LIB_TYPE SHARED) + ENDIF() + + IF(PARSE_HEADERS) + LIST(REMOVE_DUPLICATES PARSE_HEADERS) + ENDIF() + IF(PARSE_SOURCES) + LIST(REMOVE_DUPLICATES PARSE_SOURCES) + ENDIF() + ADD_LIBRARY(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) + IF (PARSE_DEPLIBS) + TARGET_LINK_LIBRARIES(${NAME} PRIVATE ${PARSE_DEPLIBS}) + ENDIF() +ENDIF() +ENDFUNCTION() + + +FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) +IF(KOKKOS_HAS_TRILINOS) + TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) +ELSE() + CMAKE_PARSE_ARGUMENTS( + INC + "REQUIRED_DURING_INSTALLATION_TESTING" + "" + "" + ${ARGN} + ) + INCLUDE_DIRECTORIES(${INC_UNPARSED_ARGUMENTS}) +ENDIF() +ENDFUNCTION() + + +MACRO(PRINTALL match) 
+get_cmake_property(_variableNames VARIABLES) +list (SORT _variableNames) +foreach (_variableName ${_variableNames}) + if("${_variableName}" MATCHES "${match}") + message(STATUS "${_variableName}=${${_variableName}}") + endif() +endforeach() +ENDMACRO() + +MACRO(SET_GLOBAL_REPLACE SUBSTR VARNAME) + STRING(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) + GLOBAL_SET(${VARNAME} ${TEMP}) +ENDMACRO() + +FUNCTION(GLOBAL_APPEND VARNAME) + #We make this a function since we are setting variables + #and want to use scope to avoid overwriting local variables + SET(TEMP ${${VARNAME}}) + LIST(APPEND TEMP ${ARGN}) + GLOBAL_SET(${VARNAME} ${TEMP}) +ENDFUNCTION() diff --git a/packages/kokkos/cmake/gnu.cmake b/packages/kokkos/cmake/gnu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..aa11fe87b111970ea440a3765c06d0b31b402d15 --- /dev/null +++ b/packages/kokkos/cmake/gnu.cmake @@ -0,0 +1,23 @@ + +FUNCTION(kokkos_set_gnu_flags full_standard int_standard) + STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) + STRING(TOLOWER ${int_standard} INT_LC_STANDARD) + # The following three blocks of code were copied from + # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. 
+ IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + SET(_std -Qstd) + SET(_ext c++) + ELSE() + SET(_std -std) + SET(_ext gnu++) + ENDIF() + + IF (CMAKE_CXX_EXTENSIONS) + SET(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) + ELSE() + SET(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) + ENDIF() +ENDFUNCTION() + diff --git a/packages/kokkos/cmake/intel.cmake b/packages/kokkos/cmake/intel.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7e6ee3358c90940195b7b8dd589f1fa500ad063f --- /dev/null +++ b/packages/kokkos/cmake/intel.cmake @@ -0,0 +1,18 @@ + +FUNCTION(kokkos_set_intel_flags full_standard int_standard) + STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) + STRING(TOLOWER ${int_standard} INT_LC_STANDARD) + # The following three blocks of code were copied from + # /Modules/Compiler/Intel-CXX.cmake from CMake 3.18.1 and then modified. 
IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + SET(_std -Qstd) + SET(_ext c++) + ELSE() + SET(_std -std) + SET(_ext gnu++) + ENDIF() + SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) +ENDFUNCTION() + + diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ec18e70a36a34dbecc305f978e0d7b84c482da37 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_arch.cmake @@ -0,0 +1,649 @@ + +FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION) + #all optimizations off by default + KOKKOS_OPTION(ARCH_${SUFFIX} OFF BOOL "Optimize for ${DESCRIPTION} (${DEV_TYPE})") + SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) + SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + IF(KOKKOS_ARCH_${SUFFIX}) + LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) + SET(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) + ENDIF() +ENDFUNCTION() + + +# Make sure devices and compiler ID are done +KOKKOS_CFG_DEPENDS(ARCH COMPILER_ID) +KOKKOS_CFG_DEPENDS(ARCH DEVICES) +KOKKOS_CFG_DEPENDS(ARCH OPTIONS) + +KOKKOS_CHECK_DEPRECATED_OPTIONS( + ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" + ARCH_RYZEN "Please replace RYZEN with ZEN or ZEN2, depending on your platform" +) + +#------------------------------------------------------------------------------- +# List of possible host architectures.
+#------------------------------------------------------------------------------- +SET(KOKKOS_ARCH_LIST) + + +KOKKOS_DEPRECATED_LIST(ARCH ARCH) +KOKKOS_ARCH_OPTION(AMDAVX HOST "AMD chip") +KOKKOS_ARCH_OPTION(ARMV80 HOST "ARMv8.0 Compatible CPU") +KOKKOS_ARCH_OPTION(ARMV81 HOST "ARMv8.1 Compatible CPU") +KOKKOS_ARCH_OPTION(ARMV8_THUNDERX HOST "ARMv8 Cavium ThunderX CPU") +KOKKOS_ARCH_OPTION(ARMV8_THUNDERX2 HOST "ARMv8 Cavium ThunderX2 CPU") +KOKKOS_ARCH_OPTION(A64FX HOST "ARMv8.2 with SVE Support") +KOKKOS_ARCH_OPTION(WSM HOST "Intel Westmere CPU") +KOKKOS_ARCH_OPTION(SNB HOST "Intel Sandy/Ivy Bridge CPUs") +KOKKOS_ARCH_OPTION(HSW HOST "Intel Haswell CPUs") +KOKKOS_ARCH_OPTION(BDW HOST "Intel Broadwell Xeon E-class CPUs") +KOKKOS_ARCH_OPTION(SKX HOST "Intel Sky Lake Xeon E-class HPC CPUs (AVX512)") +KOKKOS_ARCH_OPTION(KNC HOST "Intel Knights Corner Xeon Phi") +KOKKOS_ARCH_OPTION(KNL HOST "Intel Knights Landing Xeon Phi") +KOKKOS_ARCH_OPTION(BGQ HOST "IBM Blue Gene Q") +KOKKOS_ARCH_OPTION(POWER7 HOST "IBM POWER7 CPUs") +KOKKOS_ARCH_OPTION(POWER8 HOST "IBM POWER8 CPUs") +KOKKOS_ARCH_OPTION(POWER9 HOST "IBM POWER9 CPUs") +KOKKOS_ARCH_OPTION(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0") +KOKKOS_ARCH_OPTION(KEPLER32 GPU "NVIDIA Kepler generation CC 3.2") +KOKKOS_ARCH_OPTION(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5") +KOKKOS_ARCH_OPTION(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7") +KOKKOS_ARCH_OPTION(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0") +KOKKOS_ARCH_OPTION(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2") +KOKKOS_ARCH_OPTION(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3") +KOKKOS_ARCH_OPTION(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0") +KOKKOS_ARCH_OPTION(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1") +KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0") +KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2") +KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5") +KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA 
Ampere generation CC 8.0") +KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6") +KOKKOS_ARCH_OPTION(ZEN HOST "AMD Zen architecture") +KOKKOS_ARCH_OPTION(ZEN2 HOST "AMD Zen2 architecture") +KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") +KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906") +KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908") +KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+") + + + +IF(KOKKOS_ENABLE_COMPILER_WARNINGS) + SET(COMMON_WARNINGS + "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic" + "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") + + # OpenMPTarget compilers give erroneous warnings about sign comparison in loops + IF(KOKKOS_ENABLE_OPENMPTARGET) + LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare") + ENDIF() + + SET(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" + ${COMMON_WARNINGS}) + + COMPILER_SPECIFIC_FLAGS( + COMPILER_ID CMAKE_CXX_COMPILER_ID + PGI NO-VALUE-SPECIFIED + GNU ${GNU_WARNINGS} + DEFAULT ${COMMON_WARNINGS} + ) +ENDIF() + + +#------------------------------- KOKKOS_CUDA_OPTIONS --------------------------- +#clear anything that might be in the cache +GLOBAL_SET(KOKKOS_CUDA_OPTIONS) +# Construct the Makefile options +IF (KOKKOS_ENABLE_CUDA_LAMBDA) + IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-extended-lambda") + IF(KOKKOS_COMPILER_CUDA_VERSION GREATER_EQUAL 110) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") + ENDIF() + ENDIF() +ENDIF() + +IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) + IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") + ENDIF() +ENDIF() + +IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + SET(CUDA_ARCH_FLAG "--cuda-gpu-arch") + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda) + # Kokkos_CUDA_DIR has priority over CUDAToolkit_BIN_DIR + IF (Kokkos_CUDA_DIR) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) + ELSEIF(CUDAToolkit_BIN_DIR) + 
GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) + ENDIF() + IF (KOKKOS_ENABLE_CUDA) + SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE) + ENDIF() +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + SET(CUDA_ARCH_FLAG "-arch") +ENDIF() + +IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + STRING(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) + IF (KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -lineinfo) + ENDIF() + UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) + IF (KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.0 AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 10.0) + GLOBAL_APPEND(KOKKOS_CUDAFE_OPTIONS --diag_suppress=esa_on_defaulted_function_ignored) + ENDIF() +ENDIF() + + +#------------------------------- KOKKOS_HIP_OPTIONS --------------------------- +#clear anything that might be in the cache +GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) +IF(KOKKOS_ENABLE_HIP) + IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + SET(AMDGPU_ARCH_FLAG "--amdgpu-target") + ELSE() + SET(AMDGPU_ARCH_FLAG "--offload-arch") + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip) + IF(DEFINED ENV{ROCM_PATH}) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + ENDIF() + ENDIF() +ENDIF() + + +IF (KOKKOS_ARCH_ARMV80) + COMPILER_SPECIFIC_FLAGS( + Cray NO-VALUE-SPECIFIED + PGI NO-VALUE-SPECIFIED + DEFAULT -march=armv8-a + ) +ENDIF() + +IF (KOKKOS_ARCH_ARMV81) + COMPILER_SPECIFIC_FLAGS( + Cray NO-VALUE-SPECIFIED + PGI NO-VALUE-SPECIFIED + DEFAULT -march=armv8.1-a + ) +ENDIF() + +IF (KOKKOS_ARCH_ARMV8_THUNDERX) + SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable + COMPILER_SPECIFIC_FLAGS( + Cray NO-VALUE-SPECIFIED + PGI NO-VALUE-SPECIFIED + DEFAULT -march=armv8-a -mtune=thunderx + ) +ENDIF() + +IF (KOKKOS_ARCH_ARMV8_THUNDERX2) + SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable + COMPILER_SPECIFIC_FLAGS( + Cray NO-VALUE-SPECIFIED + PGI NO-VALUE-SPECIFIED + DEFAULT -mcpu=thunderx2t99 
-mtune=thunderx2t99 + ) +ENDIF() + +IF (KOKKOS_ARCH_A64FX) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -march=armv8.2-a+sve + Clang -march=armv8.2-a+sve -msve-vector-bits=512 + GCC -march=armv8.2-a+sve -msve-vector-bits=512 + ) +ENDIF() + +IF (KOKKOS_ARCH_ZEN) + COMPILER_SPECIFIC_FLAGS( + Intel -mavx2 + DEFAULT -march=znver1 -mtune=znver1 + ) + SET(KOKKOS_ARCH_AMD_ZEN ON) + SET(KOKKOS_ARCH_AMD_AVX2 ON) +ENDIF() + +IF (KOKKOS_ARCH_ZEN2) + COMPILER_SPECIFIC_FLAGS( + Intel -mavx2 + DEFAULT -march=znver2 -mtune=znver2 + ) + SET(KOKKOS_ARCH_AMD_ZEN2 ON) + SET(KOKKOS_ARCH_AMD_AVX2 ON) +ENDIF() + +IF (KOKKOS_ARCH_WSM) + COMPILER_SPECIFIC_FLAGS( + Intel -xSSE4.2 + PGI -tp=nehalem + Cray NO-VALUE-SPECIFIED + DEFAULT -msse4.2 + ) + SET(KOKKOS_ARCH_SSE42 ON) +ENDIF() + +IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) + SET(KOKKOS_ARCH_AVX ON) + COMPILER_SPECIFIC_FLAGS( + Intel -mavx + PGI -tp=sandybridge + Cray NO-VALUE-SPECIFIED + DEFAULT -mavx + ) +ENDIF() + +IF (KOKKOS_ARCH_HSW) + SET(KOKKOS_ARCH_AVX2 ON) + COMPILER_SPECIFIC_FLAGS( + Intel -xCORE-AVX2 + PGI -tp=haswell + Cray NO-VALUE-SPECIFIED + DEFAULT -march=core-avx2 -mtune=core-avx2 + ) +ENDIF() + +IF (KOKKOS_ARCH_BDW) + SET(KOKKOS_ARCH_AVX2 ON) + COMPILER_SPECIFIC_FLAGS( + Intel -xCORE-AVX2 + PGI -tp=haswell + Cray NO-VALUE-SPECIFIED + DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm + ) +ENDIF() + +IF (KOKKOS_ARCH_KNL) + #avx512-mic + SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable + COMPILER_SPECIFIC_FLAGS( + Intel -xMIC-AVX512 + PGI NO-VALUE-SPECIFIED + Cray NO-VALUE-SPECIFIED + DEFAULT -march=knl -mtune=knl + ) +ENDIF() + +IF (KOKKOS_ARCH_KNC) + SET(KOKKOS_USE_ISA_KNC ON) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -mmic + ) +ENDIF() + +IF (KOKKOS_ARCH_SKX) + #avx512-xeon + SET(KOKKOS_ARCH_AVX512XEON ON) + COMPILER_SPECIFIC_FLAGS( + Intel -xCORE-AVX512 + PGI NO-VALUE-SPECIFIED + Cray NO-VALUE-SPECIFIED + DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 -mrtm + ) +ENDIF() + +IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR 
KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2) + SET(KOKKOS_USE_ISA_X86_64 ON) +ENDIF() + +IF (KOKKOS_ARCH_BDW OR KOKKOS_ARCH_SKX) + SET(KOKKOS_ENABLE_TM ON) #not a cache variable +ENDIF() + +IF (KOKKOS_ARCH_POWER7) + COMPILER_SPECIFIC_FLAGS( + PGI NO-VALUE-SPECIFIED + DEFAULT -mcpu=power7 -mtune=power7 + ) + SET(KOKKOS_USE_ISA_POWERPCBE ON) +ENDIF() + +IF (KOKKOS_ARCH_POWER8) + COMPILER_SPECIFIC_FLAGS( + PGI NO-VALUE-SPECIFIED + NVIDIA NO-VALUE-SPECIFIED + DEFAULT -mcpu=power8 -mtune=power8 + ) +ENDIF() + +IF (KOKKOS_ARCH_POWER9) + COMPILER_SPECIFIC_FLAGS( + PGI NO-VALUE-SPECIFIED + NVIDIA NO-VALUE-SPECIFIED + DEFAULT -mcpu=power9 -mtune=power9 + ) +ENDIF() + +IF (KOKKOS_ARCH_POWER8 OR KOKKOS_ARCH_POWER9) + SET(KOKKOS_USE_ISA_POWERPCLE ON) +ENDIF() + +IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + COMPILER_SPECIFIC_FLAGS( + Clang -fcuda-rdc + NVIDIA --relocatable-device-code=true + ) +ENDIF() + +# Clang needs mcx16 option enabled for Windows atomic functions +IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) + COMPILER_SPECIFIC_OPTIONS( + Clang -mcx16 + ) +ENDIF() + +# MSVC ABI has many deprecation warnings, so ignore them +IF (CMAKE_CXX_COMPILER_ID STREQUAL MSVC OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + COMPILER_SPECIFIC_DEFS( + Clang _CRT_SECURE_NO_WARNINGS + ) +ENDIF() + + +#Right now we cannot get the compiler ID when cross-compiling, so just check +#that HIP is enabled +IF (KOKKOS_ENABLE_HIP) + IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fgpu-rdc + ) + ELSE() + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fno-gpu-rdc + ) + ENDIF() +ENDIF() + +IF (KOKKOS_ENABLE_SYCL) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl + ) + COMPILER_SPECIFIC_OPTIONS( + DEFAULT -fsycl-unnamed-lambda + ) +ENDIF() + + +SET(CUDA_ARCH_ALREADY_SPECIFIED "") +FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) + IF(KOKKOS_ARCH_${ARCH}) + IF(CUDA_ARCH_ALREADY_SPECIFIED) + MESSAGE(FATAL_ERROR 
"Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") + ENDIF() + SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL) + MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") + UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + ELSE() + SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) + GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + ENDIF() + ENDIF() + ENDIF() + LIST(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) + SET(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) + LIST(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) + SET(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) +ENDFUNCTION() + + +#These will define KOKKOS_CUDA_ARCH_FLAG +#to the corresponding flag name if ON +CHECK_CUDA_ARCH(KEPLER30 sm_30) +CHECK_CUDA_ARCH(KEPLER32 sm_32) +CHECK_CUDA_ARCH(KEPLER35 sm_35) +CHECK_CUDA_ARCH(KEPLER37 sm_37) +CHECK_CUDA_ARCH(MAXWELL50 sm_50) +CHECK_CUDA_ARCH(MAXWELL52 sm_52) +CHECK_CUDA_ARCH(MAXWELL53 sm_53) +CHECK_CUDA_ARCH(PASCAL60 sm_60) +CHECK_CUDA_ARCH(PASCAL61 sm_61) +CHECK_CUDA_ARCH(VOLTA70 sm_70) +CHECK_CUDA_ARCH(VOLTA72 sm_72) +CHECK_CUDA_ARCH(TURING75 sm_75) +CHECK_CUDA_ARCH(AMPERE80 sm_80) +CHECK_CUDA_ARCH(AMPERE86 sm_86) + +SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") +FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) + IF(KOKKOS_ARCH_${ARCH}) + IF(AMDGPU_ARCH_ALREADY_SPECIFIED) + MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. 
If you are re-running CMake, try clearing the cache and running again.") + ENDIF() + SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET) + MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") + UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + ELSE() + SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + ENDIF() + ENDIF() + ENDIF() +ENDFUNCTION() + +#These will define KOKKOS_AMDGPU_ARCH_FLAG +#to the corresponding flag name if ON +CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25 +CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60 +CHECK_AMDGPU_ARCH(VEGA908 gfx908) + +IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED) + MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. 
" + "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") +ENDIF() + +IF (KOKKOS_ENABLE_OPENMPTARGET) + SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + IF (CLANG_CUDA_ARCH) + STRING(REPLACE "sm_" "cc" PGI_CUDA_ARCH ${CLANG_CUDA_ARCH}) + COMPILER_SPECIFIC_FLAGS( + Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda + XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG} + PGI -gpu=${PGI_CUDA_ARCH} + ) + ENDIF() + SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) + IF (CLANG_AMDGPU_ARCH) + COMPILER_SPECIFIC_FLAGS( + Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa + ) + ENDIF() + IF (KOKKOS_ARCH_INTEL_GEN) + COMPILER_SPECIFIC_FLAGS( + IntelClang -fopenmp-targets=spir64 -D__STRICT_ANSI__ + ) + ENDIF() +ENDIF() + +IF (KOKKOS_ENABLE_SYCL) + IF(CUDA_ARCH_ALREADY_SPECIFIED) + IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda-sycldevice + ) + # FIXME_SYCL The CUDA backend doesn't support printf yet. 
+ GLOBAL_SET(KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF ON) + ELSE() + MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") + ENDIF() + ELSEIF(KOKKOS_ARCH_INTEL_GEN) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl" + ) + ENDIF() +ENDIF() + +IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) + # Try to autodetect the CUDA Compute Capability by asking the device + SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) + FILE(REMOVE_RECURSE ${_BINARY_TEST_DIR}) + FILE(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) + + TRY_RUN( + _RESULT + _COMPILE_RESULT + ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc + COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + + # if user is using kokkos_compiler_launcher, above will fail. + IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) + GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough + IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + # make sure the user knows that we aren't using CUDA compiler for anything else + MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. 
Enabling CUDA language ONLY to auto-detect architecture...") + INCLUDE(CheckLanguage) + CHECK_LANGUAGE(CUDA) + IF(CMAKE_CUDA_COMPILER) + ENABLE_LANGUAGE(CUDA) + ELSE() + MESSAGE(STATUS "CUDA language could not be enabled") + ENDIF() + ENDIF() + + # if CUDA was enabled, this will be defined + IF(CMAKE_CUDA_COMPILER) + # copy our test to .cu so cmake compiles as CUDA + CONFIGURE_FILE( + ${PROJECT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc + ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu + COPYONLY + ) + # run test again + TRY_RUN( + _RESULT + _COMPILE_RESULT + ${_BINARY_TEST_DIR} + ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu + COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + ENDIF() + ENDIF() + + LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) + IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) + MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") + LIST(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) + KOKKOS_SET_OPTION(ARCH_${ARCHITECTURE} ON) + CHECK_CUDA_ARCH(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) + LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) + ELSE() + MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " + "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" + "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. 
" + "If you are cross-compiling, you should try to do this on a compute node.") + ENDIF() +ENDIF() + +IF (KOKKOS_ENABLE_CUDA) + #Regardless of version, make sure we define the general architecture name + IF (KOKKOS_ARCH_KEPLER30 OR KOKKOS_ARCH_KEPLER32 OR KOKKOS_ARCH_KEPLER35 OR KOKKOS_ARCH_KEPLER37) + SET(KOKKOS_ARCH_KEPLER ON) + ENDIF() + + #Regardless of version, make sure we define the general architecture name + IF (KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) + SET(KOKKOS_ARCH_MAXWELL ON) + ENDIF() + + #Regardless of version, make sure we define the general architecture name + IF (KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) + SET(KOKKOS_ARCH_PASCAL ON) + ENDIF() + + #Regardless of version, make sure we define the general architecture name + IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) + SET(KOKKOS_ARCH_VOLTA ON) + ENDIF() + + IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) + SET(KOKKOS_ARCH_AMPERE ON) + ENDIF() +ENDIF() + +#CMake verbose is kind of pointless +#Let's just always print things +MESSAGE(STATUS "Built-in Execution Spaces:") + +FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL) + STRING(TOUPPER ${_BACKEND} UC_BACKEND) + IF(KOKKOS_ENABLE_${UC_BACKEND}) + IF(_DEVICE_PARALLEL) + MESSAGE(FATAL_ERROR "Multiple device parallel execution spaces are not allowed! " + "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_DEVICE_PARALLEL} is already enabled. 
" + "Remove the CMakeCache.txt file and re-configure.") + ENDIF() + IF (${_BACKEND} STREQUAL "Cuda") + IF(KOKKOS_ENABLE_CUDA_UVM) + SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}UVMSpace") + ELSE() + SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") + ENDIF() + SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + ELSE() + SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::Experimental::${_BACKEND}Space") + SET(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") + ENDIF() + ENDIF() +ENDFOREACH() +IF(NOT _DEVICE_PARALLEL) + SET(_DEVICE_PARALLEL "NoTypeDefined") + SET(_DEFAULT_DEVICE_MEMSPACE "NoTypeDefined") +ENDIF() +MESSAGE(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") +IF(KOKKOS_ENABLE_PTHREAD) + SET(KOKKOS_ENABLE_THREADS ON) +ENDIF() + +FOREACH (_BACKEND OpenMP Threads HPX) + STRING(TOUPPER ${_BACKEND} UC_BACKEND) + IF(KOKKOS_ENABLE_${UC_BACKEND}) + IF(_HOST_PARALLEL) + MESSAGE(FATAL_ERROR "Multiple host parallel execution spaces are not allowed! " + "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_HOST_PARALLEL} is already enabled. 
" + "Remove the CMakeCache.txt file and re-configure.") + ENDIF() + IF (${_BACKEND} STREQUAL "HPX") + SET(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") + ELSE() + SET(_HOST_PARALLEL "Kokkos::${_BACKEND}") + ENDIF() + ENDIF() +ENDFOREACH() + +IF(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) + MESSAGE(FATAL_ERROR "At least one host execution space must be enabled, " + "but no host parallel execution space was requested " + "and Kokkos_ENABLE_SERIAL=OFF.") +ENDIF() + +IF(_HOST_PARALLEL) +MESSAGE(STATUS " Host Parallel: ${_HOST_PARALLEL}") +ELSE() + SET(_HOST_PARALLEL "NoTypeDefined") + MESSAGE(STATUS " Host Parallel: NoTypeDefined") +ENDIF() + +IF(KOKKOS_ENABLE_SERIAL) + MESSAGE(STATUS " Host Serial: SERIAL") +ELSE() + MESSAGE(STATUS " Host Serial: NONE") +ENDIF() + +MESSAGE(STATUS "") +MESSAGE(STATUS "Architectures:") +FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) + MESSAGE(STATUS " ${Arch}") +ENDFOREACH() diff --git a/packages/kokkos/cmake/kokkos_check_env.cmake b/packages/kokkos/cmake/kokkos_check_env.cmake new file mode 100644 index 0000000000000000000000000000000000000000..a455a403b9d5ed0fa3772d2d8e619347061bd65e --- /dev/null +++ b/packages/kokkos/cmake/kokkos_check_env.cmake @@ -0,0 +1,12 @@ +SET(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) +IF (CRAYPE_VERSION) + SET(KOKKOS_IS_CRAYPE TRUE) + SET(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) + IF (CRAYPE_LINK_TYPE) + IF (NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") + MESSAGE(WARNING "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'") + ENDIF() + ELSE() + MESSAGE(WARNING "CRAYPE_LINK_TYPE is not set. 
Linking is likely to fail unless this is set to 'dynamic'") + ENDIF() +ENDIF() diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4434d6928f46429ad7525c944a0c1c6c351c4cdd --- /dev/null +++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake @@ -0,0 +1,173 @@ +KOKKOS_CFG_DEPENDS(COMPILER_ID NONE) + +SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) +SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) +SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) + +MACRO(kokkos_internal_have_compiler_nvcc) + # Check if the compiler is nvcc (which really means nvcc_wrapper). + EXECUTE_PROCESS(COMMAND ${ARGN} --version + OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + SET(INTERNAL_HAVE_COMPILER_NVCC true) + ELSE() + SET(INTERNAL_HAVE_COMPILER_NVCC false) + ENDIF() +ENDMACRO() + +IF(Kokkos_ENABLE_CUDA) + # find kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + + # check if compiler was set to nvcc_wrapper + kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) + # if launcher was found and nvcc_wrapper was not specified as + # compiler, set to use launcher. 
Will ensure CMAKE_CXX_COMPILER + # is replaced by nvcc_wrapper + IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + # the first argument to launcher is always the C++ compiler defined by cmake + # if the second argument matches the C++ compiler, it forwards the rest of the + # args to nvcc_wrapper + kokkos_internal_have_compiler_nvcc( + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) + SET(INTERNAL_USE_COMPILER_LAUNCHER true) + ENDIF() +ENDIF() + +IF(INTERNAL_HAVE_COMPILER_NVCC) + # Save the host compiler id before overwriting it. + SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) + + # SET the compiler id to nvcc. We use the value used by CMake 3.8. + SET(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) + + STRING(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" + TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) + SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") + IF(INTERNAL_USE_COMPILER_LAUNCHER) + MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") + kokkos_compilation(GLOBAL) + ENDIF() +ENDIF() + +IF(Kokkos_ENABLE_HIP) + # get HIP version + EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version + OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + + STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + ENDIF() + + STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" + TEMP_CXX_COMPILER_VERSION 
${INTERNAL_COMPILER_VERSION_ONE_LINE}) + SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") +ENDIF() + +IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + # The Cray compiler reports as Clang to most versions of CMake + EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c Cray + OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE) + IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang + SET(KOKKOS_CLANG_IS_CRAY TRUE) + ENDIF() + # The clang based Intel compiler reports as Clang to most versions of CMake + EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c "DPC++\\|icpx" + OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE) + IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang + SET(KOKKOS_CLANG_IS_INTEL TRUE) + SET(KOKKOS_CXX_COMPILER_ID IntelClang CACHE STRING INTERNAL FORCE) + ENDIF() +ENDIF() + +IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) + # SET Cray's compiler version. 
+ EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version + OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" + TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + IF (KOKKOS_CLANG_IS_CRAY) + SET(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) + ELSE() + SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + ENDIF() +ENDIF() + +IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) + # SET Fujitsus compiler version which is not detected by CMake + EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version + OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" + TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) +ENDIF() + +# Enforce the minimum compilers supported by Kokkos. +SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. 
Required compiler versions:") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 4.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 5.3.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 17.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 9.2.88 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 3.8.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.4 or higher\n") + +IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 5.3.0) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.0.0) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 9.2.88) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() + SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.4) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() +ENDIF() + +STRING(REPLACE "." 
";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) +LIST(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) +LIST(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) +LIST(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) diff --git a/packages/kokkos/cmake/kokkos_corner_cases.cmake b/packages/kokkos/cmake/kokkos_corner_cases.cmake new file mode 100644 index 0000000000000000000000000000000000000000..a84ac2b63027e7112cb3a7b76e5e9a7b8fc892e3 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_corner_cases.cmake @@ -0,0 +1,62 @@ +IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_CLANG_IS_CRAY AND NOT KOKKOS_COMPILER_CLANG_MSVC) + # The clang "version" doesn't actually tell you what runtimes and tools + # were built into Clang. We should therefore make sure that libomp + # was actually built into Clang. Otherwise the user will get nonsensical + # errors when they try to build. + + #Try compile is the height of CMake nonsense + #I can't just give it compiler and link flags + #I have to hackily pretend that compiler flags are compiler definitions + #and that linker flags are libraries + #also - this is easier to use than CMakeCheckCXXSourceCompiles + TRY_COMPILE(CLANG_HAS_OMP + ${KOKKOS_TOP_BUILD_DIR}/corner_cases + ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/clang_omp.cpp + COMPILE_DEFINITIONS -fopenmp=libomp + LINK_LIBRARIES -fopenmp=libomp + ) + IF (NOT CLANG_HAS_OMP) + UNSET(CLANG_HAS_OMP CACHE) #make sure CMake always re-runs this + MESSAGE(FATAL_ERROR "Clang failed OpenMP check. You have requested -DKokkos_ENABLE_OPENMP=ON, but the Clang compiler does not appear to have been built with OpenMP support") + ENDIF() + UNSET(CLANG_HAS_OMP CACHE) #make sure CMake always re-runs this +ENDIF() + +IF(KOKKOS_CXX_COMPILER_ID STREQUAL AppleClang AND KOKKOS_ENABLE_OPENMP) + # The clang "version" doesn't actually tell you what runtimes and tools + # were built into Clang. We should therefore make sure that libomp + # was actually built into Clang. 
Otherwise the user will get nonsensical + # errors when they try to build. + + #Try compile is the height of CMake nonsense + #I can't just give it compiler and link flags + #I have to hackily pretend that compiler flags are compiler definitions + #and that linker flags are libraries + #also - this is easier to use than CMakeCheckCXXSourceCompiles + TRY_COMPILE(APPLECLANG_HAS_OMP + ${KOKKOS_TOP_BUILD_DIR}/corner_cases + ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/clang_omp.cpp + COMPILE_DEFINITIONS -Xpreprocessor -fopenmp + LINK_LIBRARIES -lomp + ) + IF (NOT APPLECLANG_HAS_OMP) + UNSET(APPLECLANG_HAS_OMP CACHE) #make sure CMake always re-runs this + MESSAGE(FATAL_ERROR "AppleClang failed OpenMP check. You have requested -DKokkos_ENABLE_OPENMP=ON, but the AppleClang compiler does not appear to have been built with OpenMP support") + ENDIF() + UNSET(APPLECLANG_HAS_OMP CACHE) #make sure CMake always re-runs this +ENDIF() + + +IF (KOKKOS_CXX_STANDARD STREQUAL 17) + IF (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 7) + MESSAGE(FATAL_ERROR "You have requested C++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC < 7 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.") + ENDIF() + + IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11) + MESSAGE(FATAL_ERROR "You have requested C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.") + ENDIF() + IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR) + MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON with C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs. 
See https://github.com/kokkos/kokkos/issues/3496") + ENDIF() +ENDIF() + diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake new file mode 100644 index 0000000000000000000000000000000000000000..445dad47ce561979037bf5b1622413ddda05f3b3 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake @@ -0,0 +1,161 @@ + +FUNCTION(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) + KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + STRING(TOUPPER ${SUFFIX} UC_NAME) + IF (KOKKOS_ENABLE_${UC_NAME}) + LIST(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) + #I hate that CMake makes me do this + SET(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) + ENDIF() + SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) + IF (KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") + SET(KOKKOS_HAS_HOST ON PARENT_SCOPE) + ENDIF() +ENDFUNCTION() + +KOKKOS_CFG_DEPENDS(DEVICES NONE) + +# Put a check in just in case people are using this option +KOKKOS_DEPRECATED_LIST(DEVICES ENABLE) + + +KOKKOS_DEVICE_OPTION(PTHREAD OFF HOST "Whether to build Pthread backend") +IF (KOKKOS_ENABLE_PTHREAD) + #patch the naming here + SET(KOKKOS_ENABLE_THREADS ON) +ENDIF() + +# detect clang++ / cl / clang-cl clashes +IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + # this specific test requires CMake >= 3.15 + IF ("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") + # use pure clang++ instead of clang-cl + SET(KOKKOS_COMPILER_CLANG_MSVC OFF) + ELSE() + # it defaults to clang-cl + SET(KOKKOS_COMPILER_CLANG_MSVC ON) + ENDIF() +ENDIF() + +IF(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) + SET(OMP_DEFAULT ON) +ELSE() + SET(OMP_DEFAULT OFF) +ENDIF() +KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") +IF(KOKKOS_ENABLE_OPENMP) + SET(ClangOpenMPFlag -fopenmp=libomp) + IF(KOKKOS_CLANG_IS_CRAY) + SET(ClangOpenMPFlag 
-fopenmp) + ENDIF() + IF(KOKKOS_COMPILER_CLANG_MSVC) + #for clang-cl expression /openmp yields an error, so directly add the specific Clang flag + SET(ClangOpenMPFlag /clang:-fopenmp=libomp) + ENDIF() + IF(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL Clang) + #link omp library from LLVM lib dir, no matter if it is clang-cl or clang++ + get_filename_component(LLVM_BIN_DIR ${CMAKE_CXX_COMPILER_AR} DIRECTORY) + COMPILER_SPECIFIC_LIBS(Clang "${LLVM_BIN_DIR}/../lib/libomp.lib") + ENDIF() + IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + Clang -Xcompiler ${ClangOpenMPFlag} + IntelClang -Xcompiler -fiopenmp + PGI -Xcompiler -mp + Cray NO-VALUE-SPECIFIED + XL -Xcompiler -qsmp=omp + DEFAULT -Xcompiler -fopenmp + ) + ELSE() + COMPILER_SPECIFIC_FLAGS( + Clang ${ClangOpenMPFlag} + IntelClang -fiopenmp + AppleClang -Xpreprocessor -fopenmp + PGI -mp + Cray NO-VALUE-SPECIFIED + XL -qsmp=omp + DEFAULT -fopenmp + ) + COMPILER_SPECIFIC_LIBS( + AppleClang -lomp + ) + ENDIF() +ENDIF() + +KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") +IF (KOKKOS_ENABLE_OPENMPTARGET) + SET(ClangOpenMPFlag -fopenmp=libomp) + IF(KOKKOS_CLANG_IS_CRAY) + SET(ClangOpenMPFlag -fopenmp) + ENDIF() + + COMPILER_SPECIFIC_FLAGS( + Clang ${ClangOpenMPFlag} -Wno-openmp-mapping + IntelClang -fiopenmp -Wno-openmp-mapping + XL -qsmp=omp -qoffload -qnoeh + PGI -mp=gpu + DEFAULT -fopenmp + ) + COMPILER_SPECIFIC_DEFS( + XL KOKKOS_IBM_XL_OMP45_WORKAROUND + Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG + ) +# Are there compilers which identify as Clang and need this library? 
+# COMPILER_SPECIFIC_LIBS( +# Clang -lopenmptarget +# ) + IF(KOKKOS_CXX_STANDARD LESS 17) + MESSAGE(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") + ENDIF() +ENDIF() + +IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) + SET(CUDA_DEFAULT ON) +ELSE() + SET(CUDA_DEFAULT OFF) +ENDIF() +KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") + +IF (KOKKOS_ENABLE_CUDA) + GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") + IF(WIN32 AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS -x cu) + ENDIF() +## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros + LIST(APPEND DEVICE_SETUP_LIST Cuda) +ENDIF() + +# We want this to default to OFF for cache reasons, but if no +# host space is given, then activate serial +IF (KOKKOS_HAS_TRILINOS) + #However, Trilinos always wants Serial ON + SET(SERIAL_DEFAULT ON) +ELSEIF (KOKKOS_HAS_HOST) + SET(SERIAL_DEFAULT OFF) +ELSE() + SET(SERIAL_DEFAULT ON) + IF (NOT DEFINED Kokkos_ENABLE_SERIAL) + MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. 
To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") + ENDIF() +ENDIF() +KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") + +KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") + +KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") + +## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros +IF (KOKKOS_ENABLE_HIP) + LIST(APPEND DEVICE_SETUP_LIST HIP) +ENDIF() + +KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") + +## SYCL has extra setup requirements, turn on Kokkos_Setup_SYCL.hpp in macros +IF (KOKKOS_ENABLE_SYCL) + IF(KOKKOS_CXX_STANDARD LESS 17) + MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") + ENDIF() + LIST(APPEND DEVICE_SETUP_LIST SYCL) +ENDIF() diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake new file mode 100644 index 0000000000000000000000000000000000000000..95bce66c7bee32f8800cbd6e0324f9d4c599c97c --- /dev/null +++ b/packages/kokkos/cmake/kokkos_enable_options.cmake @@ -0,0 +1,125 @@ +########################## NOTES ############################################### +# List the options for configuring kokkos using CMake method of doing it. +# These options then get mapped onto KOKKOS_SETTINGS environment variable by +# kokkos_settings.cmake. It is separate to allow other packages to override +# these variables (e.g., TriBITS). 
+ +########################## AVAILABLE OPTIONS ################################### +# Use lists for documentation, verification, and programming convenience + + +FUNCTION(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) + KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + STRING(TOUPPER ${SUFFIX} UC_NAME) + IF (KOKKOS_ENABLE_${UC_NAME}) + LIST(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) + #I hate that CMake makes me do this + SET(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) + ENDIF() + SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) +ENDFUNCTION() + +# Certain defaults will depend on knowing the enabled devices +KOKKOS_CFG_DEPENDS(OPTIONS DEVICES) +KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) + +# Put a check in just in case people are using this option +KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) + +KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") +KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") +KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") +KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") +KOKKOS_ENABLE_OPTION(HPX_ASYNC_DISPATCH OFF "Whether HPX supports asynchronous dispatch") +KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") +KOKKOS_ENABLE_OPTION(EXAMPLES OFF "Whether to build the examples") +STRING(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) +IF(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + KOKKOS_ENABLE_OPTION(DEBUG ON "Whether to activate extra debug features - may increase compile times") + KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") +ELSE() + KOKKOS_ENABLE_OPTION(DEBUG OFF "Whether to activate extra debug features - may increase compile times") + KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") +ENDIF() 
+UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) +KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") +KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") +KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") +KOKKOS_ENABLE_OPTION(PROFILING_LOAD_PRINT OFF "Whether to print information about which profiling tools got loaded") +KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") +KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER ON "Whether to potentially use the launch compiler") + +IF (KOKKOS_ENABLE_CUDA) + SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}") +ENDIF() + +IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) + SET(CUDA_LAMBDA_DEFAULT ON) +ELSEIF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) + SET(CUDA_LAMBDA_DEFAULT ON) +ELSE() + SET(CUDA_LAMBDA_DEFAULT OFF) +ENDIF() +KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to activate experimental lambda features") +IF (Trilinos_ENABLE_Kokkos) + SET(COMPLEX_ALIGN_DEFAULT OFF) +ELSE() + SET(COMPLEX_ALIGN_DEFAULT ON) +ENDIF() +KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") + +IF (KOKKOS_ENABLE_TESTS) + SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +ELSE() + SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +ENDIF() +KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") +IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. 
Option will be ignored.") +ENDIF() + +IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) + SET(CUDA_CONSTEXPR_DEFAULT ON) +ELSE() + SET(CUDA_CONSTEXPR_DEFAULT OFF) +ENDIF() +KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") + +Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") + +FUNCTION(check_device_specific_options) + CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) + IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) + FOREACH(OPTION ${SOME_OPTIONS}) + IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") + ENDIF() + IF(KOKKOS_ENABLE_${OPTION}) + MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored.") + UNSET(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) + ENDIF() + ENDFOREACH() + ENDIF() +ENDFUNCTION() + +CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC) +CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) +CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS HPX_ASYNC_DISPATCH) + +# Needed due to change from deprecated name to new header define name +IF (KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) + SET(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) +ENDIF() + +# This is known to occur with Clang 9. 
We would need to use nvcc as the linker +# http://lists.llvm.org/pipermail/cfe-dev/2018-June/058296.html +# TODO: Through great effort we can use a different linker by hacking +# CMAKE_CXX_LINK_EXECUTABLE in a future release +IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + MESSAGE(FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC") +ENDIF() + +IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) + MESSAGE(FATAL_ERROR "Relocatable device code requires static libraries.") +ENDIF() diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake new file mode 100644 index 0000000000000000000000000000000000000000..858322394d7aefcb9fe23f55a60863f3a8f63484 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_functions.cmake @@ -0,0 +1,1006 @@ +################################### FUNCTIONS ################################## +# List of functions +# kokkos_option + +# Validate options are given with correct case and define an internal +# upper-case version for use within + +# +# +# @FUNCTION: kokkos_deprecated_list +# +# Function that checks if a deprecated list option like Kokkos_ARCH was given. +# This prints an error and prevents configure from completing. +# It attempts to print a helpful message about updating the options for the new CMake. +# Kokkos_${SUFFIX} is the name of the option (like Kokkos_ARCH) being checked. +# Kokkos_${PREFIX}_X is the name of new option to be defined from a list X,Y,Z,... +FUNCTION(kokkos_deprecated_list SUFFIX PREFIX) + SET(CAMEL_NAME Kokkos_${SUFFIX}) + STRING(TOUPPER ${CAMEL_NAME} UC_NAME) + + #I don't love doing it this way but better to be safe + FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) + STRING(TOUPPER ${opt} OPT_UC) + IF ("${OPT_UC}" STREQUAL "${UC_NAME}") + STRING(REPLACE "," ";" optlist "${${opt}}") + SET(ERROR_MSG "Given deprecated option list ${opt}. 
This must now be given as separate -D options, which assuming you spelled options correctly would be:") + FOREACH(entry ${optlist}) + STRING(TOUPPER ${entry} ENTRY_UC) + STRING(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") + ENDFOREACH() + STRING(APPEND ERROR_MSG "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it).") + IF (KOKKOS_HAS_TRILINOS) + MESSAGE(WARNING ${ERROR_MSG}) + FOREACH(entry ${optlist}) + STRING(TOUPPER ${entry} ENTRY_UC) + SET(${CAMEL_NAME}_${ENTRY_UC} ON CACHE BOOL "Deprecated Trilinos translation") + ENDFOREACH() + UNSET(${opt} CACHE) + ELSE() + MESSAGE(SEND_ERROR ${ERROR_MSG}) + ENDIF() + ENDIF() + ENDFOREACH() +ENDFUNCTION() + +FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) + SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + STRING(TOUPPER ${CAMEL_NAME} UC_NAME) + + LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + LIST(APPEND KOKKOS_OPTION_TYPES ${TYPE}) + SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + + # Make sure this appears in the cache with the appropriate DOCSTRING + SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) + + #I don't love doing it this way because it's N^2 in number options, but cest la vie + FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) + STRING(TOUPPER ${opt} OPT_UC) + IF ("${OPT_UC}" STREQUAL "${UC_NAME}") + IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") + IF (KOKKOS_HAS_TRILINOS) + #Allow this for now if Trilinos... we need to bootstrap our way to integration + MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") + SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) + UNSET(${opt} CACHE) + ELSE() + MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. 
Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") + ENDIF() + ENDIF() + ENDIF() + ENDFOREACH() + + #okay, great, we passed the validation test - use the default + IF (DEFINED ${CAMEL_NAME}) + SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + ELSE() + SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + ENDIF() +ENDFUNCTION() + +FUNCTION(kokkos_set_option CAMEL_SUFFIX VALUE) + LIST(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) + IF(OPTION_INDEX EQUAL -1) + MESSAGE(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") + ENDIF() + SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + STRING(TOUPPER ${CAMEL_NAME} UC_NAME) + + LIST(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) + LIST(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) + SET(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) + MESSAGE(STATUS "Setting ${CAMEL_NAME}=${VALUE}") + SET(${UC_NAME} ${VALUE} PARENT_SCOPE) +ENDFUNCTION() + +FUNCTION(kokkos_append_config_line LINE) + GLOBAL_APPEND(KOKKOS_TPL_EXPORTS "${LINE}") +ENDFUNCTION() + +MACRO(kokkos_export_cmake_tpl NAME) + #CMake TPLs are located with a call to find_package + #find_package locates XConfig.cmake files through + #X_DIR or X_ROOT variables set prior to calling find_package + + #If Kokkos was configured to find the TPL through a _DIR variable + #make sure thar DIR variable is available to downstream packages + IF (DEFINED ${NAME}_DIR) + #The downstream project may override the TPL location that Kokkos used + #Check if the downstream project chose its own TPL location + #If not, make the Kokkos found location available + KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_DIR)") + KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_DIR ${${NAME}_DIR})") + KOKKOS_APPEND_CONFIG_LINE("ENDIF()") + ENDIF() + + IF (DEFINED ${NAME}_ROOT) + #The downstream project may override the TPL location that Kokkos used + #Check if the downstream project chose its own TPL 
location + #If not, make the Kokkos found location available + KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_ROOT)") + KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_ROOT ${${NAME}_ROOT})") + KOKKOS_APPEND_CONFIG_LINE("ENDIF()") + ENDIF() + KOKKOS_APPEND_CONFIG_LINE("FIND_DEPENDENCY(${NAME})") +ENDMACRO() + +MACRO(kokkos_export_imported_tpl NAME) + IF (NOT KOKKOS_HAS_TRILINOS) + GET_TARGET_PROPERTY(LIB_IMPORTED ${NAME} IMPORTED) + IF (NOT LIB_IMPORTED) + # This is not an imported target + # This an interface library that we created + INSTALL( + TARGETS ${NAME} + EXPORT KokkosTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + ELSE() + #make sure this also gets "exported" in the config file + KOKKOS_APPEND_CONFIG_LINE("IF(NOT TARGET ${NAME})") + + GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE) + IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") + KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") + KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + ELSE() + KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") + KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) + IF(TPL_LIBRARY) + KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") + ENDIF() + ENDIF() + + GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) + IF(TPL_INCLUDES) + KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") + ENDIF() + + GET_TARGET_PROPERTY(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) + IF(TPL_COMPILE_OPTIONS) + KOKKOS_APPEND_CONFIG_LINE("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") + ENDIF() + + SET(TPL_LINK_OPTIONS) + GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) + IF(TPL_LINK_OPTIONS) + KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") + ENDIF() + + 
GET_TARGET_PROPERTY(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) + IF(TPL_LINK_LIBRARIES) + KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") + ENDIF() + KOKKOS_APPEND_CONFIG_LINE(")") + KOKKOS_APPEND_CONFIG_LINE("ENDIF()") + ENDIF() + ENDIF() +ENDMACRO() + + +# +# @MACRO: KOKKOS_IMPORT_TPL() +# +# Function that checks if a third-party library (TPL) has been enabled and calls `find_package` +# to create an imported target encapsulating all the flags and libraries +# needed to use the TPL +# +# Usage:: +# +# KOKKOS_IMPORT_TPL( +# <NAME> +# NO_EXPORT +# INTERFACE +# +# ``NO_EXPORT`` +# +# If specified, this TPL will not be added to KokkosConfig.cmake as an export +# +# ``INTERFACE`` +# +# If specified, this TPL will build an INTERFACE library rather than an +# IMPORTED target +IF (KOKKOS_HAS_TRILINOS) +MACRO(kokkos_import_tpl NAME) + #do nothing +ENDMACRO() +ELSE() +MACRO(kokkos_import_tpl NAME) + CMAKE_PARSE_ARGUMENTS(TPL + "NO_EXPORT;INTERFACE" + "" + "" + ${ARGN}) + IF (TPL_INTERFACE) + SET(TPL_IMPORTED_NAME ${NAME}) + ELSE() + SET(TPL_IMPORTED_NAME Kokkos::${NAME}) + ENDIF() + + # Even though this policy gets set in the top-level CMakeLists.txt, + # I have still been getting errors about ROOT variables being ignored + # I'm not sure if this is a scope issue - but make sure + # the policy is set before we do any find_package calls + CMAKE_POLICY(SET CMP0074 NEW) + + IF (KOKKOS_ENABLE_${NAME}) + #Tack on a TPL here to make sure we avoid using anyone else's find + FIND_PACKAGE(TPL${NAME} REQUIRED MODULE) + IF(NOT TARGET ${TPL_IMPORTED_NAME}) + MESSAGE(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") + ENDIF() + IF(NOT TPL_NO_EXPORT) + KOKKOS_EXPORT_IMPORTED_TPL(${TPL_IMPORTED_NAME}) + ENDIF() + LIST(APPEND KOKKOS_ENABLED_TPLS ${NAME}) + ENDIF() +ENDMACRO(kokkos_import_tpl) +ENDIF() + +MACRO(kokkos_import_cmake_tpl MODULE_NAME) + kokkos_import_tpl(${MODULE_NAME} ${ARGN} 
NO_EXPORT) + CMAKE_PARSE_ARGUMENTS(TPL + "NO_EXPORT" + "OPTION_NAME" + "" + ${ARGN}) + + IF (NOT TPL_OPTION_NAME) + SET(TPL_OPTION_NAME ${MODULE_NAME}) + ENDIF() + + IF (NOT TPL_NO_EXPORT) + KOKKOS_EXPORT_CMAKE_TPL(${MODULE_NAME}) + ENDIF() +ENDMACRO() + +# +# @MACRO: KOKKOS_CREATE_IMPORTED_TPL() +# +# Function that creates an imported target encapsulating all the flags +# and libraries needed to use the TPL +# +# Usage:: +# +# KOKKOS_CREATE_IMPORTED_TPL( +# <NAME> +# INTERFACE +# LIBRARY <path_to_library> +# LINK_LIBRARIES <lib1> <lib2> ... +# COMPILE_OPTIONS <opt1> <opt2> ... +# LINK_OPTIONS <opt1> <opt2> ... +# +# ``INTERFACE`` +# +# If specified, this TPL will build an INTERFACE library rather than an +# IMPORTED target +# +# ``LIBRARY <path_to_library>`` +# +# If specified, this gives the IMPORTED_LOCATION of the library. +# +# ``LINK_LIBRARIES <lib1> <lib2> ...`` +# +# If specified, this gives a list of dependent libraries that also +# need to be linked against. Each entry can be a library path or +# the name of a valid CMake target. +# +# ``INCLUDES <path1> <path2> ...`` +# +# If specified, this gives a list of directories that must be added +# to the include path for using this library. +# +# ``COMPILE_OPTIONS <opt1> <opt2> ...`` +# +# If specified, this gives a list of compiler flags that must be used +# for using this library. +# +# ``LINK_OPTIONS <opt1> <opt2> ...`` +# +# If specified, this gives a list of linker flags that must be used +# for using this library. 
+MACRO(kokkos_create_imported_tpl NAME) + CMAKE_PARSE_ARGUMENTS(TPL + "INTERFACE" + "LIBRARY" + "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" + ${ARGN}) + + + IF (KOKKOS_HAS_TRILINOS) + #TODO: we need to set a bunch of cache variables here + ELSEIF (TPL_INTERFACE) + ADD_LIBRARY(${NAME} INTERFACE) + #Give this an importy-looking name + ADD_LIBRARY(Kokkos::${NAME} ALIAS ${NAME}) + IF (TPL_LIBRARY) + MESSAGE(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") + ENDIF() + #Things have to go in quoted in case we have multiple list entries + IF(TPL_LINK_LIBRARIES) + TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) + ENDIF() + IF(TPL_INCLUDES) + TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) + ENDIF() + IF(TPL_COMPILE_DEFINITIONS) + TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + ENDIF() + IF(TPL_COMPILE_OPTIONS) + TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) + ENDIF() + IF(TPL_LINK_OPTIONS) + TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) + ENDIF() + ELSE() + ADD_LIBRARY(${NAME} UNKNOWN IMPORTED) + IF(TPL_LIBRARY) + SET_TARGET_PROPERTIES(${NAME} PROPERTIES + IMPORTED_LOCATION ${TPL_LIBRARY}) + ENDIF() + #Things have to go in quoted in case we have multiple list entries + IF(TPL_LINK_LIBRARIES) + SET_TARGET_PROPERTIES(${NAME} PROPERTIES + INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") + ENDIF() + IF(TPL_INCLUDES) + SET_TARGET_PROPERTIES(${NAME} PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") + ENDIF() + IF(TPL_COMPILE_DEFINITIONS) + SET_TARGET_PROPERTIES(${NAME} PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") + ENDIF() + IF(TPL_COMPILE_OPTIONS) + SET_TARGET_PROPERTIES(${NAME} PROPERTIES + INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") + ENDIF() + IF(TPL_LINK_OPTIONS) + SET_TARGET_PROPERTIES(${NAME} PROPERTIES + INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") + ENDIF() + ENDIF() 
+ENDMACRO() + +# +# @MACRO: KOKKOS_FIND_HEADER +# +# Function that finds a particular header. This searches custom paths +# or default system paths depending on options. In contrast to CMake +# default, custom paths are prioritized over system paths. The search +# order is: +# 1. <NAME>_ROOT variable +# 2. <NAME>_ROOT environment variable +# 3. Kokkos_<NAME>_DIR variable +# 4. Locations in the PATHS option +# 5. Default system paths, if allowed. +# +# Default system paths are allowed if none of options (1)-(4) are specified +# or if default paths are specifically allowed via ALLOW_SYSTEM_PATH_FALLBACK +# +# Usage:: +# +# KOKKOS_FIND_HEADER( +# <VAR_NAME> +# <HEADER> +# <TPL_NAME> +# [ALLOW_SYSTEM_PATH_FALLBACK] +# [PATHS path1 [path2 ...]] +# ) +# +# ``<VAR_NAME>`` +# +# The variable to define with the success or failure of the find +# +# ``<HEADER>`` +# +# The name of the header to find +# +# ``<TPL_NAME>`` +# +# The name of the TPL the header corresponds to +# +# ``[ALLOW_SYSTEM_PATH_FALLBACK]`` +# +# If custom paths are given and the header is not found +# should we be allowed to search default system paths +# or error out if not found in given paths +# +# ``[PATHS path1 [path2 ...]]`` +# +# Custom paths to search for the header +# +MACRO(kokkos_find_header VAR_NAME HEADER TPL_NAME) + CMAKE_PARSE_ARGUMENTS(TPL + "ALLOW_SYSTEM_PATH_FALLBACK" + "" + "PATHS" + ${ARGN}) + + SET(${VAR_NAME} "${VAR_NAME}-NOTFOUND") + SET(HAVE_CUSTOM_PATHS FALSE) + + IF(DEFINED ${TPL_NAME}_ROOT OR + DEFINED ENV{${TPL_NAME}_ROOT} OR + DEFINED KOKKOS_${TPL_NAME}_DIR OR + TPL_PATHS) + FIND_PATH(${VAR_NAME} ${HEADER} + PATHS + ${${TPL_NAME}_ROOT} + $ENV{${TPL_NAME}_ROOT} + ${KOKKOS_${TPL_NAME}_DIR} + ${TPL_PATHS} + PATH_SUFFIXES include + NO_DEFAULT_PATH) + SET(HAVE_CUSTOM_PATHS TRUE) + ENDIF() + + IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + #No-op if ${VAR_NAME} set by previous call + FIND_PATH(${VAR_NAME} ${HEADER}) + ENDIF() + +ENDMACRO() + +# +# @MACRO: 
KOKKOS_FIND_LIBRARY +# +# Function that finds a particular library. This searches custom paths +# or default system paths depending on options. In contrast to CMake +# default, custom paths are prioritized over system paths. The search +# order is: +# 1. <NAME>_ROOT variable +# 2. <NAME>_ROOT environment variable +# 3. Kokkos_<NAME>_DIR variable +# 4. Locations in the PATHS option +# 5. Default system paths, if allowed. +# +# Default system paths are allowed if none of options (1)-(4) are specified +# or if default paths are specifically allowed via ALLOW_SYSTEM_PATH_FALLBACK +# +# Usage:: +# +# KOKKOS_FIND_LIBRARY( +# <VAR_NAME> +# <LIBRARY> +# <TPL_NAME> +# [ALLOW_SYSTEM_PATH_FALLBACK] +# [PATHS path1 [path2 ...]] +# [SUFFIXES suffix1 [suffix2 ...]] +# ) +# +# ``<VAR_NAME>`` +# +# The variable to define with the success or failure of the find +# +# ``<LIBRARY>`` +# +# The name of the library to find (NOT prefixed with -l) +# +# ``<TPL_NAME>`` +# +# The name of the TPL the library corresponds to +# +# ``ALLOW_SYSTEM_PATH_FALLBACK`` +# +# If custom paths are given and the library is not found +# should we be allowed to search default system paths +# or error out if not found in given paths +# +# ``PATHS`` +# +# Custom paths to search for the library +# +# ``SUFFIXES`` +# +# Suffixes appended to PATHS when attempting to locate +# the library. Defaults to {lib, lib64}. 
+# +MACRO(kokkos_find_library VAR_NAME LIB TPL_NAME) + CMAKE_PARSE_ARGUMENTS(TPL + "ALLOW_SYSTEM_PATH_FALLBACK" + "" + "PATHS;SUFFIXES" + ${ARGN}) + + IF(NOT TPL_SUFFIXES) + SET(TPL_SUFFIXES lib lib64) + ENDIF() + + SET(${VAR_NAME} "${VAR_NAME}-NOTFOUND") + SET(HAVE_CUSTOM_PATHS FALSE) + + IF(DEFINED ${TPL_NAME}_ROOT OR + DEFINED ENV{${TPL_NAME}_ROOT} OR + DEFINED KOKKOS_${TPL_NAME}_DIR OR + TPL_PATHS) + FIND_LIBRARY(${VAR_NAME} ${LIB} + PATHS + ${${TPL_NAME}_ROOT} + $ENV{${TPL_NAME}_ROOT} + ${KOKKOS_${TPL_NAME}_DIR} + ${TPL_PATHS} + PATH_SUFFIXES + ${TPL_SUFFIXES} + NO_DEFAULT_PATH) + SET(HAVE_CUSTOM_PATHS TRUE) + ENDIF() + + IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + #No-op if ${VAR_NAME} set by previous call + FIND_LIBRARY(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) + ENDIF() + +ENDMACRO() + +# +# @MACRO: KOKKOS_FIND_IMPORTED +# +# Function that finds all libraries and headers needed for the tpl +# and creates an imported target encapsulating all the flags and libraries +# +# Usage:: +# +# KOKKOS_FIND_IMPORTED( +# <NAME> +# INTERFACE +# ALLOW_SYSTEM_PATH_FALLBACK +# MODULE_NAME <name> +# IMPORTED_NAME <name> +# LIBRARY <name> +# LIBRARIES <name1> <name2> ... +# LIBRARY_PATHS <path1> <path2> ... +# LIBRARY_SUFFIXES <suffix1> <suffix2> ... +# HEADER <name> +# HEADERS <name1> <name2> ... +# HEADER_PATHS <path1> <path2> ... +# ) +# +# ``INTERFACE`` +# +# If specified, this TPL will build an INTERFACE library rather than an +# IMPORTED target +# +# ``ALLOW_SYSTEM_PATH_FALLBACK`` +# +# If custom paths are given and the library is not found +# should we be allowed to search default system paths +# or error out if not found in given paths. +# +# ``MODULE_NAME <name>`` +# +# If specified, the name of the enclosing module passed to +# FIND_PACKAGE(<MODULE_NAME>). Defaults to TPL${NAME} if not +# given. +# +# ``IMPORTED_NAME <name>`` +# +# If specified, this gives the name of the target to build. 
+# Defaults to Kokkos::<NAME> +# +# ``LIBRARY <name>`` +# +# If specified, this gives the name of the library to look for +# +# ``LIBRARIES <name1> <name2> ...`` +# +# If specified, this gives a list of libraries to find for the package +# +# ``LIBRARY_PATHS <path1> <path2> ...`` +# +# If specified, this gives a list of paths to search for the library. +# If not given, <NAME>_ROOT will be searched. +# +# ``LIBRARY_SUFFIXES <suffix1> <suffix2> ...`` +# +# Suffixes appended to LIBRARY_PATHS when attempting to locate +# libraries. If not given, defaults to {lib, lib64}. +# +# ``HEADER <name>`` +# +# If specified, this gives the name of a header to to look for +# +# ``HEADERS <name1> <name2> ...`` +# +# If specified, this gives a list of headers to find for the package +# +# ``HEADER_PATHS <path1> <path2> ...`` +# +# If specified, this gives a list of paths to search for the headers +# If not given, <NAME>_ROOT/include and <NAME>_ROOT/include will be searched. +# +MACRO(kokkos_find_imported NAME) + CMAKE_PARSE_ARGUMENTS(TPL + "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" + "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" + "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" + ${ARGN}) + + IF(NOT TPL_MODULE_NAME) + SET(TPL_MODULE_NAME TPL${NAME}) + ENDIF() + + IF (TPL_ALLOW_SYSTEM_PATH_FALLBACK) + SET(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) + ELSE() + SET(ALLOW_PATH_FALLBACK_OPT) + ENDIF() + + IF (NOT TPL_IMPORTED_NAME) + IF (TPL_INTERFACE) + SET(TPL_IMPORTED_NAME ${NAME}) + ELSE() + SET(TPL_IMPORTED_NAME Kokkos::${NAME}) + ENDIF() + ENDIF() + + IF (NOT TPL_LIBRARY_SUFFIXES) + SET(TPL_LIBRARY_SUFFIXES lib lib64) + ENDIF() + + SET(${NAME}_INCLUDE_DIRS) + IF (TPL_HEADER) + KOKKOS_FIND_HEADER(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + ENDIF() + + FOREACH(HEADER ${TPL_HEADERS}) + KOKKOS_FIND_HEADER(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + IF(HEADER_FIND_TEMP) 
+ LIST(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) + ENDIF() + ENDFOREACH() + + SET(${NAME}_LIBRARY) + IF(TPL_LIBRARY) + KOKKOS_FIND_LIBRARY(${NAME}_LIBRARY ${TPL_LIBRARY} ${NAME} + ${ALLOW_PATH_FALLBACK_OPT} + PATHS ${TPL_LIBRARY_PATHS} + SUFFIXES ${TPL_LIBRARY_SUFFIXES}) + ENDIF() + + SET(${NAME}_FOUND_LIBRARIES) + FOREACH(LIB ${TPL_LIBRARIES}) + KOKKOS_FIND_LIBRARY(${LIB}_LOCATION ${LIB} ${NAME} + ${ALLOW_PATH_FALLBACK_OPT} + PATHS ${TPL_LIBRARY_PATHS} + SUFFIXES ${TPL_LIBRARY_SUFFIXES}) + IF(${LIB}_LOCATION) + LIST(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + ELSE() + SET(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + BREAK() + ENDIF() + ENDFOREACH() + + INCLUDE(FindPackageHandleStandardArgs) + #Collect all the variables we need to be valid for + #find_package to have succeeded + SET(TPL_VARS_NEEDED) + IF (TPL_LIBRARY) + LIST(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) + ENDIF() + IF(TPL_HEADER) + LIST(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) + ENDIF() + IF(TPL_LIBRARIES) + LIST(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) + ENDIF() + FIND_PACKAGE_HANDLE_STANDARD_ARGS(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) + + MARK_AS_ADVANCED(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) + + #this is so much fun on a Cray system + #/usr/include should never be added as a -isystem include + #this freaks out the compiler include search order + IF (KOKKOS_IS_CRAYPE) + LIST(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") + ENDIF() + + IF (${TPL_MODULE_NAME}_FOUND) + SET(IMPORT_TYPE) + IF (TPL_INTERFACE) + SET(IMPORT_TYPE "INTERFACE") + ENDIF() + KOKKOS_CREATE_IMPORTED_TPL(${TPL_IMPORTED_NAME} + ${IMPORT_TYPE} + INCLUDES "${${NAME}_INCLUDE_DIRS}" + LIBRARY "${${NAME}_LIBRARY}" + LINK_LIBRARIES "${${NAME}_FOUND_LIBRARIES}") + ENDIF() +ENDMACRO(kokkos_find_imported) + +# +# @MACRO: KOKKOS_LINK_TPL() +# +# Function that checks if a third-party library (TPL) has been enabled and +# calls target_link_libraries on the given target +# +# 
Usage:: +# +# KOKKOS_LINK_TPL( +# <TARGET> +# PUBLIC +# PRIVATE +# INTERFACE +# IMPORTED_NAME <name> +# <TPL_NAME> +# +# Checks if Kokkos_ENABLE_<TPL_NAME>=ON and if so links the library +# +# ``PUBLIC/PRIVATE/INTERFACE`` +# +# Specifies the linkage mode. One of these arguments should be given. +# This will then invoke target_link_libraries(<TARGET> PUBLIC/PRIVATE/INTERFACE <TPL_NAME>) +# +# ``IMPORTED_NAME <name>`` +# +# If specified, this gives the exact name of the target to link against +# target_link_libraries(<TARGET> <IMPORTED_NAME>) +# +FUNCTION(kokkos_link_tpl TARGET) + CMAKE_PARSE_ARGUMENTS(TPL + "PUBLIC;PRIVATE;INTERFACE" + "IMPORTED_NAME" + "" + ${ARGN}) + #the name of the TPL + SET(TPL ${TPL_UNPARSED_ARGUMENTS}) + IF (KOKKOS_HAS_TRILINOS) + #Do nothing, they will have already been linked + ELSE() + IF (NOT TPL_IMPORTED_NAME) + SET(TPL_IMPORTED_NAME Kokkos::${TPL}) + ENDIF() + IF (KOKKOS_ENABLE_${TPL}) + IF (TPL_PUBLIC) + TARGET_LINK_LIBRARIES(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) + ELSEIF (TPL_PRIVATE) + TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) + ELSEIF (TPL_INTERFACE) + TARGET_LINK_LIBRARIES(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) + ELSE() + TARGET_LINK_LIBRARIES(${TARGET} ${TPL_IMPORTED_NAME}) + ENDIF() + ENDIF() + ENDIF() +ENDFUNCTION() + +FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) + SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIPCC Fujitsu) + CMAKE_PARSE_ARGUMENTS( + PARSE + "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" + "COMPILER_ID" + "${COMPILERS}" + ${ARGN}) + IF(PARSE_UNPARSED_ARGUMENTS) + MESSAGE(SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options") + ENDIF() + + IF(PARSE_COMPILER_ID) + SET(COMPILER ${${PARSE_COMPILER_ID}}) + ELSE() + SET(COMPILER ${KOKKOS_CXX_COMPILER_ID}) + ENDIF() + + SET(COMPILER_SPECIFIC_FLAGS_TMP) + FOREACH(COMP ${COMPILERS}) + IF (COMPILER STREQUAL "${COMP}") + IF (PARSE_${COMPILER}) 
+ IF (NOT "${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") + SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) + ENDIF() + ELSEIF(PARSE_DEFAULT) + SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) + ENDIF() + ENDIF() + ENDFOREACH() + + IF (PARSE_COMPILE_OPTIONS) + # The funky logic here is for future handling of argument deduplication + # If we naively pass multiple -Xcompiler flags to target_compile_options + # -Xcompiler will get deduplicated and break the build + IF ("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) + LIST(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") + GLOBAL_APPEND(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + ELSE() + GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + ENDIF() + ENDIF() + + IF (PARSE_LINK_OPTIONS) + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + ENDIF() + + IF (PARSE_COMPILE_DEFINITIONS) + GLOBAL_APPEND(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + ENDIF() + + IF (PARSE_LINK_LIBRARIES) + GLOBAL_APPEND(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) + ENDIF() +ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) + +FUNCTION(COMPILER_SPECIFIC_FLAGS) + COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) +ENDFUNCTION(COMPILER_SPECIFIC_FLAGS) + +FUNCTION(COMPILER_SPECIFIC_OPTIONS) + COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS) +ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS) + +FUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) + COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_OPTIONS) +ENDFUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) + +FUNCTION(COMPILER_SPECIFIC_DEFS) + COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_DEFINITIONS) +ENDFUNCTION(COMPILER_SPECIFIC_DEFS) + +FUNCTION(COMPILER_SPECIFIC_LIBS) + COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES) +ENDFUNCTION(COMPILER_SPECIFIC_LIBS) +# Given a list of the form +# key1;value1;key2;value2,... 
+# Create a list of all keys in a variable named ${KEY_LIST_NAME} +# and set the value for each key in a variable ${VAR_PREFIX}key1,... +# kokkos_key_value_map(ARCH ALL_ARCHES key1;value1;key2;value2) +# would produce a list variable ALL_ARCHES=key1;key2 +# and individual variables ARCHkey1=value1 and ARCHkey2=value2 +MACRO(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) + SET(PARSE_KEY ON) + SET(${KEY_LIST_NAME}) + FOREACH(ENTRY ${ARGN}) + IF(PARSE_KEY) + SET(CURRENT_KEY ${ENTRY}) + SET(PARSE_KEY OFF) + LIST(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) + ELSE() + SET(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) + SET(PARSE_KEY ON) + ENDIF() + ENDFOREACH() +ENDMACRO() + +FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) + KOKKOS_KEY_VALUE_MAP(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) + FOREACH(OPTION_SUFFIX ${DEPRECATED_LIST}) + SET(OPTION_NAME Kokkos_${OPTION_SUFFIX}) + SET(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) + IF(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off + MESSAGE(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. 
${OPTION_MESSAGE}") # was ${OPT_MESSAGE}: undefined variable, so the removal explanation was always empty
+    ENDIF()
+  ENDFOREACH()
+ENDFUNCTION()
+
+# this function checks whether the current CXX compiler supports building CUDA
+FUNCTION(kokkos_cxx_compiler_cuda_test _VAR)
+  # don't run this test every time
+  IF(DEFINED ${_VAR})
+    RETURN()
+  ENDIF()
+
+  FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp
+"
+#include <cuda.h>
+#include <cstdlib>
+
+__global__
+void kernel(int sz, double* data)
+{
+  auto _beg = blockIdx.x * blockDim.x + threadIdx.x;
+  for(int i = _beg; i < sz; ++i)
+    data[i] += static_cast<double>(i);
+}
+
+int main()
+{
+  double* data = nullptr;
+  int blocks = 64;
+  int grids = 64;
+  auto ret = cudaMalloc(&data, blocks * grids * sizeof(double));
+  if(ret != cudaSuccess)
+    return EXIT_FAILURE;
+  kernel<<<grids, blocks>>>(blocks * grids, data);
+  cudaDeviceSynchronize();
+  return EXIT_SUCCESS;
+}
+")
+
+  TRY_COMPILE(_RET
+    ${PROJECT_BINARY_DIR}/compile_tests
+    SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp)
+
+  SET(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA")
+ENDFUNCTION()
+
+# this function is provided to easily select which files use nvcc_wrapper:
+#
+# GLOBAL --> all files
+# TARGET --> all files in a target
+# SOURCE --> specific source files
+# DIRECTORY --> all files in directory
+# PROJECT --> all files/targets in a project/subproject
+#
+# NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in.
+# This version explicitly uses nvcc_wrapper.
+#
+FUNCTION(kokkos_compilation)
+  # check whether the compiler already supports building CUDA
+  KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA)
+  # if CUDA compile test has already been performed, just return
+  IF(Kokkos_CXX_COMPILER_COMPILES_CUDA)
+    RETURN()
+  ENDIF()
+
+  CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN})
+
+  # find kokkos_launch_compiler
+  FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER
+    NAMES kokkos_launch_compiler
+    HINTS ${PROJECT_SOURCE_DIR}
+    PATHS ${PROJECT_SOURCE_DIR}
+    PATH_SUFFIXES bin)
+
+  IF(NOT Kokkos_COMPILE_LAUNCHER)
+    MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'")
+  ENDIF()
+
+  # find nvcc_wrapper
+  FIND_PROGRAM(Kokkos_NVCC_WRAPPER
+    NAMES nvcc_wrapper
+    HINTS ${PROJECT_SOURCE_DIR}
+    PATHS ${PROJECT_SOURCE_DIR}
+    PATH_SUFFIXES bin)
+
+  IF(NOT Kokkos_NVCC_WRAPPER) # was Kokkos_COMPILE_LAUNCHER: a missing nvcc_wrapper went undiagnosed
+    MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_NVCC_WRAPPER=/path/to/nvcc_wrapper'")
+  ENDIF()
+
+  IF(COMP_GLOBAL)
+    # if global, don't bother setting others
+    SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}")
+    SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}")
+  ELSE()
+    FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE)
+      # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...)
+ IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + UNSET(COMP_${_TYPE}) + ENDIF() + # set the properties if defined + IF(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") + ENDIF() + ENDFOREACH() + ENDIF() +ENDFUNCTION() +## KOKKOS_CONFIG_HEADER - parse the data list which is a list of backend names +## and create output config header file...used for +## creating dynamic include files based on enabled backends +## +## SRC_FILE is input file +## TARGET_FILE output file +## HEADER_GUARD TEXT used with include header guard +## HEADER_PREFIX prefix used with include (i.e. fwd, decl, setup) +## DATA_LIST list of backends to include in generated file +FUNCTION(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) + SET(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") + CONFIGURE_FILE(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) + FOREACH( BACKEND_NAME ${DATA_LIST} ) + SET(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> +\@INCLUDE_NEXT_FILE\@") + CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) + ENDFOREACH() + SET(INCLUDE_NEXT_FILE "" ) + CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) +ENDFUNCTION() diff --git a/packages/kokkos/cmake/kokkos_install.cmake b/packages/kokkos/cmake/kokkos_install.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ff66d015fb421632bd59a7ccc47a5db3de86a3c7 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_install.cmake @@ -0,0 +1,48 @@ +INCLUDE(CMakePackageConfigHelpers) +IF (NOT 
KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) + INCLUDE(GNUInstallDirs) + + #Set all the variables needed for KokkosConfig.cmake + GET_PROPERTY(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + SET(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) + + INCLUDE(CMakePackageConfigHelpers) + CONFIGURE_PACKAGE_CONFIG_FILE( + cmake/KokkosConfig.cmake.in + "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + + CONFIGURE_PACKAGE_CONFIG_FILE( + cmake/KokkosConfigCommon.cmake.in + "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + + WRITE_BASIC_PACKAGE_VERSION_FILE("${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" + VERSION "${Kokkos_VERSION}" + COMPATIBILITY SameMajorVersion) + + # Install the KokkosConfig*.cmake files + install(FILES + "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" + "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) + install(EXPORT KokkosTargets NAMESPACE Kokkos:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) +ELSE() + CONFIGURE_FILE(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) + file(READ ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake KOKKOS_CONFIG_COMMON) + file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_CONFIG_COMMON}") + CONFIGURE_FILE(cmake/KokkosTrilinosConfig.cmake.in ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake @ONLY) + file(READ ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake KOKKOS_TRILINOS_CONFIG) + file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_TRILINOS_CONFIG}") + + WRITE_BASIC_PACKAGE_VERSION_FILE("${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" + VERSION "${Kokkos_VERSION}" + COMPATIBILITY SameMajorVersion) + + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake + DESTINATION 
"${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/${PACKAGE_NAME}") +ENDIF() + +INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) + diff --git a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake new file mode 100644 index 0000000000000000000000000000000000000000..015873ebd6320d78c6ab3e190d7666c1c7e84824 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -0,0 +1,44 @@ +# From CMake 3.10 documentation + +#This can run at any time +KOKKOS_OPTION(CXX_STANDARD "" STRING "The C++ standard for Kokkos to use: 14, 17, or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 14") + +# Set CXX standard flags +SET(KOKKOS_ENABLE_CXX14 OFF) +SET(KOKKOS_ENABLE_CXX17 OFF) +SET(KOKKOS_ENABLE_CXX20 OFF) +IF (KOKKOS_CXX_STANDARD) + IF (${KOKKOS_CXX_STANDARD} STREQUAL "c++98") + MESSAGE(FATAL_ERROR "Kokkos no longer supports C++98 - minimum C++14") + ELSEIF (${KOKKOS_CXX_STANDARD} STREQUAL "c++11") + MESSAGE(FATAL_ERROR "Kokkos no longer supports C++11 - minimum C++14") + ELSEIF(${KOKKOS_CXX_STANDARD} STREQUAL "c++14") + MESSAGE(WARNING "Deprecated Kokkos C++ standard set as 'c++14'. Use '14' instead.") + SET(KOKKOS_CXX_STANDARD "14") + ELSEIF(${KOKKOS_CXX_STANDARD} STREQUAL "c++17") + MESSAGE(WARNING "Deprecated Kokkos C++ standard set as 'c++17'. Use '17' instead.") + SET(KOKKOS_CXX_STANDARD "17") + ELSEIF(${KOKKOS_CXX_STANDARD} STREQUAL "c++1y") + MESSAGE(WARNING "Deprecated Kokkos C++ standard set as 'c++1y'. Use '1Y' instead.") + SET(KOKKOS_CXX_STANDARD "1Y") + ELSEIF(${KOKKOS_CXX_STANDARD} STREQUAL "c++1z") + MESSAGE(WARNING "Deprecated Kokkos C++ standard set as 'c++1z'. Use '1Z' instead.") + SET(KOKKOS_CXX_STANDARD "1Z") + ELSEIF(${KOKKOS_CXX_STANDARD} STREQUAL "c++2a") + MESSAGE(WARNING "Deprecated Kokkos C++ standard set as 'c++2a'. 
Use '2A' instead.") + SET(KOKKOS_CXX_STANDARD "2A") + ENDIF() +ENDIF() + +IF (NOT KOKKOS_CXX_STANDARD AND NOT CMAKE_CXX_STANDARD) + MESSAGE(STATUS "Setting default Kokkos CXX standard to 14") + SET(KOKKOS_CXX_STANDARD "14") +ELSEIF(NOT KOKKOS_CXX_STANDARD) + MESSAGE(STATUS "Setting default Kokkos CXX standard to ${CMAKE_CXX_STANDARD}") + SET(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) +ENDIF() + + + + + diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake new file mode 100644 index 0000000000000000000000000000000000000000..707fb000af528694780d6668f160a3fee3472a69 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -0,0 +1,174 @@ +KOKKOS_CFG_DEPENDS(CXX_STD COMPILER_ID) + +FUNCTION(kokkos_set_cxx_standard_feature standard) + SET(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) + SET(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) + SET(FEATURE_NAME cxx_std_${standard}) + #CMake's way of telling us that the standard (or extension) + #flags are supported is the extension/standard variables + IF (NOT DEFINED CMAKE_CXX_EXTENSIONS) + IF(KOKKOS_DONT_ALLOW_EXTENSIONS) + GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) + ELSE() + GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) + ENDIF() + ELSEIF(CMAKE_CXX_EXTENSIONS) + IF(KOKKOS_DONT_ALLOW_EXTENSIONS) + MESSAGE(FATAL_ERROR "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. 
Must set CMAKE_CXX_EXTENSIONS=OFF to continue") + ELSE() + GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) + ENDIF() + ELSE() + #For trilinos, we need to make sure downstream projects + GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) + ENDIF() + + IF (KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) + MESSAGE(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") + GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) + MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") + IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) + SET(SUPPORTED_NVCC_FLAGS "-std=c++14;-std=c++17") + IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) + MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help.") + ENDIF() + ENDIF() + GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + #MSVC doesn't need a command line flag, that doesn't mean it has no support + MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") + GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) + MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") + GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") + ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu")) + MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") + GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") + ELSE() + #nope, we can't do anything here + MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. 
Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command.") + GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") + ENDIF() + + IF((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) + MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported") + ENDIF() + ENDIF() +ENDFUNCTION() + + +IF (KOKKOS_CXX_STANDARD AND CMAKE_CXX_STANDARD) + #make sure these are consistent + IF (NOT KOKKOS_CXX_STANDARD STREQUAL CMAKE_CXX_STANDARD) + MESSAGE(WARNING "Specified both CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} and KOKKOS_CXX_STANDARD=${KOKKOS_CXX_STANDARD}, but they don't match") + SET(CMAKE_CXX_STANDARD ${KOKKOS_CXX_STANDARD} CACHE STRING "C++ standard" FORCE) + ENDIF() +ENDIF() + + +IF(KOKKOS_CXX_STANDARD STREQUAL "14") + kokkos_set_cxx_standard_feature(14) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Y") + SET(KOKKOS_ENABLE_CXX14 ON) +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "17") + kokkos_set_cxx_standard_feature(17) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") + SET(KOKKOS_ENABLE_CXX17 ON) +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "20") + kokkos_set_cxx_standard_feature(20) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") + SET(KOKKOS_ENABLE_CXX20 ON) +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "98" OR KOKKOS_CXX_STANDARD STREQUAL "11") + MESSAGE(FATAL_ERROR "Kokkos requires C++14 or newer!") +ELSE() + MESSAGE(FATAL_ERROR "Unknown C++ standard ${KOKKOS_CXX_STANDARD} - must be 14, 17, or 20") +ENDIF() + +# Enforce that we can compile a simple C++14 program + +TRY_COMPILE(CAN_COMPILE_CPP14 + ${KOKKOS_TOP_BUILD_DIR}/corner_cases + ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus14.cpp + OUTPUT_VARIABLE ERROR_MESSAGE + CXX_STANDARD 14 +) +if (NOT CAN_COMPILE_CPP14) + UNSET(CAN_COMPILE_CPP14 CACHE) #make sure 
CMake always re-runs this + MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++14 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") +ENDIF() +UNSET(CAN_COMPILE_CPP14 CACHE) #make sure CMake always re-runs this + + +# Enforce that extensions are turned off for nvcc_wrapper. +# For compiling CUDA code using nvcc_wrapper, we will use the host compiler's +# flags for turning on C++14. Since for compiler ID and versioning purposes +# CMake recognizes the host compiler when calling nvcc_wrapper, this just +# works. Both NVCC and nvcc_wrapper only recognize '-std=c++14' which means +# that we can only use host compilers for CUDA builds that use those flags. +# It also means that extensions (gnu++14) can't be turned on for CUDA builds. + +IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) + SET(CMAKE_CXX_EXTENSIONS OFF) + ELSEIF(CMAKE_CXX_EXTENSIONS) + MESSAGE(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") + ENDIF() +ENDIF() + +IF(KOKKOS_ENABLE_CUDA) + # ENFORCE that the compiler can compile CUDA code. + IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) + MESSAGE(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") + ENDIF() + IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) + SET(CMAKE_CXX_EXTENSIONS OFF) + ELSEIF(CMAKE_CXX_EXTENSIONS) + MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") + ENDIF() + ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. 
The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") + ENDIF() +ENDIF() + +IF (NOT KOKKOS_CXX_STANDARD_FEATURE) + #we need to pick the C++ flags ourselves + UNSET(CMAKE_CXX_STANDARD) + UNSET(CMAKE_CXX_STANDARD CACHE) + IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) + INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake) + kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) + ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) + INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) + kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) + ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + INCLUDE(${KOKKOS_SRC_PATH}/cmake/intel.cmake) + kokkos_set_intel_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) + ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) + INCLUDE(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) + kokkos_set_msvc_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) + ELSE() + INCLUDE(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) + kokkos_set_gnu_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) + ENDIF() + #check that the compiler accepts the C++ standard flag + INCLUDE(CheckCXXCompilerFlag) + IF (DEFINED CXX_STD_FLAGS_ACCEPTED) + UNSET(CXX_STD_FLAGS_ACCEPTED CACHE) + ENDIF() + CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) + IF (NOT CXX_STD_FLAGS_ACCEPTED) + CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) + IF (NOT CXX_INT_STD_FLAGS_ACCEPTED) + MESSAGE(FATAL_ERROR "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. 
You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}") + ENDIF() + SET(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) + ENDIF() + MESSAGE(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") +ENDIF() + + + + diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d8d044c9d75384a1d8d312a94708623c735d121f --- /dev/null +++ b/packages/kokkos/cmake/kokkos_tpls.cmake @@ -0,0 +1,90 @@ +KOKKOS_CFG_DEPENDS(TPLS OPTIONS) +KOKKOS_CFG_DEPENDS(TPLS DEVICES) +KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) + +FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) + CMAKE_PARSE_ARGUMENTS(PARSED + "" + "TRIBITS" + "" + ${ARGN}) + + IF (PARSED_TRIBITS) + #this is also a TPL option you can activate with Tribits + IF (NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") + #Tribits brought its own default that should take precedence + SET(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) + ENDIF() + ENDIF() + + KOKKOS_ENABLE_OPTION(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") + KOKKOS_OPTION(${PKG}_DIR "" PATH "Location of ${PKG} library") + SET(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) + SET(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) + + IF (KOKKOS_HAS_TRILINOS + AND KOKKOS_ENABLE_${PKG} + AND NOT PARSED_TRIBITS) + #this TPL was enabled, but it is not valid to use inside of TriBITS + MESSAGE(FATAL_ERROR "Enabled TPL ${PKG} inside TriBITS build, " + "but this can only be enabled in a standalone build") + ENDIF() +ENDFUNCTION() + +KOKKOS_TPL_OPTION(HWLOC Off) +KOKKOS_TPL_OPTION(LIBNUMA Off) +KOKKOS_TPL_OPTION(MEMKIND Off) +IF(KOKKOS_ENABLE_MEMKIND) + SET(KOKKOS_ENABLE_HBWSPACE ON) +ENDIF() +KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) +KOKKOS_TPL_OPTION(LIBRT Off) +IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + SET(ROCM_DEFAULT ON) 
+ELSE() + SET(ROCM_DEFAULT OFF) +ENDIF() +KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) + +IF (WIN32) + SET(LIBDL_DEFAULT Off) +ELSE() + SET(LIBDL_DEFAULT On) +ENDIF() +KOKKOS_TPL_OPTION(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) + +IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) +SET(HPX_DEFAULT ON) +ELSE() +SET(HPX_DEFAULT OFF) +ENDIF() +KOKKOS_TPL_OPTION(HPX ${HPX_DEFAULT}) + +IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_PTHREAD) +SET(PTHREAD_DEFAULT ON) +ELSE() +SET(PTHREAD_DEFAULT OFF) +ENDIF() +KOKKOS_TPL_OPTION(PTHREAD ${PTHREAD_DEFAULT} TRIBITS Pthread) + + +#Make sure we use our local FindKokkosCuda.cmake +KOKKOS_IMPORT_TPL(HPX INTERFACE) +KOKKOS_IMPORT_TPL(CUDA INTERFACE) +KOKKOS_IMPORT_TPL(HWLOC) +KOKKOS_IMPORT_TPL(LIBNUMA) +KOKKOS_IMPORT_TPL(LIBRT) +KOKKOS_IMPORT_TPL(LIBDL) +KOKKOS_IMPORT_TPL(MEMKIND) +KOKKOS_IMPORT_TPL(PTHREAD INTERFACE) +KOKKOS_IMPORT_TPL(ROCM INTERFACE) + +#Convert list to newlines (which CMake doesn't always like in cache variables) +STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") +#Convert to a regular variable +UNSET(KOKKOS_TPL_EXPORTS CACHE) +SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) +IF (KOKKOS_ENABLE_MEMKIND) + SET(KOKKOS_ENABLE_HBWSPACE) + LIST(APPEND KOKKOS_MEMSPACE_LIST HBWSpace) +ENDIF() diff --git a/packages/kokkos/cmake/kokkos_tribits.cmake b/packages/kokkos/cmake/kokkos_tribits.cmake new file mode 100644 index 0000000000000000000000000000000000000000..afa036066afeef954c5fed457782546565b7cfa5 --- /dev/null +++ b/packages/kokkos/cmake/kokkos_tribits.cmake @@ -0,0 +1,580 @@ +#These are tribits wrappers only ever called by Kokkos itself + +INCLUDE(CMakeParseArguments) +INCLUDE(CTest) +INCLUDE(GNUInstallDirs) + +MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") + +FUNCTION(VERIFY_EMPTY CONTEXT) + if(${ARGN}) + MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. 
Unhandled arguments in ${CONTEXT}:\n${ARGN}") + endif() +ENDFUNCTION() + +#Leave this here for now - but only do for tribits +#This breaks the standalone CMake +IF (KOKKOS_HAS_TRILINOS) + IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP) + SET(${PROJECT_NAME}_ENABLE_OpenMP OFF) + ENDIF() + + IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX) + SET(${PROJECT_NAME}_ENABLE_HPX OFF) + ENDIF() + + IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG) + SET(${PROJECT_NAME}_ENABLE_DEBUG OFF) + ENDIF() + + IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS) + SET(${PROJECT_NAME}_ENABLE_TESTS OFF) + ENDIF() + + IF(NOT DEFINED TPL_ENABLE_Pthread) + SET(TPL_ENABLE_Pthread OFF) + ENDIF() +ENDIF() + +MACRO(KOKKOS_SUBPACKAGE NAME) + if (KOKKOS_HAS_TRILINOS) + TRIBITS_SUBPACKAGE(${NAME}) + else() + SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME}) + SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME}) + STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC) + SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + #ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME}) + #GLOBAL_SET(${PACKAGE_NAME}_LIBS "") + endif() +ENDMACRO() + +MACRO(KOKKOS_SUBPACKAGE_POSTPROCESS) + if (KOKKOS_HAS_TRILINOS) + TRIBITS_SUBPACKAGE_POSTPROCESS() + endif() +ENDMACRO() + +MACRO(KOKKOS_PACKAGE_DECL) + + if (KOKKOS_HAS_TRILINOS) + TRIBITS_PACKAGE_DECL(Kokkos) + else() + SET(PACKAGE_NAME Kokkos) + SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC) + endif() + + #SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps") + #FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake") + #FOREACH(TPL_FILE ${TPLS_FILES}) + # TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE}) + #ENDFOREACH() + +ENDMACRO() + + +MACRO(KOKKOS_PROCESS_SUBPACKAGES) + if (KOKKOS_HAS_TRILINOS) + TRIBITS_PROCESS_SUBPACKAGES() + else() + ADD_SUBDIRECTORY(core) + ADD_SUBDIRECTORY(containers) + ADD_SUBDIRECTORY(algorithms) + ADD_SUBDIRECTORY(example) + endif() +ENDMACRO() + +MACRO(KOKKOS_PACKAGE_DEF) 
+ if (KOKKOS_HAS_TRILINOS) + TRIBITS_PACKAGE_DEF() + else() + #do nothing + endif() +ENDMACRO() + +MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) + KOKKOS_LIB_TYPE(${LIBRARY_NAME} INCTYPE) + TARGET_INCLUDE_DIRECTORIES(${LIBRARY_NAME} ${INCTYPE} $<INSTALL_INTERFACE:${KOKKOS_HEADER_DIR}>) + + INSTALL( + TARGETS ${LIBRARY_NAME} + EXPORT ${PROJECT_NAME} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + COMPONENT ${PACKAGE_NAME} + ) + + INSTALL( + TARGETS ${LIBRARY_NAME} + EXPORT KokkosTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + + VERIFY_EMPTY(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +ENDMACRO() + +FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) + if (KOKKOS_HAS_TRILINOS) + TRIBITS_ADD_EXECUTABLE(${ROOT_NAME} ${ARGN}) + else() + CMAKE_PARSE_ARGUMENTS(PARSE + "TESTONLY" + "" + "SOURCES;TESTONLYLIBS" + ${ARGN}) + + SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) + IF (PARSE_TESTONLYLIBS) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) + ENDIF() + VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) + #All executables must link to all the kokkos targets + #This is just private linkage because exe is final + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkos) + endif() +ENDFUNCTION() + +FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES;CATEGORIES;ARGS" + ${ARGN}) + VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + + IF (KOKKOS_HAS_TRILINOS) + IF(DEFINED PARSE_ARGS) + STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") + ENDIF() + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + TESTONLYLIBS kokkos_gtest + NUM_MPI_PROCS 1 + COMM serial mpi + ARGS ${PARSE_ARGS} + CATEGORIES ${PARSE_CATEGORIES} 
+      SOURCES ${PARSE_SOURCES}
+      FAIL_REGULAR_EXPRESSION " FAILED "
+      ARGS ${PARSE_ARGS}
+    )
+  ELSE()
+    KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME}
+      SOURCES ${PARSE_SOURCES}
+    )
+    IF (PARSE_ARGS)
+      SET(TEST_NUMBER 0)
+      FOREACH (ARG_STR ${PARSE_ARGS})
+        # This is passed as a single string blob to match TriBITS behavior
+        # We need this to be turned into a list
+        STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR})
+        SET(TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") # was LIST(APPEND ...): names accumulated across iterations, producing duplicate test names
+        MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1")
+        KOKKOS_ADD_TEST(NAME ${TEST_NAME}
+          EXE ${ROOT_NAME}
+          FAIL_REGULAR_EXPRESSION " FAILED "
+          ARGS ${ARG_STR_LIST}
+        )
+      ENDFOREACH()
+    ELSE()
+      KOKKOS_ADD_TEST(NAME ${ROOT_NAME}
+        EXE ${ROOT_NAME}
+        FAIL_REGULAR_EXPRESSION " FAILED "
+      )
+    ENDIF()
+  ENDIF()
+ENDFUNCTION()
+
+FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME)
+  SET(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME})
+  IF (NOT TARGET ${TARGET_NAME})
+    MESSAGE(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties")
+  ENDIF()
+  SET_PROPERTY(TARGET ${TARGET_NAME} PROPERTY ${ARGN})
+ENDFUNCTION()
+
+MACRO(KOKKOS_SETUP_BUILD_ENVIRONMENT)
+  # This is needed for both regular build and install tests
+  INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake)
+  #set an internal option, if not already set
+  SET(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation")
+  IF (Kokkos_INSTALL_TESTING)
+    SET(KOKKOS_ENABLE_TESTS ON)
+    SET(KOKKOS_ENABLE_EXAMPLES ON)
+    # This looks a little weird, but what we are doing
+    # is to NOT build Kokkos but instead look for an
+    # installed Kokkos - then build examples and tests
+    # against that installed Kokkos
+    FIND_PACKAGE(Kokkos REQUIRED)
+    # Just grab the configuration from the installation
+    FOREACH(DEV ${Kokkos_DEVICES})
+      SET(KOKKOS_ENABLE_${DEV} ON)
+    ENDFOREACH()
+    FOREACH(OPT ${Kokkos_OPTIONS})
+      SET(KOKKOS_ENABLE_${OPT} ON)
+    ENDFOREACH()
+    FOREACH(TPL ${Kokkos_TPLS})
+      SET(KOKKOS_ENABLE_${TPL} ON)
+    ENDFOREACH()
+    
FOREACH(ARCH ${Kokkos_ARCH}) + SET(KOKKOS_ARCH_${ARCH} ON) + ENDFOREACH() + ELSE() + INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) + INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) + INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) + INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) + IF (NOT KOKKOS_HAS_TRILINOS) + SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") + ENDIF() + INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) + INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) + ENDIF() +ENDMACRO() + +MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) + CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES" + ${ARGN}) + KOKKOS_ADD_EXECUTABLE(${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + ${PARSE_UNPARSED_ARGUMENTS} + TESTONLYLIBS kokkos_gtest + ) + SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) +ENDMACRO() + +MACRO(KOKKOS_PACKAGE_POSTPROCESS) + if (KOKKOS_HAS_TRILINOS) + TRIBITS_PACKAGE_POSTPROCESS() + endif() +ENDMACRO() + +## KOKKOS_CONFIGURE_CORE Configure/Generate header files for core content based +## on enabled backends. 
+## KOKKOS_FWD is the forward declare set
+## KOKKOS_SETUP is included in Kokkos_Macros.hpp and include prefix includes/defines
+## KOKKOS_DECLARE is the declaration set
+## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp
+macro(KOKKOS_CONFIGURE_CORE)
+  set(FWD_BACKEND_LIST)
+  foreach(MEMSPACE ${KOKKOS_MEMSPACE_LIST})
+    list(APPEND FWD_BACKEND_LIST ${MEMSPACE})
+  endforeach()
+  foreach(BACKEND_ ${KOKKOS_ENABLED_DEVICES})
+    # The PTHREAD device is spelled THREADS in the generated headers
+    if (${BACKEND_} STREQUAL "PTHREAD")
+      list(APPEND FWD_BACKEND_LIST THREADS)
+    else()
+      list(APPEND FWD_BACKEND_LIST ${BACKEND_})
+    endif()
+  endforeach()
+  message(STATUS "Kokkos Devices: ${KOKKOS_ENABLED_DEVICES}, Kokkos Backends: ${FWD_BACKEND_LIST}")
+  KOKKOS_CONFIG_HEADER(KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${FWD_BACKEND_LIST}")
+  KOKKOS_CONFIG_HEADER(KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}")
+  KOKKOS_CONFIG_HEADER(KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${FWD_BACKEND_LIST}")
+  KOKKOS_CONFIG_HEADER(KokkosCore_Config_HeaderSet.in KokkosCore_Config_PostInclude.hpp "KOKKOS_POST_INCLUDE" "Kokkos_Post_Include" "${KOKKOS_BACKEND_POST_INCLUDE_LIST}")
+  set(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace")
+  KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space")
+  KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space")
+  KOKKOS_OPTION(DEFAULT_DEVICE_EXECUTION_SPACE "" STRING "Override default device execution space")
+  KOKKOS_OPTION(DEFAULT_HOST_PARALLEL_EXECUTION_SPACE "" STRING "Override default host parallel execution space")
+  if (NOT Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE STREQUAL "")
+    set(_DEVICE_PARALLEL ${Kokkos_DEFAULT_DEVICE_EXECUTION_SPACE})
+    message(STATUS "Override default device execution space: ${_DEVICE_PARALLEL}")
+    set(KOKKOS_DEVICE_SPACE_ACTIVE ON)
+  else()
+    if (_DEVICE_PARALLEL STREQUAL "NoTypeDefined")
+      set(KOKKOS_DEVICE_SPACE_ACTIVE OFF)
+    else()
+      set(KOKKOS_DEVICE_SPACE_ACTIVE ON)
+    endif()
+  endif()
+  if (NOT Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE STREQUAL "")
+    set(_HOST_PARALLEL ${Kokkos_DEFAULT_HOST_PARALLEL_EXECUTION_SPACE})
+    message(STATUS "Override default host parallel execution space: ${_HOST_PARALLEL}")
+    set(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON)
+  else()
+    if (_HOST_PARALLEL STREQUAL "NoTypeDefined")
+      set(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE OFF)
+    else()
+      set(KOKKOS_HOSTPARALLEL_SPACE_ACTIVE ON)
+    endif()
+  endif()
+  # All inputs are now known: emit the main configuration header
+  configure_file(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY)
+endmacro()
+
+## KOKKOS_INSTALL_ADDITIONAL_FILES - instruct cmake to install files in target destination.
+## Includes generated header files, scripts such as nvcc_wrapper and hpcbind,
+## as well as other files provided through plugins.
+macro(KOKKOS_INSTALL_ADDITIONAL_FILES)
+  # kokkos_launch_compiler prefixes compiler commands so they forward to
+  # the original kokkos compiler; when nvcc_wrapper is not the
+  # CMAKE_CXX_COMPILER, bake the real compiler into the script
+  if (NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper")
+    set(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}")
+  else()
+    if (NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "")
+      set(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}")
+    endif()
+  endif()
+
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler
+                 ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler
+                 @ONLY)
+
+  install(PROGRAMS
+          "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper"
+          "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind"
+          "${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler"
+          "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler"
+          DESTINATION ${CMAKE_INSTALL_BINDIR})
+  install(FILES
+          "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h"
+          "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp"
+          "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp"
+          "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp"
+          "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp"
+          DESTINATION ${KOKKOS_HEADER_DIR})
+endmacro()
+
+# Attach Kokkos' compile/link options, definitions, and C++ standard
+# settings to LIBRARY_NAME with PUBLIC (transitive) visibility.
+function(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME)
+  cmake_parse_arguments(PARSE
+    "PLAIN_STYLE"
+    ""
+    ""
+    ${ARGN})
+
+  if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")
+    # CMake 3.18 can scope link options to CXX linkage via genex
+    target_link_options(
+      ${LIBRARY_NAME} PUBLIC
+      $<$<LINK_LANGUAGE:CXX>:${KOKKOS_LINK_OPTIONS}>
+    )
+  else()
+    # Older CMake: no LINK_LANGUAGE genex, just assume CXX linkage
+    target_link_options(
+      ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS}
+    )
+  endif()
+
+  target_compile_options(
+    ${LIBRARY_NAME} PUBLIC
+    $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_COMPILE_OPTIONS}>
+  )
+
+  target_compile_definitions(
+    ${LIBRARY_NAME} PUBLIC
+    $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_COMPILE_DEFINITIONS}>
+  )
+
+  target_link_libraries(
+    ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES}
+  )
+
+  if (KOKKOS_ENABLE_CUDA)
+    target_compile_options(
+      ${LIBRARY_NAME}
+      PUBLIC $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_CUDA_OPTIONS}>
+    )
+    # Prefix each cudafe option with -Xcudafe so CMake's option
+    # de-duplication cannot merge repeated flags
+    set(NODEDUP_CUDAFE_OPTIONS)
+    foreach(OPT ${KOKKOS_CUDAFE_OPTIONS})
+      list(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT})
+    endforeach()
+    target_compile_options(
+      ${LIBRARY_NAME}
+      PUBLIC $<$<COMPILE_LANGUAGE:CXX>:${NODEDUP_CUDAFE_OPTIONS}>
+    )
+  endif()
+
+  if (KOKKOS_ENABLE_HIP)
+    target_compile_options(
+      ${LIBRARY_NAME}
+      PUBLIC $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_AMDGPU_OPTIONS}>
+    )
+  endif()
+
+  list(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH)
+  if (XOPT_LENGTH GREATER 1)
+    message(FATAL_ERROR "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12")
+  endif()
+
if (KOKKOS_XCOMPILER_OPTIONS)
+    # Interleave -Xcompiler before every option so CMake's flag
+    # de-duplication cannot collapse repeats (pre-3.12 has no SHELL:)
+    set(NODEDUP_XCOMPILER_OPTIONS)
+    foreach(OPT ${KOKKOS_XCOMPILER_OPTIONS})
+      list(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler)
+      list(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT})
+    endforeach()
+    target_compile_options(
+      ${LIBRARY_NAME}
+      PUBLIC $<$<COMPILE_LANGUAGE:CXX>:${NODEDUP_XCOMPILER_OPTIONS}>
+    )
+  endif()
+
+  if (KOKKOS_CXX_STANDARD_FEATURE)
+    # Preferred path: express the standard via compile features
+    target_compile_features(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE})
+    if (NOT KOKKOS_USE_CXX_EXTENSIONS)
+      set_target_properties(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF)
+    endif()
+  else()
+    # Fallback: pass the raw standard flag
+    target_compile_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG})
+  endif()
+endfunction()
+
+# Create a (possibly STATIC/SHARED) library, install it, and add the
+# in-tree Kokkos:: alias matching the exported name.
+function(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME)
+  cmake_parse_arguments(PARSE
+    "STATIC;SHARED"
+    ""
+    "HEADERS;SOURCES"
+    ${ARGN})
+
+  if (PARSE_HEADERS)
+    list(REMOVE_DUPLICATES PARSE_HEADERS)
+  endif()
+  if (PARSE_SOURCES)
+    list(REMOVE_DUPLICATES PARSE_SOURCES)
+  endif()
+
+  if (PARSE_STATIC)
+    set(LINK_TYPE STATIC)
+  endif()
+  if (PARSE_SHARED)
+    set(LINK_TYPE SHARED)
+  endif()
+
+  # Headers are listed as sources too: MSVC and other platforms get
+  # better dependency detection that way
+  add_library(
+    ${LIBRARY_NAME}
+    ${LINK_TYPE}
+    ${PARSE_HEADERS}
+    ${PARSE_SOURCES}
+  )
+
+  if (PARSE_SHARED OR BUILD_SHARED_LIBS)
+    set_target_properties(${LIBRARY_NAME} PROPERTIES
+      VERSION ${Kokkos_VERSION}
+      SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}
+    )
+  endif()
+
+  KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME})
+
+  # In-tree alias so the installed Kokkos:: name also works here
+  add_library(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME})
+endfunction()
+
+function(KOKKOS_ADD_LIBRARY LIBRARY_NAME)
+  cmake_parse_arguments(PARSE
+    "ADD_BUILD_OPTIONS"
+    ""
+    "HEADERS"
+    ${ARGN}
+  )
+  if (KOKKOS_HAS_TRILINOS)
+    # Headers are NOT passed to Trilinos: TriBITS would install them
+    # flat into the default include folder, while we install them
+    # preserving directory structure (e.g. impl). Installing headers in
+    # both places breaks some downstream packages.
+    TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS})
+    # Stolen from Tribits - it can add prefixes
+    set(TRIBITS_LIBRARY_NAME_PREFIX "${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}")
+    set(TRIBITS_LIBRARY_NAME ${TRIBITS_LIBRARY_NAME_PREFIX}${LIBRARY_NAME})
+    # TriBITS has too much technical debt and baggage to even allow
+    # PUBLIC target_compile_options to be used: it forces C++ flags on
+    # dependent projects as one giant space-separated string, so those
+    # flags would duplicate Kokkos' public INTERFACE_COMPILE_OPTIONS and
+    # never be de-duplicated. Therefore do not set any transitive
+    # properties and keep everything working as before.
+    #KOKKOS_SET_LIBRARY_PROPERTIES(${TRIBITS_LIBRARY_NAME} PLAIN_STYLE)
+  else()
+    # Forward all headers so they appear correctly in IDE projects
+    KOKKOS_INTERNAL_ADD_LIBRARY(
+      ${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS})
+    if (PARSE_ADD_BUILD_OPTIONS)
+      KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME})
+    endif()
+  endif()
+endfunction()
+
+function(KOKKOS_ADD_INTERFACE_LIBRARY NAME)
+  if (KOKKOS_HAS_TRILINOS)
+    TRIBITS_ADD_LIBRARY(${NAME} ${ARGN})
+  else()
+    cmake_parse_arguments(PARSE
+      ""
+      ""
+      "HEADERS;SOURCES"
+      ${ARGN}
+    )
+    add_library(${NAME} INTERFACE)
+    KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME})
+  endif()
+endfunction()
+
+function(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET)
+  if (KOKKOS_HAS_TRILINOS)
+    # TriBITS ignores the target and applies directories globally
+    TRIBITS_INCLUDE_DIRECTORIES(${ARGN})
+  else()
+    # Record each dir on the target for the build tree
+    KOKKOS_LIB_TYPE(${TARGET} INCTYPE)
+    foreach(DIR ${ARGN})
+      target_include_directories(${TARGET} ${INCTYPE} $<BUILD_INTERFACE:${DIR}>)
+    endforeach()
+  endif()
+endfunction()
+
+function(KOKKOS_LIB_COMPILE_OPTIONS TARGET)
+  if (KOKKOS_HAS_TRILINOS)
+    # Apply the options ourselves rather than trusting TriBITS
+    KOKKOS_TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN})
+  else()
+    KOKKOS_LIB_TYPE(${TARGET} INCTYPE)
+    KOKKOS_TARGET_COMPILE_OPTIONS(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN})
+  endif()
+endfunction()
+
+macro(KOKKOS_ADD_TEST_DIRECTORIES)
+  if (KOKKOS_HAS_TRILINOS)
+    TRIBITS_ADD_TEST_DIRECTORIES(${ARGN})
+  else()
+    if (KOKKOS_ENABLE_TESTS)
+      foreach(TEST_DIR ${ARGN})
+        add_subdirectory(${TEST_DIR})
+      endforeach()
+    endif()
+  endif()
+endmacro()
+
+macro(KOKKOS_ADD_EXAMPLE_DIRECTORIES)
+  if (KOKKOS_HAS_TRILINOS)
+    TRIBITS_ADD_EXAMPLE_DIRECTORIES(${ARGN})
+  else()
+    if (KOKKOS_ENABLE_EXAMPLES)
+      foreach(EXAMPLE_DIR ${ARGN})
+        add_subdirectory(${EXAMPLE_DIR})
+      endforeach()
+    endif()
+  endif()
+endmacro()
diff --git a/packages/kokkos/cmake/msvc.cmake b/packages/kokkos/cmake/msvc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..85421bdbaaa46dd5d671f4e86b50d52b25d98d30
--- /dev/null
+++ b/packages/kokkos/cmake/msvc.cmake
@@ -0,0 +1,11 @@
+
+FUNCTION(kokkos_set_msvc_flags full_standard int_standard)
+  IF (CMAKE_CXX_EXTENSIONS)
+    SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE)
+    SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE)
+  ELSE()
+    SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE)
+    SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE)
+  ENDIF()
+ENDFUNCTION()
+
diff --git a/packages/kokkos/cmake/pgi.cmake b/packages/kokkos/cmake/pgi.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e98e84955888496225e9268c7db47ef514f08a48
--- /dev/null
+++ b/packages/kokkos/cmake/pgi.cmake
@@ -0,0 +1,8 @@
+
+# Translate the requested C++ standards (e.g. "17") into PGI's
+# --c++NN style flags, returned in the caller's scope.
+function(kokkos_set_pgi_flags full_standard int_standard)
+  STRING(TOLOWER ${full_standard} FULL_LC_STANDARD)
+  STRING(TOLOWER ${int_standard} INT_LC_STANDARD)
+  SET(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE)
+  # BUGFIX: was misspelled KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG; the
+  # consumers (see kokkos_set_msvc_flags) read
+  # KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG, so the intermediate-standard
+  # flag was silently written to a variable nothing reads.
+  SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE)
+endfunction()
+
diff --git a/packages/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake b/packages/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1ae4f19dd4036df21d22c758a7e535f432734415
--- /dev/null
+++ b/packages/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake
@@ -0,0 +1,52 @@
+# @HEADER
+# ************************************************************************
+#
+# Kokkos v. 3.0
+# Copyright (2020) National Technology & Engineering
+# Solutions of Sandia, LLC (NTESS).
+#
+# Under the terms of Contract DE-NA0003525 with NTESS,
+# the U.S. Government retains certain rights in this software.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + +# Check for CUDA support + +IF (NOT TPL_ENABLE_CUDA) + MESSAGE(FATAL_ERROR "\nCUSPARSE requires CUDA") +ELSE() + GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS) + GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS}) + GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY}) +ENDIF() + diff --git a/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake new file mode 100644 index 0000000000000000000000000000000000000000..467635083f2bdf83db722d51e4eea1ead1b604b4 --- /dev/null +++ b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake @@ -0,0 +1,58 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + + +#----------------------------------------------------------------------------- +# Hardware locality detection and control library. +# +# Acquisition information: +# Date checked: November 2011 +# Checked by: H. 
Carter Edwards <hcedwar AT sandia.gov> +# Source: http://www.open-mpi.org/projects/hwloc/ +# Version: 1.3 +# + +KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC + REQUIRED_HEADERS hwloc.h + REQUIRED_LIBS_NAMES "hwloc" + ) + diff --git a/packages/kokkos/cmake/tpls/FindTPLPthread.cmake b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c78630b7f19108da4c730c24b7151dfec57204b7 --- /dev/null +++ b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake @@ -0,0 +1,69 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + + +SET(USE_THREADS FALSE) + +IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) + # Use CMake's Thread finder since it is a bit smarter in determining + # whether pthreads is already built into the compiler and doesn't need + # a library to link. + FIND_PACKAGE(Threads) + #If Threads found a copy of pthreads make sure it is one of the cases the tribits + #tpl system cannot handle. + IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + SET(USE_THREADS TRUE) + ENDIF() + ENDIF() +ENDIF() + +IF(USE_THREADS) + SET(TPL_Pthread_INCLUDE_DIRS "") + SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + SET(TPL_Pthread_LIBRARY_DIRS "") +ELSE() + KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread + REQUIRED_HEADERS pthread.h + REQUIRED_LIBS_NAMES pthread + ) +ENDIF() diff --git a/packages/kokkos/config/test_all_sandia b/packages/kokkos/config/test_all_sandia new file mode 100755 index 0000000000000000000000000000000000000000..193a162a4e6e385db674d7b3410fe39f81d4e648 --- /dev/null +++ b/packages/kokkos/config/test_all_sandia @@ -0,0 +1,773 @@ +#!/bin/bash -e + +# +# Global config +# + +set -o pipefail + +# Determine current machine. 
+ +MACHINE="" +HOSTNAME=$(hostname) +PROCESSOR=`uname -p` + +if [[ "$HOSTNAME" =~ (white|ride).* ]]; then + MACHINE=white + module load git +fi + +if [[ "$HOSTNAME" =~ .*bowman.* ]]; then + MACHINE=bowman + module load git +fi + +if [[ "$HOSTNAME" == n* ]]; then # Warning: very generic name + if [[ "$PROCESSOR" = "aarch64" ]]; then + MACHINE=sullivan + module load git + fi +fi + +if [[ "$HOSTNAME" == node* ]]; then # Warning: very generic name + if [[ "$MACHINE" = "" ]]; then + MACHINE=shepard + module load git + fi +fi + +if [[ "$HOSTNAME" == apollo\.* ]]; then + MACHINE=apollo + module load git +fi + +if [[ "$HOSTNAME" == sullivan ]]; then + MACHINE=sullivan + module load git +fi + +if [[ "$HOSTNAME" == mayer\.* ]]; then + MACHINE=mayer +# module load git +fi +if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name + MACHINE=mayer +fi + +if [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then + if [[ "$MACHINE" = "" ]]; then + MACHINE=sems + module load sems-git + fi +fi + +if [[ "$MACHINE" = "" ]]; then + echo "Unrecognized machine" >&2 + exit 1 +fi + +echo "Running on machine: $MACHINE" + +GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" +IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" +ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" +INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" +CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" +CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" +CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" + +GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" +IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" 
+INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +#CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" +PGI_WARNING_FLAGS="" + +# Default. Machine specific can override. +DEBUG=False +ARGS="" +CUSTOM_BUILD_LIST="" +DRYRUN=False +BUILD_ONLY=False +declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1 +TEST_SCRIPT=False +SKIP_HWLOC=False +SPOT_CHECK=False + +PRINT_HELP=False +OPT_FLAG="" +CXX_FLAGS_EXTRA="" +LD_FLAGS_EXTRA="" +KOKKOS_OPTIONS="" + +# +# Handle arguments. +# + +while [[ $# > 0 ]] +do + key="$1" + + case $key in + --kokkos-path*) + KOKKOS_PATH="${key#*=}" + ;; + --build-list*) + CUSTOM_BUILD_LIST="${key#*=}" + ;; + --debug*) + DEBUG=True + ;; + --build-only*) + BUILD_ONLY=True + ;; + --test-script*) + TEST_SCRIPT=True + ;; + --skip-hwloc*) + SKIP_HWLOC=True + ;; + --num*) + NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" + ;; + --dry-run*) + DRYRUN=True + ;; + --spot-check*) + SPOT_CHECK=True + ;; + --arch*) + ARCH_FLAG="--arch=${key#*=}" + ;; + --opt-flag*) + OPT_FLAG="${key#*=}" + ;; + --with-cuda-options*) + KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" + ;; + --with-options*) + KOKKOS_OPTIONS="--with-options=enable_large_mem_tests,${key#*=}" + ;; + --cxxflags-extra*) + CXX_FLAGS_EXTRA="${key#*=}" + ;; + --ldflags-extra*) + LD_FLAGS_EXTRA="${key#*=}" + ;; + --help*) + PRINT_HELP=True + ;; + *) + # args, just append + ARGS="$ARGS $1" + ;; + esac + + shift +done + +SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd ) + +# Set kokkos path. +if [ -z "$KOKKOS_PATH" ]; then + KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT +else + # Ensure KOKKOS_PATH is abs path. + KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) +fi + +UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null` +if ! [ -z "$UNCOMMITTED" ]; then + echo "WARNING!! 
THE FOLLOWING CHANGES ARE UNCOMMITTED!! :" + echo "$UNCOMMITTED" + echo "" +fi + +GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline` +echo "Repository Status: " ${GITSTATUS} +echo "" +echo "" + +# +# Machine specific config. +# + +if [ "$MACHINE" = "sems" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + + BASE_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" + CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="" + fi + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ 
$CLANG_WARNING_FLAGS" + "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi +elif [ "$MACHINE" = "white" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" + IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>" + CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/6.4.0,ibm/xl/16.1.0" + + # Don't do pthread on white. + GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=Power8,Kepler37" + fi + + NUM_JOBS_TO_RUN_IN_PARALLEL=1 + +elif [ "$MACHINE" = "bowman" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" + + OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.128 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ) + + 
if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=KNL" + fi + + NUM_JOBS_TO_RUN_IN_PARALLEL=1 + +elif [ "$MACHINE" = "sullivan" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=96 + + BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.1.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS") + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=ARMv8-ThunderX" + fi + + NUM_JOBS_TO_RUN_IN_PARALLEL=1 + +elif [ "$MACHINE" = "mayer" ]; then + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=96 + + BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" + ARM_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "arm/1.4.0 $ARM_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS") + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=ARMv8-TX2" + fi + + NUM_JOBS_TO_RUN_IN_PARALLEL=1 + +elif [ "$MACHINE" = "shepard" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" + BASE_MODULE_LIST_INTEL="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "pgi/17.10.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" + ) + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=HSW" + fi + NUM_JOBS_TO_RUN_IN_PARALLEL=1 + +elif [ "$MACHINE" = "apollo" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + module use /home/projects/modulefiles/local/x86-64 + module load kokkos-env + + module load sems-git + module load 
sems-tex + module load sems-cmake/3.5.2 + module load sems-gdb + + SKIP_HWLOC=True + + BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base" + CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" + CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" + + CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.0.69" + NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0" + + BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" + BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" + BUILD_LIST_CLANG="Serial,Pthread,OpenMP" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" + "cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" + "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 
$BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=SNB,Volta70" + fi + + NUM_JOBS_TO_RUN_IN_PARALLEL=1 + +else + echo "Unhandled machine $MACHINE" >&2 + exit 1 +fi + +export OMP_NUM_THREADS=4 + +declare -i NUM_RESULTS_TO_KEEP=7 + +RESULT_ROOT_PREFIX=TestAll + +if [ "$PRINT_HELP" = "True" ]; then + echo "test_all_sandia <ARGS> <OPTIONS>:" + echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" + echo " Defaults to root repo containing this script" + echo "--debug: Run tests in debug. Defaults to False" + echo "--test-script: Test this script, not Kokkos" + echo "--skip-hwloc: Do not do hwloc tests" + echo "--num=N: Number of jobs to run in parallel" + echo "--spot-check: Minimal test set to issue pull request" + echo "--dry-run: Just print what would be executed" + echo "--build-only: Just do builds, don't run anything" + echo "--opt-flag=FLAG: Optimization flag (default: -O3)" + echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS" + echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS" + echo "--arch=ARCHITECTURE: overwrite architecture flags" + echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" + echo "--build-list=BUILD,BUILD,BUILD..." 
+ echo " Provide a comma-separated list of builds instead of running all builds" + echo " Valid items:" + echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" + echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" + echo "" + + echo "ARGS: list of expressions matching compilers to test" + echo " supported compilers sems" + for COMPILER_DATA in "${COMPILERS[@]}"; do + ARR=($COMPILER_DATA) + COMPILER=${ARR[0]} + echo " $COMPILER" + done + echo "" + + echo "Examples:" + echo " Run all tests" + echo " % test_all_sandia" + echo "" + echo " Run all gcc tests" + echo " % test_all_sandia gcc" + echo "" + echo " Run all gcc/4.8.4 and all intel tests" + echo " % test_all_sandia gcc/4.8.4 intel" + echo "" + echo " Run all tests in debug" + echo " % test_all_sandia --debug" + echo "" + echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" + echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" + echo "" + echo "If you want to kill the tests, do:" + echo " hit ctrl-z" + echo " % kill -9 %1" + echo + exit 0 +fi + +# Set build type. +if [ "$DEBUG" = "True" ]; then + BUILD_TYPE=debug +else + BUILD_TYPE=release +fi + +# If no args provided, do all compilers. +if [ -z "$ARGS" ]; then + ARGS='?' +fi + +# Process args to figure out which compilers to test. +COMPILERS_TO_TEST="" + +for ARG in $ARGS; do + for COMPILER_DATA in "${COMPILERS[@]}"; do + ARR=($COMPILER_DATA) + COMPILER=${ARR[0]} + + if [[ "$COMPILER" = $ARG* ]]; then + if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then + COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER" + else + echo "Tried to add $COMPILER twice" + fi + fi + done +done + +# +# Functions. +# + +# get_compiler_name <COMPILER> +get_compiler_name() { + echo $1 | cut -d/ -f1 +} + +# get_compiler_version <COMPILER> +get_compiler_version() { + echo $1 | cut -d/ -f2 +} + +# Do not call directly. 
+get_compiler_data() { + local compiler=$1 + local item=$2 + local compiler_name=$(get_compiler_name $compiler) + local compiler_vers=$(get_compiler_version $compiler) + + local compiler_data + for compiler_data in "${COMPILERS[@]}" ; do + local arr=($compiler_data) + + if [ "$compiler" = "${arr[0]}" ]; then + echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g" + return 0 + fi + done + + # Not found. + echo "Unreconized compiler $compiler" >&2 + exit 1 +} + +# +# For all getters, usage: <GETTER> <COMPILER> +# + +get_compiler_modules() { + get_compiler_data $1 1 +} + +get_compiler_build_list() { + get_compiler_data $1 2 +} + +get_compiler_exe_name() { + get_compiler_data $1 3 +} + +get_compiler_warning_flags() { + get_compiler_data $1 4 +} + +run_cmd() { + echo "RUNNING: $*" + if [ "$DRYRUN" != "True" ]; then + eval "$* 2>&1" + fi +} + +# report_and_log_test_results <SUCCESS> <DESC> <COMMENT> +report_and_log_test_result() { + # Use sane var names. + local success=$1; local desc=$2; local comment=$3; + + if [ "$success" = "0" ]; then + echo " PASSED $desc" + echo $comment > $PASSED_DIR/$desc + else + # For failures, comment should be the name of the phase that failed. + echo " FAILED $desc" >&2 + echo $comment > $FAILED_DIR/$desc + cat ${desc}.${comment}.log + fi +} + +setup_env() { + local compiler=$1 + local compiler_modules=$(get_compiler_modules $compiler) + + module purge + + local mod + for mod in $compiler_modules; do + echo "Loading module $mod" + module load $mod 2>&1 + # It is ridiculously hard to check for the success of a loaded + # module. Module does not return error codes and piping to grep + # causes module to run in a subshell. + module list 2>&1 | grep "$mod" >& /dev/null || return 1 + done + + return 0 +} + +# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE> +single_build_and_test() { + # Use sane var names. 
  local compiler=$1; local build=$2; local build_type=$3;

  # Set up env.
  # Each (compiler, build, build_type) combination gets its own working dir.
  mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
  cd $ROOT_DIR/$compiler/"${build}-$build_type"
  # desc is also used as the marker-file name, so '/' must become '-'.
  local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
  setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }

  # Set up flags.
  local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
  local compiler_exe=$(get_compiler_exe_name $compiler)

  # hwloc builds derive the install prefix from the hwloc-info binary on PATH.
  if [[ "$build_type" = hwloc* ]]; then
    local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
  fi

  if [[ "$OPT_FLAG" = "" ]]; then
    OPT_FLAG="-O3"
  fi

  if [[ "$build_type" = *debug* ]]; then
    local extra_args="$extra_args --debug"
    local cxxflags="-g $compiler_warning_flags"
    local ldflags="-g"
  else
    local cxxflags="$OPT_FLAG $compiler_warning_flags"
    local ldflags="${OPT_FLAG}"
  fi

  local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}"
  local ldflags="${ldflags} ${LD_FLAGS_EXTRA}"

  if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
    local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
  fi
  if [[ "$KOKKOS_OPTIONS" != "" ]]; then
    local extra_args="$extra_args $KOKKOS_OPTIONS"
  else
    # Default option set when the caller did not supply KOKKOS_OPTIONS.
    local extra_args="$extra_args --with-options=enable_large_mem_tests"
  fi

  echo " Starting job $desc"

  local comment="no_comment"

  if [ "$TEST_SCRIPT" = "True" ]; then
    # Self-test mode: sleep a random 1..10 s and fail ~half the time to
    # exercise the reporting machinery without building Kokkos.
    local rand=$[ 1 + $[ RANDOM % 10 ]]
    sleep $rand

    if [ $rand -gt 5 ]; then
      run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
    fi
  else
    # configure -> build -> (optionally) test; each phase logs to its own
    # file and short-circuits into a FAILED report on error.
    run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
    local -i build_start_time=$(date +%s)
    run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
    local -i build_end_time=$(date +%s)
    comment="build_time=$(($build_end_time-$build_start_time))"

    if [[ "$BUILD_ONLY" == False ]]; then
      run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
      local -i run_end_time=$(date +%s)
      comment="$comment run_time=$(($run_end_time-$build_end_time))"
    fi
  fi

  report_and_log_test_result 0 $desc "$comment"

  return 0
}

# wait_for_jobs <NUM-JOBS>
# Block until fewer than NUM-JOBS background jobs remain.  The extra `jobs`
# call reaps finished jobs so the count goes down.
wait_for_jobs() {
  local -i max_jobs=$1
  local -i num_active_jobs=$(jobs | wc -l)
  while [ $num_active_jobs -ge $max_jobs ]
  do
    sleep 1
    num_active_jobs=$(jobs | wc -l)
    jobs >& /dev/null
  done
}

# run_in_background <COMPILER> <BUILD> <BUILD_TYPE>
# Launch one build/test as a background job, throttled to
# NUM_JOBS_TO_RUN_IN_PARALLEL (forced to 1 for cuda*/clang compilers).
run_in_background() {
  local compiler=$1

  local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
  # Don't override command line input.
  # if [[ "$BUILD_ONLY" == True ]]; then
  # num_jobs=8
  # else
  if [[ "$compiler" == cuda* ]]; then
    num_jobs=1
  fi
  if [[ "$compiler" == clang ]]; then
    num_jobs=1
  fi
  # fi
  wait_for_jobs $num_jobs

  single_build_and_test $* &
}

# build_and_test_all <COMPILER>
# Run every build in the compiler's build list (or CUSTOM_BUILD_LIST),
# plus a hwloc variant for non-cuda compilers unless SKIP_HWLOC is set.
build_and_test_all() {
  # Get compiler data.
  local compiler=$1
  if [ -z "$CUSTOM_BUILD_LIST" ]; then
    local compiler_build_list=$(get_compiler_build_list $compiler)
  else
    local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
  fi

  # Do builds.
  local build
  for build in $compiler_build_list
  do
    run_in_background $compiler $build $BUILD_TYPE

    # If not cuda, do a hwloc test too.
    if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
      run_in_background $compiler $build "hwloc-$BUILD_TYPE"
    fi
  done

  return 0
}

# Print a fresh timestamped results dir, pruning old ones so that at most
# NUM_RESULTS_TO_KEEP remain.  NOTE: `local -i` makes the `a-b` assignment
# below arithmetic rather than a string concatenation.
get_test_root_dir() {
  local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
  local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
  local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP}

  if [ $num_to_delete -gt 0 ]; then
    /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete)
  fi

  echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S")
}

# Wait for all background jobs, print pass/fail summaries from the marker
# dirs, and exit with the number of failed tests as the status code.
wait_summarize_and_exit() {
  wait_for_jobs 1

  echo "#######################################################"
  echo "PASSED TESTS"
  echo "#######################################################"

  local passed_test
  # \ls bypasses any ls alias.
  for passed_test in $(\ls -1 $PASSED_DIR | sort)
  do
    echo $passed_test $(cat $PASSED_DIR/$passed_test)
  done

  local -i rv=0
  if [ "$(ls -A $FAILED_DIR)" ]; then
    echo "#######################################################"
    echo "FAILED TESTS"
    echo "#######################################################"

    local failed_test
    for failed_test in $(\ls -1 $FAILED_DIR | sort)
    do
      echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
      # Arithmetic because rv was declared with -i.
      rv=$rv+1
    done
  fi

  exit $rv
}

#
# Main.
+# + +ROOT_DIR=$(get_test_root_dir) +mkdir -p $ROOT_DIR +cd $ROOT_DIR + +PASSED_DIR=$ROOT_DIR/results/passed +FAILED_DIR=$ROOT_DIR/results/failed +mkdir -p $PASSED_DIR +mkdir -p $FAILED_DIR + +echo "Going to test compilers: " $COMPILERS_TO_TEST +for COMPILER in $COMPILERS_TO_TEST; do + echo "Testing compiler $COMPILER" + build_and_test_all $COMPILER +done + +wait_summarize_and_exit diff --git a/packages/kokkos/config/yaml/volta.yaml b/packages/kokkos/config/yaml/volta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f67af9c2a44a427f6a8021763bced669cf8b30f6 --- /dev/null +++ b/packages/kokkos/config/yaml/volta.yaml @@ -0,0 +1,4 @@ +packages: + kokkos: + variants: +cuda +openmp +volta70 +cuda_lambda +wrapper ^cuda@10.1 + compiler: [gcc@7.2.0] diff --git a/packages/kokkos/containers/CMakeLists.txt b/packages/kokkos/containers/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0e0c4eadea7f64ac2cf8a5cf4d56fec786ff416 --- /dev/null +++ b/packages/kokkos/containers/CMakeLists.txt @@ -0,0 +1,12 @@ + + +KOKKOS_SUBPACKAGE(Containers) + +IF (NOT Kokkos_INSTALL_TESTING) + ADD_SUBDIRECTORY(src) +ENDIF() + +KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) +KOKKOS_ADD_TEST_DIRECTORIES(performance_tests) + +KOKKOS_SUBPACKAGE_POSTPROCESS() diff --git a/packages/kokkos/containers/cmake/Dependencies.cmake b/packages/kokkos/containers/cmake/Dependencies.cmake new file mode 100644 index 0000000000000000000000000000000000000000..5e29157369c9ab8cab935a1bfc4c6dad2fdd0296 --- /dev/null +++ b/packages/kokkos/containers/cmake/Dependencies.cmake @@ -0,0 +1,5 @@ +TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( + LIB_REQUIRED_PACKAGES KokkosCore + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX + TEST_OPTIONAL_TPLS CUSPARSE + ) diff --git a/packages/kokkos/containers/cmake/KokkosContainers_config.h.in b/packages/kokkos/containers/cmake/KokkosContainers_config.h.in new file mode 100644 index 
0000000000000000000000000000000000000000..d91fdda1e353eddb2088ff86327e142676c9a6c9 --- /dev/null +++ b/packages/kokkos/containers/cmake/KokkosContainers_config.h.in @@ -0,0 +1,4 @@ +#ifndef KOKKOS_CONTAINERS_CONFIG_H +#define KOKKOS_CONTAINERS_CONFIG_H + +#endif diff --git a/packages/kokkos/containers/performance_tests/CMakeLists.txt b/packages/kokkos/containers/performance_tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..43c66c24fd8b83f579ed1961fc996c9b82e1d073 --- /dev/null +++ b/packages/kokkos/containers/performance_tests/CMakeLists.txt @@ -0,0 +1,28 @@ + +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) + +foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) + # Because there is always an exception to the rule + if(Tag STREQUAL "Threads") + set(DEVICE "PTHREAD") + else() + string(TOUPPER ${Tag} DEVICE) + endif() + string(TOLOWER ${Tag} dir) + + if(Kokkos_ENABLE_${DEVICE}) + message(STATUS "Sources Test${Tag}.cpp") + + set(SOURCES + TestMain.cpp + Test${Tag}.cpp + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_${Tag} + SOURCES ${SOURCES} + ) + endif() +endforeach() diff --git a/packages/kokkos/containers/performance_tests/Makefile b/packages/kokkos/containers/performance_tests/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..cbb8490798fd69586355cd3a0f449a8585d55565 --- /dev/null +++ b/packages/kokkos/containers/performance_tests/Makefile @@ -0,0 +1,101 @@ +KOKKOS_PATH = ../.. 

# Bundled googletest sources (compiled directly, not linked as a library).
GTEST_PATH = ../../TPL/gtest

vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests

default: build_all
	echo "End Build"

# Cuda builds must go through nvcc_wrapper; everything else uses g++.
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
else
  CXX = g++
endif

CXXFLAGS = -O3
LINK ?= $(CXX)
LDFLAGS ?=
# gtest requires pthreads regardless of the enabled Kokkos backends.
override LDFLAGS += -lpthread

include $(KOKKOS_PATH)/Makefile.kokkos

KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/performance_tests

TEST_TARGETS =
TARGETS =

# One executable + one `test-*` target per enabled backend.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  OBJ_CUDA = TestCuda.o TestMain.o gtest-all.o
  TARGETS += KokkosContainers_PerformanceTest_Cuda
  TEST_TARGETS += test-cuda
endif

ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  OBJ_HIP = TestHIP.o TestMain.o gtest-all.o
  TARGETS += KokkosContainers_PerformanceTest_HIP
  TEST_TARGETS += test-hip
endif

ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
  OBJ_THREADS = TestThreads.o TestMain.o gtest-all.o
  TARGETS += KokkosContainers_PerformanceTest_Threads
  TEST_TARGETS += test-threads
endif

ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
  OBJ_OPENMP = TestOpenMP.o TestMain.o gtest-all.o
  TARGETS += KokkosContainers_PerformanceTest_OpenMP
  TEST_TARGETS += test-openmp
endif

ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  OBJ_HPX = TestHPX.o TestMain.o gtest-all.o
  TARGETS += KokkosContainers_PerformanceTest_HPX
  TEST_TARGETS += test-hpx
endif

# Link rules (one per backend executable).
KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda

KokkosContainers_PerformanceTest_HIP: $(OBJ_HIP) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HIP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_HIP

KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads

+KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP + +KokkosContainers_PerformanceTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_HPX + +test-cuda: KokkosContainers_PerformanceTest_Cuda + ./KokkosContainers_PerformanceTest_Cuda + +test-hip: KokkosContainers_PerformanceTest_HIP + ./KokkosContainers_PerformanceTest_HIP + +test-threads: KokkosContainers_PerformanceTest_Threads + ./KokkosContainers_PerformanceTest_Threads + +test-openmp: KokkosContainers_PerformanceTest_OpenMP + ./KokkosContainers_PerformanceTest_OpenMP + +test-hpx: KokkosContainers_PerformanceTest_HPX + ./KokkosContainers_PerformanceTest_HPX + +build_all: $(TARGETS) + +test: $(TEST_TARGETS) + +clean: kokkos-clean + rm -f *.o $(TARGETS) + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + +gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc diff --git a/packages/kokkos/containers/performance_tests/TestCuda.cpp b/packages/kokkos/containers/performance_tests/TestCuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8874590e2acad12369989d87b78670fa004b2086 --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestCuda.cpp @@ -0,0 +1,90 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <cstdint> +#include <string> +#include <iostream> +#include <iomanip> +#include <sstream> +#include <fstream> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <TestDynRankView.hpp> + +#include <Kokkos_UnorderedMap.hpp> + +#include <TestGlobal2LocalIds.hpp> + +#include <TestUnorderedMapPerformance.hpp> + +namespace Performance { + +TEST(TEST_CATEGORY, dynrankview_perf) { + std::cout << "Cuda" << std::endl; + std::cout << " DynRankView vs View: Initialization Only " << std::endl; + test_dynrankview_op_perf<Kokkos::Cuda>(40960); +} + +TEST(TEST_CATEGORY, global_2_local) { + std::cout << "Cuda" << std::endl; + std::cout << "size, create, generate, fill, find" << std::endl; + for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; + i *= Performance::id_step) + test_global_to_local_ids<Kokkos::Cuda>(i); +} + +TEST(TEST_CATEGORY, unordered_map_performance_near) { + Perf::run_performance_tests<Kokkos::Cuda, true>("cuda-near"); +} + +TEST(TEST_CATEGORY, unordered_map_performance_far) { + Perf::run_performance_tests<Kokkos::Cuda, false>("cuda-far"); +} + +} // namespace Performance diff --git a/packages/kokkos/containers/performance_tests/TestDynRankView.hpp b/packages/kokkos/containers/performance_tests/TestDynRankView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8c507c76621d09b134ad94f12da589e8c31a014c --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestDynRankView.hpp @@ -0,0 +1,273 @@ + +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP +#define KOKKOS_TEST_DYNRANKVIEW_HPP + +#include <Kokkos_Core.hpp> +#include <Kokkos_DynRankView.hpp> +#include <vector> + +#include <impl/Kokkos_Timer.hpp> + +// Compare performance of DynRankView to View, specific focus on the parenthesis +// operators + +namespace Performance { + +// View functor +template <typename DeviceType> +struct InitViewFunctor { + using inviewtype = Kokkos::View<double ***, DeviceType>; + inviewtype _inview; + + InitViewFunctor(inviewtype &inview_) : _inview(inview_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.extent(1); ++j) { + for (unsigned k = 0; k < _inview.extent(2); ++k) { + _inview(i, j, k) = i / 2 - j * j + k / 3; + } + } + } + + struct SumComputationTest { + using inviewtype = Kokkos::View<double ***, DeviceType>; + inviewtype _inview; + + using outviewtype = Kokkos::View<double *, DeviceType>; + outviewtype _outview; + + KOKKOS_INLINE_FUNCTION + SumComputationTest(inviewtype &inview_, outviewtype &outview_) + : _inview(inview_), _outview(outview_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.extent(1); ++j) { + for (unsigned k = 0; k < _inview.extent(2); ++k) { + _outview(i) += _inview(i, j, k); + } + } + } + }; +}; + +template <typename DeviceType> +struct InitStrideViewFunctor { + using inviewtype = Kokkos::View<double ***, Kokkos::LayoutStride, DeviceType>; + inviewtype _inview; + + InitStrideViewFunctor(inviewtype &inview_) : _inview(inview_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.extent(1); ++j) { + for (unsigned k = 0; k < _inview.extent(2); ++k) { + _inview(i, j, k) = i / 2 - j * j + k / 3; + } + } + } +}; + +template <typename DeviceType> +struct InitViewRank7Functor { + using 
inviewtype = Kokkos::View<double *******, DeviceType>; + inviewtype _inview; + + InitViewRank7Functor(inviewtype &inview_) : _inview(inview_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.extent(1); ++j) { + for (unsigned k = 0; k < _inview.extent(2); ++k) { + _inview(i, j, k, 0, 0, 0, 0) = i / 2 - j * j + k / 3; + } + } + } +}; + +// DynRankView functor +template <typename DeviceType> +struct InitDynRankViewFunctor { + using inviewtype = Kokkos::DynRankView<double, DeviceType>; + inviewtype _inview; + + InitDynRankViewFunctor(inviewtype &inview_) : _inview(inview_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.extent(1); ++j) { + for (unsigned k = 0; k < _inview.extent(2); ++k) { + _inview(i, j, k) = i / 2 - j * j + k / 3; + } + } + } + + struct SumComputationTest { + using inviewtype = Kokkos::DynRankView<double, DeviceType>; + inviewtype _inview; + + using outviewtype = Kokkos::DynRankView<double, DeviceType>; + outviewtype _outview; + + KOKKOS_INLINE_FUNCTION + SumComputationTest(inviewtype &inview_, outviewtype &outview_) + : _inview(inview_), _outview(outview_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (unsigned j = 0; j < _inview.extent(1); ++j) { + for (unsigned k = 0; k < _inview.extent(2); ++k) { + _outview(i) += _inview(i, j, k); + } + } + } + }; +}; + +template <typename DeviceType> +void test_dynrankview_op_perf(const int par_size) { + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + const size_type dim_2 = 90; + const size_type dim_3 = 30; + + double elapsed_time_view = 0; + double elapsed_time_compview = 0; + double elapsed_time_strideview = 0; + double elapsed_time_view_rank7 = 0; + double elapsed_time_drview = 0; + double elapsed_time_compdrview = 0; + Kokkos::Timer timer; + { + Kokkos::View<double ***, DeviceType> testview("testview", par_size, dim_2, + 
dim_3); + using FunctorType = InitViewFunctor<DeviceType>; + + timer.reset(); + Kokkos::RangePolicy<DeviceType> policy(0, par_size); + Kokkos::parallel_for(policy, FunctorType(testview)); + DeviceType().fence(); + elapsed_time_view = timer.seconds(); + std::cout << " View time (init only): " << elapsed_time_view << std::endl; + + timer.reset(); + Kokkos::View<double *, DeviceType> sumview("sumview", par_size); + Kokkos::parallel_for( + policy, typename FunctorType::SumComputationTest(testview, sumview)); + DeviceType().fence(); + elapsed_time_compview = timer.seconds(); + std::cout << " View sum computation time: " << elapsed_time_view + << std::endl; + + Kokkos::View<double ***, Kokkos::LayoutStride, DeviceType> teststrideview = + Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL); + using FunctorStrideType = InitStrideViewFunctor<DeviceType>; + + timer.reset(); + Kokkos::parallel_for(policy, FunctorStrideType(teststrideview)); + DeviceType().fence(); + elapsed_time_strideview = timer.seconds(); + std::cout << " Strided View time (init only): " << elapsed_time_strideview + << std::endl; + } + { + Kokkos::View<double *******, DeviceType> testview("testview", par_size, + dim_2, dim_3, 1, 1, 1, 1); + using FunctorType = InitViewRank7Functor<DeviceType>; + + timer.reset(); + Kokkos::RangePolicy<DeviceType> policy(0, par_size); + Kokkos::parallel_for(policy, FunctorType(testview)); + DeviceType().fence(); + elapsed_time_view_rank7 = timer.seconds(); + std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 + << std::endl; + } + { + Kokkos::DynRankView<double, DeviceType> testdrview("testdrview", par_size, + dim_2, dim_3); + using FunctorType = InitDynRankViewFunctor<DeviceType>; + + timer.reset(); + Kokkos::RangePolicy<DeviceType> policy(0, par_size); + Kokkos::parallel_for(policy, FunctorType(testdrview)); + DeviceType().fence(); + elapsed_time_drview = timer.seconds(); + std::cout << " DynRankView time (init only): " << 
elapsed_time_drview + << std::endl; + + timer.reset(); + Kokkos::DynRankView<double, DeviceType> sumview("sumview", par_size); + Kokkos::parallel_for( + policy, typename FunctorType::SumComputationTest(testdrview, sumview)); + DeviceType().fence(); + elapsed_time_compdrview = timer.seconds(); + std::cout << " DynRankView sum computation time: " + << elapsed_time_compdrview << std::endl; + } + + std::cout << " Ratio of View to DynRankView time: " + << elapsed_time_view / elapsed_time_drview + << std::endl; // expect < 1 + std::cout << " Ratio of View to DynRankView sum computation time: " + << elapsed_time_compview / elapsed_time_compdrview + << std::endl; // expect < 1 + std::cout << " Ratio of View to View Rank7 time: " + << elapsed_time_view / elapsed_time_view_rank7 + << std::endl; // expect < 1 + std::cout << " Ratio of StrideView to DynRankView time: " + << elapsed_time_strideview / elapsed_time_drview + << std::endl; // expect < 1 + std::cout << " Ratio of DynRankView to View Rank7 time: " + << elapsed_time_drview / elapsed_time_view_rank7 + << std::endl; // expect ? + + timer.reset(); + +} // end test_dynrankview + +} // namespace Performance +#endif diff --git a/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp new file mode 100644 index 0000000000000000000000000000000000000000..65de551b2715f1eb31f4385fa0cb2a455bca6a4f --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp @@ -0,0 +1,209 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP +#define KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP + +#include <Kokkos_Core.hpp> +#include <Kokkos_UnorderedMap.hpp> +#include <vector> +#include <algorithm> + +#include <impl/Kokkos_Timer.hpp> + +// This test will simulate global ids + +namespace Performance { + +static const unsigned begin_id_size = 256u; +static const unsigned end_id_size = 1u << 22; +static const unsigned id_step = 2u; + +union helper { + uint32_t word; + uint8_t byte[4]; +}; + +template <typename Device> +struct generate_ids { + using execution_space = Device; + using size_type = typename execution_space::size_type; + using local_id_view = Kokkos::View<uint32_t*, execution_space>; + + local_id_view local_2_global; + + generate_ids(local_id_view& ids) : local_2_global(ids) { + Kokkos::parallel_for(local_2_global.extent(0), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const { + helper x = {static_cast<uint32_t>(i)}; + + // shuffle the bytes of i to create a unique, semi-random global_id + x.word = ~x.word; + + uint8_t tmp = x.byte[3]; + x.byte[3] = x.byte[1]; + x.byte[1] = tmp; + + tmp = x.byte[2]; + x.byte[2] = x.byte[0]; + x.byte[0] = tmp; + + local_2_global[i] = x.word; + } +}; + +template <typename Device> +struct fill_map { + using execution_space = Device; + using size_type = typename execution_space::size_type; + using local_id_view = Kokkos::View<const uint32_t*, execution_space, + Kokkos::MemoryRandomAccess>; + using global_id_view = + Kokkos::UnorderedMap<uint32_t, size_type, execution_space>; + + global_id_view global_2_local; + local_id_view local_2_global; + + fill_map(global_id_view gIds, local_id_view lIds) + : global_2_local(gIds), local_2_global(lIds) { + Kokkos::parallel_for(local_2_global.extent(0), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const { + 
global_2_local.insert(local_2_global[i], i); + } +}; + +template <typename Device> +struct find_test { + using execution_space = Device; + using size_type = typename execution_space::size_type; + using local_id_view = Kokkos::View<const uint32_t*, execution_space, + Kokkos::MemoryRandomAccess>; + using global_id_view = + Kokkos::UnorderedMap<const uint32_t, const size_type, execution_space>; + + global_id_view global_2_local; + local_id_view local_2_global; + + using value_type = size_t; + + find_test(global_id_view gIds, local_id_view lIds, value_type& num_errors) + : global_2_local(gIds), local_2_global(lIds) { + Kokkos::parallel_reduce(local_2_global.extent(0), *this, num_errors); + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& v) const { v = 0; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, volatile value_type const& src) const { + dst += src; + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, value_type& num_errors) const { + uint32_t index = global_2_local.find(local_2_global[i]); + + if (global_2_local.value_at(index) != i) ++num_errors; + } +}; + +template <typename Device> +void test_global_to_local_ids(unsigned num_ids) { + using execution_space = Device; + using size_type = typename execution_space::size_type; + + using local_id_view = Kokkos::View<uint32_t*, execution_space>; + using global_id_view = + Kokkos::UnorderedMap<uint32_t, size_type, execution_space>; + + // size + std::cout << num_ids << ", "; + + double elasped_time = 0; + Kokkos::Timer timer; + + local_id_view local_2_global("local_ids", num_ids); + global_id_view global_2_local((3u * num_ids) / 2u); + + // create + elasped_time = timer.seconds(); + std::cout << elasped_time << ", "; + timer.reset(); + + // generate unique ids + { generate_ids<Device> gen(local_2_global); } + Device().fence(); + // generate + elasped_time = timer.seconds(); + std::cout << elasped_time << ", "; + timer.reset(); + + { fill_map<Device> fill(global_2_local, local_2_global); 
} + Device().fence(); + + // fill + elasped_time = timer.seconds(); + std::cout << elasped_time << ", "; + timer.reset(); + + size_t num_errors = 0; + for (int i = 0; i < 100; ++i) { + find_test<Device> find(global_2_local, local_2_global, num_errors); + } + Device().fence(); + + // find + elasped_time = timer.seconds(); + std::cout << elasped_time << std::endl; + + ASSERT_EQ(num_errors, 0u); +} + +} // namespace Performance + +#endif // KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP diff --git a/packages/kokkos/containers/performance_tests/TestHIP.cpp b/packages/kokkos/containers/performance_tests/TestHIP.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8033c76be6cfaf491ba7ff9abd63856aca589c89 --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestHIP.cpp @@ -0,0 +1,90 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <cstdint> +#include <string> +#include <iostream> +#include <iomanip> +#include <sstream> +#include <fstream> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <TestDynRankView.hpp> + +#include <Kokkos_UnorderedMap.hpp> + +#include <TestGlobal2LocalIds.hpp> + +#include <TestUnorderedMapPerformance.hpp> + +namespace Performance { + +TEST(TEST_CATEGORY, dynrankview_perf) { + std::cout << "HIP" << std::endl; + std::cout << " DynRankView vs View: Initialization Only " << std::endl; + test_dynrankview_op_perf<Kokkos::Experimental::HIP>(40960); +} + +TEST(TEST_CATEGORY, global_2_local) { + std::cout << "HIP" << std::endl; + std::cout << "size, create, generate, fill, find" << std::endl; + for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; + i *= Performance::id_step) + test_global_to_local_ids<Kokkos::Experimental::HIP>(i); +} + +TEST(TEST_CATEGORY, unordered_map_performance_near) { + Perf::run_performance_tests<Kokkos::Experimental::HIP, true>("hip-near"); +} + 
+TEST(TEST_CATEGORY, unordered_map_performance_far) { + Perf::run_performance_tests<Kokkos::Experimental::HIP, false>("hip-far"); +} + +} // namespace Performance diff --git a/packages/kokkos/containers/performance_tests/TestHPX.cpp b/packages/kokkos/containers/performance_tests/TestHPX.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f229901dcc421f964d731380172690986b65163a --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestHPX.cpp @@ -0,0 +1,108 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <Kokkos_UnorderedMap.hpp> + +#include <TestGlobal2LocalIds.hpp> +#include <TestUnorderedMapPerformance.hpp> + +#include <TestDynRankView.hpp> +#include <TestScatterView.hpp> + +#include <iomanip> +#include <sstream> +#include <string> +#include <fstream> + +namespace Performance { + +TEST(TEST_CATEGORY, dynrankview_perf) { + std::cout << "HPX" << std::endl; + std::cout << " DynRankView vs View: Initialization Only " << std::endl; + test_dynrankview_op_perf<Kokkos::Experimental::HPX>(8192); +} + +TEST(TEST_CATEGORY, global_2_local) { + std::cout << "HPX" << std::endl; + std::cout << "size, create, generate, fill, find" << std::endl; + for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; + i *= Performance::id_step) + test_global_to_local_ids<Kokkos::Experimental::HPX>(i); +} + +TEST(TEST_CATEGORY, unordered_map_performance_near) { + unsigned num_hpx = 4; + std::ostringstream base_file_name; + base_file_name << "hpx-" << num_hpx << "-near"; + Perf::run_performance_tests<Kokkos::Experimental::HPX, true>( + base_file_name.str()); +} + +TEST(TEST_CATEGORY, unordered_map_performance_far) { + unsigned num_hpx = 4; + std::ostringstream base_file_name; + base_file_name 
<< "hpx-" << num_hpx << "-far"; + Perf::run_performance_tests<Kokkos::Experimental::HPX, false>( + base_file_name.str()); +} + +TEST(TEST_CATEGORY, scatter_view) { + std::cout << "ScatterView data-duplicated test:\n"; + Perf::test_scatter_view<Kokkos::Experimental::HPX, Kokkos::LayoutRight, + Kokkos::Experimental::ScatterDuplicated, + Kokkos::Experimental::ScatterNonAtomic>(10, + 1000 * 1000); + // std::cout << "ScatterView atomics test:\n"; + // Perf::test_scatter_view<Kokkos::Experimental::HPX, Kokkos::LayoutRight, + // Kokkos::Experimental::ScatterNonDuplicated, + // Kokkos::Experimental::ScatterAtomic>(10, 1000 * 1000); +} + +} // namespace Performance diff --git a/packages/kokkos/containers/performance_tests/TestMain.cpp b/packages/kokkos/containers/performance_tests/TestMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..140ba418fdac4eda95ed362a9b5bf64e50676cc2 --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestMain.cpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include <cstdlib> + +#include <Kokkos_Core.hpp> + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + ::testing::InitGoogleTest(&argc, argv); + + int result = RUN_ALL_TESTS(); + Kokkos::finalize(); + return result; +} diff --git a/packages/kokkos/containers/performance_tests/TestOpenMP.cpp b/packages/kokkos/containers/performance_tests/TestOpenMP.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f414b0d8282b62c01b64d7c1275c2bc1bf7bbdf4 --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestOpenMP.cpp @@ -0,0 +1,116 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <Kokkos_UnorderedMap.hpp> + +#include <TestGlobal2LocalIds.hpp> +#include <TestUnorderedMapPerformance.hpp> + +#include <TestDynRankView.hpp> +#include <TestScatterView.hpp> + +#include <iomanip> +#include <sstream> +#include <string> +#include <fstream> + +namespace Performance { + +TEST(TEST_CATEGORY, dynrankview_perf) { + std::cout << "OpenMP" << std::endl; + std::cout << " DynRankView vs View: Initialization Only " << std::endl; + test_dynrankview_op_perf<Kokkos::OpenMP>(8192); +} + +TEST(TEST_CATEGORY, global_2_local) { + std::cout << "OpenMP" << std::endl; + std::cout << "size, create, generate, fill, find" << std::endl; + for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; + i *= Performance::id_step) + test_global_to_local_ids<Kokkos::OpenMP>(i); +} + +TEST(TEST_CATEGORY, unordered_map_performance_near) { + unsigned num_openmp = 4; + if (Kokkos::hwloc::available()) { + num_openmp = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core(); + } + std::ostringstream base_file_name; + base_file_name << "openmp-" << num_openmp << "-near"; + Perf::run_performance_tests<Kokkos::OpenMP, true>(base_file_name.str()); +} + +TEST(TEST_CATEGORY, unordered_map_performance_far) { + unsigned num_openmp = 4; + if (Kokkos::hwloc::available()) { + num_openmp = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core(); + } + std::ostringstream base_file_name; + base_file_name << "openmp-" << num_openmp << "-far"; + Perf::run_performance_tests<Kokkos::OpenMP, false>(base_file_name.str()); +} + +TEST(TEST_CATEGORY, scatter_view) { + std::cout << "ScatterView 
data-duplicated test:\n"; + Perf::test_scatter_view<Kokkos::OpenMP, Kokkos::LayoutRight, + Kokkos::Experimental::ScatterDuplicated, + Kokkos::Experimental::ScatterNonAtomic>(10, + 1000 * 1000); + // std::cout << "ScatterView atomics test:\n"; + // Perf::test_scatter_view<Kokkos::OpenMP, Kokkos::LayoutRight, + // Kokkos::Experimental::ScatterNonDuplicated, + // Kokkos::Experimental::ScatterAtomic>(10, 1000 * 1000); +} + +} // namespace Performance diff --git a/packages/kokkos/containers/performance_tests/TestScatterView.hpp b/packages/kokkos/containers/performance_tests/TestScatterView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0f3ba103efc5d09d012e3cc35cbfa41fa8be9170 --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestScatterView.hpp @@ -0,0 +1,117 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SCATTER_VIEW_HPP +#define KOKKOS_TEST_SCATTER_VIEW_HPP + +#include <Kokkos_ScatterView.hpp> +#include <impl/Kokkos_Timer.hpp> + +namespace Perf { + +template <typename ExecSpace, typename Layout, typename Duplication, + typename Contribution> +void test_scatter_view(int m, int n) { + Kokkos::View<double * [3], Layout, ExecSpace> original_view("original_view", + n); + { + auto scatter_view = Kokkos::Experimental::create_scatter_view< + Kokkos::Experimental::ScatterSum, Duplication, Contribution>( + original_view); + Kokkos::Experimental::UniqueToken< + ExecSpace, Kokkos::Experimental::UniqueTokenScope::Global> + unique_token{ExecSpace()}; + // auto internal_view = scatter_view.internal_view; + auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n); + for (int foo = 0; foo < 5; ++foo) { + { + auto num_threads = unique_token.size(); + std::cout << "num_threads " << num_threads << '\n'; + Kokkos::View<double* * [3], Layout, ExecSpace> + hand_coded_duplicate_view("hand_coded_duplicate", num_threads, n); + auto f2 = KOKKOS_LAMBDA(int i) { + 
auto thread_id = unique_token.acquire(); + for (int j = 0; j < 10; ++j) { + auto k = (i + j) % n; + hand_coded_duplicate_view(thread_id, k, 0) += 4.2; + hand_coded_duplicate_view(thread_id, k, 1) += 2.0; + hand_coded_duplicate_view(thread_id, k, 2) += 1.0; + } + }; + Kokkos::Timer timer; + timer.reset(); + for (int k = 0; k < m; ++k) { + Kokkos::parallel_for(policy, f2, + "hand_coded_duplicate_scatter_view_test"); + } + Kokkos::fence(); + auto t = timer.seconds(); + std::cout << "hand-coded test took " << t << " seconds\n"; + } + { + auto f = KOKKOS_LAMBDA(int i) { + auto scatter_access = scatter_view.access(); + for (int j = 0; j < 10; ++j) { + auto k = (i + j) % n; + scatter_access(k, 0) += 4.2; + scatter_access(k, 1) += 2.0; + scatter_access(k, 2) += 1.0; + } + }; + Kokkos::Timer timer; + timer.reset(); + for (int k = 0; k < m; ++k) { + Kokkos::parallel_for(policy, f, "scatter_view_test"); + } + Kokkos::fence(); + auto t = timer.seconds(); + std::cout << "test took " << t << " seconds\n"; + } + } + } +} + +} // namespace Perf + +#endif diff --git a/packages/kokkos/containers/performance_tests/TestThreads.cpp b/packages/kokkos/containers/performance_tests/TestThreads.cpp new file mode 100644 index 0000000000000000000000000000000000000000..72bef1a3ad027a2ba780230fb39e742f6a335327 --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestThreads.cpp @@ -0,0 +1,105 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <Kokkos_UnorderedMap.hpp> + +#include <iomanip> + +#include <TestGlobal2LocalIds.hpp> +#include <TestUnorderedMapPerformance.hpp> + +#include <TestDynRankView.hpp> + +#include <iomanip> +#include <sstream> +#include <string> +#include <fstream> + +namespace Performance { + +TEST(threads, dynrankview_perf) { + std::cout << "Threads" << std::endl; + std::cout << " DynRankView vs View: Initialization Only " << std::endl; + test_dynrankview_op_perf<Kokkos::Threads>(8192); +} + +TEST(threads, global_2_local) { + std::cout << "Threads" << std::endl; + std::cout << "size, create, generate, fill, find" << std::endl; + for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; + i *= Performance::id_step) + test_global_to_local_ids<Kokkos::Threads>(i); +} + +TEST(threads, unordered_map_performance_near) { + unsigned num_threads = 4; + if (Kokkos::hwloc::available()) { + num_threads = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core(); + } + std::ostringstream base_file_name; + base_file_name << "threads-" << num_threads << "-near"; + Perf::run_performance_tests<Kokkos::Threads, true>(base_file_name.str()); +} + +TEST(threads, unordered_map_performance_far) { + unsigned num_threads = 4; + if (Kokkos::hwloc::available()) { + num_threads = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core(); + } + std::ostringstream base_file_name; + base_file_name << "threads-" << num_threads << "-far"; + Perf::run_performance_tests<Kokkos::Threads, false>(base_file_name.str()); +} + +} // namespace Performance diff --git 
a/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c31412552ad696ada0dad4fd1058f76290282256 --- /dev/null +++ b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp @@ -0,0 +1,256 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
// IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER

#ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
#define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP

#include <impl/Kokkos_Timer.hpp>

#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
#include <sstream>

namespace Perf {

// Benchmark functor for Kokkos::UnorderedMap insertion throughput.
// Performs `inserts` insertions where each key is repeated `collisions`
// times, rehashing the map larger until a pass completes with no failed
// insertion, and records the wall-clock time of the whole loop.
// Near == true clusters duplicate keys in adjacent iterations
// (key = i / collisions); Near == false spreads them apart
// (key = i % (inserts / collisions)).
template <typename Device, bool Near>
struct UnorderedMapTest {
  using execution_space = Device;
  using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, execution_space>;
  using histogram_type = typename map_type::histogram_type;

  // Reduction value for one parallel insert pass.
  struct value_type {
    uint32_t failed_count;  // number of insertions that failed this pass
    uint32_t max_list;      // largest collision-list position observed
  };

  uint32_t capacity;    // requested initial map capacity
  uint32_t inserts;     // attempted insertions per pass
  uint32_t collisions;  // repetitions of each key
  double seconds;       // wall-clock time of the insert/rehash loop
  map_type map;
  histogram_type histogram;

  // Runs the whole experiment in the constructor: parallel-insert, and on
  // any failure grow the map and retry until every insertion succeeds.
  // Prints the (color-coded) pass count and ns/insert to stdout.
  UnorderedMapTest(uint32_t arg_capacity, uint32_t arg_inserts,
                   uint32_t arg_collisions)
      : capacity(arg_capacity),
        inserts(arg_inserts),
        collisions(arg_collisions),
        seconds(0),
        map(capacity),
        histogram(map.get_histogram()) {
    Kokkos::Timer wall_clock;
    wall_clock.reset();

    value_type v = {};
    int loop_count = 0;
    do {
      ++loop_count;

      v = value_type();
      Kokkos::parallel_reduce(inserts, *this, v);

      if (v.failed_count > 0u) {
        // Grow ~15% (3/20) plus failed_count/collisions extra slots;
        // the 3ull literal promotes the product to 64-bit to avoid overflow.
        const uint32_t new_capacity = map.capacity() +
                                      ((map.capacity() * 3ull) / 20u) +
                                      v.failed_count / collisions;
        map.rehash(new_capacity);
      }
    } while (v.failed_count > 0u);

    seconds = wall_clock.seconds();

    // ANSI color codes: green = converged in 1 pass, bold red = 2 passes,
    // plain red = 3 or more.
    switch (loop_count) {
      case 1u: std::cout << " \033[0;32m" << loop_count << "\033[0m "; break;
      case 2u: std::cout << " \033[1;31m" << loop_count << "\033[0m "; break;
      default: std::cout << " \033[0;31m" << loop_count << "\033[0m "; break;
    }
    std::cout << std::setprecision(2) << std::fixed << std::setw(5)
              << (1e9 * (seconds / (inserts))) << "; " << std::flush;

    histogram.calculate();
    Device().fence();
  }

  // Emits one CSV row per stream: summary metrics plus the three histogram
  // breakdowns (collision-list length, probe distance, block distance).
  void print(std::ostream& metrics_out, std::ostream& length_out,
             std::ostream& distance_out, std::ostream& block_distance_out) {
    metrics_out << map.capacity() << " , ";
    metrics_out << inserts / collisions << " , ";
    metrics_out << (100.0 * inserts / collisions) / map.capacity() << " , ";
    metrics_out << inserts << " , ";
    metrics_out << (map.failed_insert() ? "true" : "false") << " , ";
    metrics_out << collisions << " , ";
    metrics_out << 1e9 * (seconds / inserts) << " , ";
    metrics_out << seconds << std::endl;

    length_out << map.capacity() << " , ";
    length_out << ((100.0 * inserts / collisions) / map.capacity()) << " , ";
    length_out << collisions << " , ";
    histogram.print_length(length_out);

    distance_out << map.capacity() << " , ";
    distance_out << ((100.0 * inserts / collisions) / map.capacity()) << " , ";
    distance_out << collisions << " , ";
    histogram.print_distance(distance_out);

    block_distance_out << map.capacity() << " , ";
    block_distance_out << ((100.0 * inserts / collisions) / map.capacity())
                       << " , ";
    block_distance_out << collisions << " , ";
    histogram.print_block_distance(block_distance_out);
  }

  // Reduction identity.
  KOKKOS_INLINE_FUNCTION
  void init(value_type& v) const {
    v.failed_count = 0;
    v.max_list = 0;
  }

  // Reduction combine: sum the failures, keep the maximum list position.
  KOKKOS_INLINE_FUNCTION
  void join(volatile value_type& dst, const volatile value_type& src) const {
    dst.failed_count += src.failed_count;
    dst.max_list = src.max_list < dst.max_list ? dst.max_list : src.max_list;
  }

  // Per-iteration insert; contributes failures and list position to the
  // reduction.
  KOKKOS_INLINE_FUNCTION
  void operator()(uint32_t i, value_type& v) const {
    const uint32_t key = Near ? i / collisions : i % (inserts / collisions);
    typename map_type::insert_result result = map.insert(key, i);
    v.failed_count += !result.failed() ? 0 : 1;
    v.max_list = result.list_position() < v.max_list ? v.max_list
                                                     : result.list_position();
  }
};

// Driver: sweeps capacities 2^14 .. 2^24 for each fill ratio and collision
// count, writing CSV results to <base_file_name>-*.csv.
// NOTE: the entire body is currently disabled via `#if 0`; only the
// "skipping test" message runs.
template <typename Device, bool Near>
void run_performance_tests(std::string const& base_file_name) {
#if 0
  std::string metrics_file_name = base_file_name + std::string("-metrics.csv");
  std::string length_file_name = base_file_name + std::string("-length.csv");
  std::string distance_file_name = base_file_name + std::string("-distance.csv");
  std::string block_distance_file_name = base_file_name + std::string("-block_distance.csv");

  std::ofstream metrics_out( metrics_file_name.c_str(), std::ofstream::out );
  std::ofstream length_out( length_file_name.c_str(), std::ofstream::out );
  std::ofstream distance_out( distance_file_name.c_str(), std::ofstream::out );
  std::ofstream block_distance_out( block_distance_file_name.c_str(), std::ofstream::out );


  /*
  const double test_ratios[] = {
     0.50
   , 0.75
   , 0.80
   , 0.85
   , 0.90
   , 0.95
   , 1.00
   , 1.25
   , 2.00
  };
  */

  const double test_ratios[] = { 1.00 };

  const int num_ratios = sizeof(test_ratios) / sizeof(double);

  /*
  const uint32_t collisions[] {
      1
    , 4
    , 16
    , 64
  };
  */

  const uint32_t collisions[] = { 16 };

  const int num_collisions = sizeof(collisions) / sizeof(uint32_t);

  // set up file headers
  metrics_out << "Capacity , Unique , Percent Full , Attempted Inserts , Failed Inserts , Collision Ratio , Nanoseconds/Inserts, Seconds" << std::endl;
  length_out << "Capacity , Percent Full , ";
  distance_out << "Capacity , Percent Full , ";
  block_distance_out << "Capacity , Percent Full , ";

  for (int i=0; i<100; ++i) {
    length_out << i << " , ";
    distance_out << i << " , ";
    block_distance_out << i << " , ";
  }

  length_out << "\b\b\b " << std::endl;
  distance_out << "\b\b\b " << std::endl;
  block_distance_out << "\b\b\b " << std::endl;

  Kokkos::Timer wall_clock ;
  for (int i=0; i < num_collisions ; ++i) {
    wall_clock.reset();
    std::cout << "Collisions: " << collisions[i] << std::endl;
    for (int j = 0; j < num_ratios; ++j) {
      std::cout << std::setprecision(1) << std::fixed << std::setw(5) << (100.0*test_ratios[j]) << "% " << std::flush;
      for (uint32_t capacity = 1<<14; capacity < 1<<25; capacity = capacity << 1) {
        uint32_t inserts = static_cast<uint32_t>(test_ratios[j]*(capacity));
        std::cout << capacity << std::flush;
        UnorderedMapTest<Device, Near> test(capacity, inserts*collisions[i], collisions[i]);
        Device().fence();
        test.print(metrics_out, length_out, distance_out, block_distance_out);
      }
      std::cout << "\b\b " << std::endl;

    }
    std::cout << " " << wall_clock.seconds() << " secs" << std::endl;
  }
  metrics_out.close();
  length_out.close();
  distance_out.close();
  block_distance_out.close();
#else
  (void)base_file_name;
  std::cout << "skipping test" << std::endl;
#endif
}

}  // namespace Perf

#endif  // KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP

# ---- file: packages/kokkos/containers/src/CMakeLists.txt ----

KOKKOS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)

# need these here for now
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

#-----------------------------------------------------------------------------

# Collect the containers library sources (impl/*.cpp plus headers below).
SET(KOKKOS_CONTAINERS_SRCS)
APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp)
SET(KOKKOS_CONTAINER_HEADERS)
# Headers are globbed into the SRCS list as well so they show up in IDE
# project generators.
# NOTE(review): KOKKOS_CONTAINER_HEADERS is declared above but never
# populated, so the HEADERS argument to KOKKOS_ADD_LIBRARY below is always
# empty — confirm this is intentional (installation is handled by the
# INSTALL(DIRECTORY ...) rule instead).
APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp)
APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)


# Install every header under this directory into the Kokkos header prefix.
INSTALL (
  DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
  DESTINATION ${KOKKOS_HEADER_DIR}
  FILES_MATCHING PATTERN "*.hpp"
)

KOKKOS_ADD_LIBRARY(
  kokkoscontainers
  SOURCES ${KOKKOS_CONTAINERS_SRCS}
  HEADERS ${KOKKOS_CONTAINER_HEADERS}
)

KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers
  ${KOKKOS_TOP_BUILD_DIR}
  ${CMAKE_CURRENT_BINARY_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}
)
# containers depends on the core Kokkos library.
KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore)

#-----------------------------------------------------------------------------

# ---- file: packages/kokkos/containers/src/Kokkos_Bitset.hpp ----
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3.
// Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_BITSET_HPP
#define KOKKOS_BITSET_HPP

#include <Kokkos_Core.hpp>
#include <Kokkos_Functional.hpp>

#include <impl/Kokkos_Bitset_impl.hpp>

#include <stdexcept>

namespace Kokkos {

template <typename Device = Kokkos::DefaultExecutionSpace>
class Bitset;

template <typename Device = Kokkos::DefaultExecutionSpace>
class ConstBitset;

// Copy bit contents between (Const)Bitsets of possibly different devices;
// declared up front so the classes below can befriend them.
template <typename DstDevice, typename SrcDevice>
void deep_copy(Bitset<DstDevice>& dst, Bitset<SrcDevice> const& src);

template <typename DstDevice, typename SrcDevice>
void deep_copy(Bitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src);

template <typename DstDevice, typename SrcDevice>
void deep_copy(ConstBitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src);

/// A thread safe view to a bitset
template <typename Device>
class Bitset {
 public:
  using execution_space = Device;
  using size_type = unsigned int;

  // Flags for find_any_set_near / find_any_unset_near: which direction to
  // scan within a block, and which way to move the hint when the current
  // block has no candidate bit.
  enum { BIT_SCAN_REVERSE = 1u };
  enum { MOVE_HINT_BACKWARD = 2u };

  enum {
    BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u,
    BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = BIT_SCAN_REVERSE,
    BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD,
    BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD
  };

 private:
  // Bits are stored in `unsigned` blocks of block_size bits; block_shift is
  // log2(block_size) so (i >> block_shift) is i's block and (i & block_mask)
  // is i's bit within that block.
  enum { block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT) };
  enum { block_mask = block_size - 1u };
  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };

 public:
  /// constructor
  /// arg_size := number of bit in set
  Bitset(unsigned arg_size = 0u)
      : m_size(arg_size),
        m_last_block_mask(0u),
        m_blocks("Bitset", ((m_size + block_mask) >> block_shift)) {
    // Precompute the mask of bits actually in use in the (possibly
    // partially filled) last block; stays 0 when m_size is a multiple of
    // block_size.
    for (int i = 0, end = static_cast<int>(m_size & block_mask); i < end; ++i) {
      m_last_block_mask |= 1u << i;
    }
  }

  KOKKOS_DEFAULTED_FUNCTION
  Bitset(const Bitset<Device>&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  Bitset& operator=(const Bitset<Device>&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  Bitset(Bitset<Device>&&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  Bitset& operator=(Bitset<Device>&&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  ~Bitset() = default;

  /// number of bits in the set
  /// can be call from the host or the device
  KOKKOS_FORCEINLINE_FUNCTION
  unsigned size() const { return m_size; }

  /// number of bits which are set to 1
  /// can only be called from the host
  unsigned count() const {
    Impl::BitsetCount<Bitset<Device> > f(*this);
    return f.apply();
  }

  /// set all bits to 1
  /// can only be called from the host
  void set() {
    Kokkos::deep_copy(m_blocks, ~0u);

    if (m_last_block_mask) {
      // clear the unused bits in the last block
      using raw_deep_copy =
          Kokkos::Impl::DeepCopy<typename execution_space::memory_space,
                                 Kokkos::HostSpace>;
      raw_deep_copy(m_blocks.data() + (m_blocks.extent(0) - 1u),
                    &m_last_block_mask, sizeof(unsigned));
    }
  }

  /// set all bits to 0
  /// can only be called from the host
  void reset() { Kokkos::deep_copy(m_blocks, 0u); }

  /// set all bits to 0
  /// can only be called from the host
  void clear() { Kokkos::deep_copy(m_blocks, 0u); }

  /// set i'th bit to 1
  /// can only be called from the device
  /// returns true iff this call changed the bit from 0 to 1
  /// (atomic_fetch_or returns the previous block value)
  KOKKOS_FORCEINLINE_FUNCTION
  bool set(unsigned i) const {
    if (i < m_size) {
      unsigned* block_ptr = &m_blocks[i >> block_shift];
      const unsigned mask = 1u << static_cast<int>(i & block_mask);

      return !(atomic_fetch_or(block_ptr, mask) & mask);
    }
    return false;
  }

  /// set i'th bit to 0
  /// can only be called from the device
  /// returns true iff this call changed the bit from 1 to 0
  KOKKOS_FORCEINLINE_FUNCTION
  bool reset(unsigned i) const {
    if (i < m_size) {
      unsigned* block_ptr = &m_blocks[i >> block_shift];
      const unsigned mask = 1u << static_cast<int>(i & block_mask);

      return atomic_fetch_and(block_ptr, ~mask) & mask;
    }
    return false;
  }

  /// return true if the i'th bit set to 1
  /// can only be called from the device
  KOKKOS_FORCEINLINE_FUNCTION
  bool test(unsigned i) const {
    if (i < m_size) {
      // volatile_load: re-read the block so concurrent set/reset by other
      // threads is observed
      const unsigned block = volatile_load(&m_blocks[i >> block_shift]);
      const unsigned mask = 1u << static_cast<int>(i & block_mask);
      return block & mask;
    }
    return false;
  }

  /// used with find_any_set_near or find_any_unset_near functions
  /// returns the max number of times those functions should be call
  /// when searching for an available bit
  KOKKOS_FORCEINLINE_FUNCTION
  unsigned max_hint() const { return m_blocks.extent(0); }

  /// find a bit set to 1 near the hint
  /// returns a pair< bool, unsigned> where if result.first is true then
  /// result.second is the bit found and if result.first is false the
  /// result.second is a new hint
  KOKKOS_INLINE_FUNCTION
  Kokkos::pair<bool, unsigned> find_any_set_near(
      unsigned hint,
      unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD) const {
    // clamp an out-of-range hint back to block 0
    const unsigned block_idx =
        (hint >> block_shift) < m_blocks.extent(0) ? (hint >> block_shift) : 0;
    const unsigned offset = hint & block_mask;
    unsigned block = volatile_load(&m_blocks[block_idx]);
    // in the last block, ignore the unused tail bits
    block = !m_last_block_mask || (block_idx < (m_blocks.extent(0) - 1))
                ? block
                : block & m_last_block_mask;

    return find_any_helper(block_idx, offset, block, scan_direction);
  }

  /// find a bit set to 0 near the hint
  /// returns a pair< bool, unsigned> where if result.first is true then
  /// result.second is the bit found and if result.first is false the
  /// result.second is a new hint
  KOKKOS_INLINE_FUNCTION
  Kokkos::pair<bool, unsigned> find_any_unset_near(
      unsigned hint,
      unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD) const {
    // NOTE(review): unlike find_any_set_near, the hint's block index is not
    // range-checked here — confirm callers always pass hint < size().
    const unsigned block_idx = hint >> block_shift;
    const unsigned offset = hint & block_mask;
    unsigned block = volatile_load(&m_blocks[block_idx]);
    // invert so "unset" bits appear as 1s; mask off the unused tail of the
    // last block
    block = !m_last_block_mask || (block_idx < (m_blocks.extent(0) - 1))
                ? ~block
                : ~block & m_last_block_mask;

    return find_any_helper(block_idx, offset, block, scan_direction);
  }

  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
    return m_blocks.is_allocated();
  }

 private:
  // Shared tail of find_any_*_near: if `block` has any candidate bit (bit
  // set to 1 after the caller's masking/inversion), return its global index;
  // otherwise return a moved hint for the next call.
  KOKKOS_FORCEINLINE_FUNCTION
  Kokkos::pair<bool, unsigned> find_any_helper(unsigned block_idx,
                                               unsigned offset, unsigned block,
                                               unsigned scan_direction) const {
    Kokkos::pair<bool, unsigned> result(block > 0u, 0);

    if (!result.first) {
      result.second = update_hint(block_idx, offset, scan_direction);
    } else {
      result.second =
          scan_block((block_idx << block_shift), offset, block, scan_direction);
    }
    return result;
  }

  // Locate a set bit within `block`, starting the scan at `offset` (rotated
  // so the search begins at the hinted position), forward or reverse per
  // scan_direction; returns the bit's global index.
  KOKKOS_FORCEINLINE_FUNCTION
  unsigned scan_block(unsigned block_start, int offset, unsigned block,
                      unsigned scan_direction) const {
    offset = !(scan_direction & BIT_SCAN_REVERSE)
                 ? offset
                 : (offset + block_mask) & block_mask;
    block = Impl::rotate_right(block, offset);
    return (((!(scan_direction & BIT_SCAN_REVERSE)
                  ? Impl::bit_scan_forward(block)
                  : ::Kokkos::log2(block)) +
             offset) &
            block_mask) +
           block_start;
  }

  // Move the hint one block forward or backward, wrapping around at either
  // end of the block array; the returned hint keeps the same in-block offset.
  KOKKOS_FORCEINLINE_FUNCTION
  unsigned update_hint(long long block_idx, unsigned offset,
                       unsigned scan_direction) const {
    block_idx += scan_direction & MOVE_HINT_BACKWARD ? -1 : 1;
    block_idx = block_idx >= 0 ? block_idx : m_blocks.extent(0) - 1;
    block_idx =
        block_idx < static_cast<long long>(m_blocks.extent(0)) ? block_idx : 0;

    return static_cast<unsigned>(block_idx) * block_size + offset;
  }

 private:
  unsigned m_size;             // number of usable bits
  unsigned m_last_block_mask;  // in-use bits of the final (partial) block
  View<unsigned*, execution_space, MemoryTraits<RandomAccess> > m_blocks;

 private:
  template <typename DDevice>
  friend class Bitset;

  template <typename DDevice>
  friend class ConstBitset;

  template <typename Bitset>
  friend struct Impl::BitsetCount;

  template <typename DstDevice, typename SrcDevice>
  friend void deep_copy(Bitset<DstDevice>& dst, Bitset<SrcDevice> const& src);

  template <typename DstDevice, typename SrcDevice>
  friend void deep_copy(Bitset<DstDevice>& dst,
                        ConstBitset<SrcDevice> const& src);
};

/// a thread-safe view to a const bitset
/// i.e.
/// can only test bits
template <typename Device>
class ConstBitset {
 public:
  using execution_space = Device;
  using size_type = unsigned int;

 private:
  // Same block layout as Bitset (see above): bit i lives at bit
  // (i & block_mask) of block (i >> block_shift).
  enum { block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT) };
  enum { block_mask = block_size - 1u };
  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };

 public:
  ConstBitset() : m_size(0) {}

  // Shallow copy: shares the block storage of the (mutable) source Bitset.
  // Intentionally non-explicit so a Bitset converts implicitly.
  ConstBitset(Bitset<Device> const& rhs)
      : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}

  ConstBitset(ConstBitset<Device> const& rhs)
      : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}

  ConstBitset<Device>& operator=(Bitset<Device> const& rhs) {
    this->m_size = rhs.m_size;
    this->m_blocks = rhs.m_blocks;

    return *this;
  }

  ConstBitset<Device>& operator=(ConstBitset<Device> const& rhs) {
    this->m_size = rhs.m_size;
    this->m_blocks = rhs.m_blocks;

    return *this;
  }

  /// number of bits in the set
  KOKKOS_FORCEINLINE_FUNCTION
  unsigned size() const { return m_size; }

  /// number of bits which are set to 1
  /// can only be called from the host
  unsigned count() const {
    Impl::BitsetCount<ConstBitset<Device> > f(*this);
    return f.apply();
  }

  /// return true if the i'th bit is set to 1; out-of-range i returns false
  KOKKOS_FORCEINLINE_FUNCTION
  bool test(unsigned i) const {
    if (i < m_size) {
      const unsigned block = m_blocks[i >> block_shift];
      const unsigned mask = 1u << static_cast<int>(i & block_mask);
      return block & mask;
    }
    return false;
  }

 private:
  unsigned m_size;
  View<const unsigned*, execution_space, MemoryTraits<RandomAccess> > m_blocks;

 private:
  template <typename DDevice>
  friend class ConstBitset;

  template <typename Bitset>
  friend struct Impl::BitsetCount;

  template <typename DstDevice, typename SrcDevice>
  friend void deep_copy(Bitset<DstDevice>& dst,
                        ConstBitset<SrcDevice> const& src);

  template <typename DstDevice, typename SrcDevice>
  friend void deep_copy(ConstBitset<DstDevice>& dst,
                        ConstBitset<SrcDevice> const& src);
};

// Copy the raw block storage between bitsets of possibly different memory
// spaces. Throws std::runtime_error (host-only) when sizes differ.
template <typename DstDevice, typename SrcDevice>
void deep_copy(Bitset<DstDevice>& dst, Bitset<SrcDevice> const& src) {
  if (dst.size() != src.size()) {
    throw std::runtime_error(
        "Error: Cannot deep_copy bitsets of different sizes!");
  }

  using raw_deep_copy =
      Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
                             typename SrcDevice::memory_space>;
  raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(),
                sizeof(unsigned) * src.m_blocks.extent(0));
}

// Overload copying from a read-only ConstBitset into a mutable Bitset.
template <typename DstDevice, typename SrcDevice>
void deep_copy(Bitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
  if (dst.size() != src.size()) {
    throw std::runtime_error(
        "Error: Cannot deep_copy bitsets of different sizes!");
  }

  using raw_deep_copy =
      Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
                             typename SrcDevice::memory_space>;
  raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(),
                sizeof(unsigned) * src.m_blocks.extent(0));
}

// Overload copying between two read-only ConstBitsets.
template <typename DstDevice, typename SrcDevice>
void deep_copy(ConstBitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
  if (dst.size() != src.size()) {
    throw std::runtime_error(
        "Error: Cannot deep_copy bitsets of different sizes!");
  }

  using raw_deep_copy =
      Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
                             typename SrcDevice::memory_space>;
  raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(),
                sizeof(unsigned) * src.m_blocks.extent(0));
}

}  // namespace Kokkos

#endif  // KOKKOS_BITSET_HPP

// ---- file: packages/kokkos/containers/src/Kokkos_DualView.hpp ----
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S.
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_DualView.hpp +/// \brief Declaration and definition of Kokkos::DualView. +/// +/// This header file declares and defines Kokkos::DualView and its +/// related nonmember functions. 
+ +#ifndef KOKKOS_DUALVIEW_HPP +#define KOKKOS_DUALVIEW_HPP + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Error.hpp> + +namespace Kokkos { + +/* \class DualView + * \brief Container to manage mirroring a Kokkos::View that lives + * in device memory with a Kokkos::View that lives in host memory. + * + * This class provides capabilities to manage data which exists in two + * memory spaces at the same time. It keeps views of the same layout + * on two memory spaces as well as modified flags for both + * allocations. Users are responsible for setting the modified flags + * manually if they change the data in either memory space, by calling + * the sync() method templated on the device where they modified the + * data. Users may synchronize data by calling the modify() function, + * templated on the device towards which they want to synchronize + * (i.e., the target of the one-way copy operation). + * + * The DualView class also provides convenience methods such as + * realloc, resize and capacity which call the appropriate methods of + * the underlying Kokkos::View objects. + * + * The four template arguments are the same as those of Kokkos::View. + * (Please refer to that class' documentation for a detailed + * description.) + * + * \tparam DataType The type of the entries stored in the container. + * + * \tparam Layout The array's layout in memory. + * + * \tparam Device The Kokkos Device type. If its memory space is + * not the same as the host's memory space, then DualView will + * contain two separate Views: one in device memory, and one in + * host memory. Otherwise, DualView will only store one View. + * + * \tparam MemoryTraits (optional) The user's intended memory access + * behavior. Please see the documentation of Kokkos::View for + * examples. The default suffices for most users. 
+ */ + +namespace Impl { + +#ifdef KOKKOS_ENABLE_CUDA + +inline const Kokkos::Cuda& get_cuda_space(const Kokkos::Cuda& in) { return in; } + +inline const Kokkos::Cuda& get_cuda_space() { + return *Kokkos::Impl::cuda_get_deep_copy_space(); +} + +template <typename NonCudaExecSpace> +inline const Kokkos::Cuda& get_cuda_space(const NonCudaExecSpace&) { + return get_cuda_space(); +} + +#endif // KOKKOS_ENABLE_CUDA + +} // namespace Impl +template <class DataType, class Arg1Type = void, class Arg2Type = void, + class Arg3Type = void> +class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> { + template <class, class, class, class> + friend class DualView; + + public: + //! \name Typedefs for device types and various Kokkos::View specializations. + //@{ + using traits = ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type>; + + //! The Kokkos Host Device type; + using host_mirror_space = typename traits::host_mirror_space; + + //! The type of a Kokkos::View on the device. + using t_dev = View<typename traits::data_type, Arg1Type, Arg2Type, Arg3Type>; + + /// \typedef t_host + /// \brief The type of a Kokkos::View host mirror of \c t_dev. + using t_host = typename t_dev::HostMirror; + + //! The type of a const View on the device. + //! The type of a Kokkos::View on the device. + using t_dev_const = + View<typename traits::const_data_type, Arg1Type, Arg2Type, Arg3Type>; + + /// \typedef t_host_const + /// \brief The type of a const View host mirror of \c t_dev_const. + using t_host_const = typename t_dev_const::HostMirror; + + //! The type of a const, random-access View on the device. + using t_dev_const_randomread = + View<typename traits::const_data_type, typename traits::array_layout, + typename traits::device_type, + Kokkos::MemoryTraits<Kokkos::RandomAccess> >; + + /// \typedef t_host_const_randomread + /// \brief The type of a const, random-access View host mirror of + /// \c t_dev_const_randomread. 
+ using t_host_const_randomread = typename t_dev_const_randomread::HostMirror; + + //! The type of an unmanaged View on the device. + using t_dev_um = + View<typename traits::data_type, typename traits::array_layout, + typename traits::device_type, MemoryUnmanaged>; + + //! The type of an unmanaged View host mirror of \c t_dev_um. + using t_host_um = + View<typename t_host::data_type, typename t_host::array_layout, + typename t_host::device_type, MemoryUnmanaged>; + + //! The type of a const unmanaged View on the device. + using t_dev_const_um = + View<typename traits::const_data_type, typename traits::array_layout, + typename traits::device_type, MemoryUnmanaged>; + + //! The type of a const unmanaged View host mirror of \c t_dev_const_um. + using t_host_const_um = + View<typename t_host::const_data_type, typename t_host::array_layout, + typename t_host::device_type, MemoryUnmanaged>; + + //! The type of a const, random-access View on the device. + using t_dev_const_randomread_um = + View<typename t_host::const_data_type, typename t_host::array_layout, + typename t_host::device_type, + Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess> >; + + /// \typedef t_host_const_randomread + /// \brief The type of a const, random-access View host mirror of + /// \c t_dev_const_randomread. + using t_host_const_randomread_um = + typename t_dev_const_randomread_um::HostMirror; + + //@} + //! \name Counters to keep track of changes ("modified" flags) + //@{ + + protected: + // modified_flags[0] -> host + // modified_flags[1] -> device + using t_modified_flags = View<unsigned int[2], LayoutLeft, Kokkos::HostSpace>; + t_modified_flags modified_flags; + + public: + //@} + + // Moved this specifically after modified_flags to resolve an alignment issue + // on MSVC/NVCC + //! \name The two View instances. + //@{ + t_dev d_view; + t_host h_view; + //@} + + //! \name Constructors + //@{ + + /// \brief Empty constructor. 
+ /// + /// Both device and host View objects are constructed using their + /// default constructors. The "modified" flags are both initialized + /// to "unmodified." + DualView() = default; + + /// \brief Constructor that allocates View objects on both host and device. + /// + /// This constructor works like the analogous constructor of View. + /// The first argument is a string label, which is entirely for your + /// benefit. (Different DualView objects may have the same label if + /// you like.) The arguments that follow are the dimensions of the + /// View objects. For example, if the View has three dimensions, + /// the first three integer arguments will be nonzero, and you may + /// omit the integer arguments that follow. + DualView(const std::string& label, + const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : modified_flags(t_modified_flags("DualView::modified_flags")), + d_view(label, n0, n1, n2, n3, n4, n5, n6, n7), + h_view(create_mirror_view(d_view)) // without UVM, host View mirrors + {} + + /// \brief Constructor that allocates View objects on both host and device. + /// + /// This constructor works like the analogous constructor of View. + /// The first arguments are wrapped up in a ViewCtor class, this allows + /// for a label, without initializing, and all of the other things that can + /// be wrapped up in a Ctor class. + /// The arguments that follow are the dimensions of the + /// View objects. For example, if the View has three dimensions, + /// the first three integer arguments will be nonzero, and you may + /// omit the integer arguments that follow. + template <class... 
P> + DualView(const Impl::ViewCtorProp<P...>& arg_prop, + typename std::enable_if<!Impl::ViewCtorProp<P...>::has_pointer, + size_t>::type const n0 = + KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : modified_flags(t_modified_flags("DualView::modified_flags")), + d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7), + h_view(create_mirror_view(d_view)) // without UVM, host View mirrors + {} + + //! Copy constructor (shallow copy) + template <class SS, class LS, class DS, class MS> + DualView(const DualView<SS, LS, DS, MS>& src) + : modified_flags(src.modified_flags), + d_view(src.d_view), + h_view(src.h_view) {} + + //! Subview constructor + template <class SD, class S1, class S2, class S3, class Arg0, class... Args> + DualView(const DualView<SD, S1, S2, S3>& src, const Arg0& arg0, Args... args) + : modified_flags(src.modified_flags), + d_view(Kokkos::subview(src.d_view, arg0, args...)), + h_view(Kokkos::subview(src.h_view, arg0, args...)) {} + + /// \brief Create DualView from existing device and host View objects. + /// + /// This constructor assumes that the device and host View objects + /// are synchronized. You, the caller, are responsible for making + /// sure this is the case before calling this constructor. After + /// this constructor returns, you may use DualView's sync() and + /// modify() methods to ensure synchronization of the View objects. 
+ /// + /// \param d_view_ Device View + /// \param h_view_ Host View (must have type t_host = t_dev::HostMirror) + DualView(const t_dev& d_view_, const t_host& h_view_) + : modified_flags(t_modified_flags("DualView::modified_flags")), + d_view(d_view_), + h_view(h_view_) { + if (int(d_view.rank) != int(h_view.rank) || + d_view.extent(0) != h_view.extent(0) || + d_view.extent(1) != h_view.extent(1) || + d_view.extent(2) != h_view.extent(2) || + d_view.extent(3) != h_view.extent(3) || + d_view.extent(4) != h_view.extent(4) || + d_view.extent(5) != h_view.extent(5) || + d_view.extent(6) != h_view.extent(6) || + d_view.extent(7) != h_view.extent(7) || + d_view.stride_0() != h_view.stride_0() || + d_view.stride_1() != h_view.stride_1() || + d_view.stride_2() != h_view.stride_2() || + d_view.stride_3() != h_view.stride_3() || + d_view.stride_4() != h_view.stride_4() || + d_view.stride_5() != h_view.stride_5() || + d_view.stride_6() != h_view.stride_6() || + d_view.stride_7() != h_view.stride_7() || + d_view.span() != h_view.span()) { + Kokkos::Impl::throw_runtime_exception( + "DualView constructed with incompatible views"); + } + } + // does the DualView have only one device + struct impl_dualview_is_single_device { + enum : bool { + value = std::is_same<typename t_dev::device_type, + typename t_host::device_type>::value + }; + }; + + // does the given device match the device of t_dev? + template <typename Device> + struct impl_device_matches_tdev_device { + enum : bool { + value = std::is_same<typename t_dev::device_type, Device>::value + }; + }; + // does the given device match the device of t_host? + template <typename Device> + struct impl_device_matches_thost_device { + enum : bool { + value = std::is_same<typename t_host::device_type, Device>::value + }; + }; + + // does the given device match the execution space of t_host? 
  template <typename Device>
  struct impl_device_matches_thost_exec {
    enum : bool {
      value = std::is_same<typename t_host::execution_space, Device>::value
    };
  };

  // does the given device match the execution space of t_dev?
  template <typename Device>
  struct impl_device_matches_tdev_exec {
    enum : bool {
      value = std::is_same<typename t_dev::execution_space, Device>::value
    };
  };

  // does the given device's memory space match the memory space of t_dev?
  template <typename Device>
  struct impl_device_matches_tdev_memory_space {
    enum : bool {
      value = std::is_same<typename t_dev::memory_space,
                           typename Device::memory_space>::value
    };
  };

  //@}
  //! \name Methods for synchronizing, marking as modified, and getting Views.
  //@{

  /// \brief Return a View on a specific device \c Device.
  ///
  /// Please don't be afraid of the nested if_c expressions in the return
  /// value's type. That just tells the method what the return type
  /// should be: t_dev if the \c Device template parameter matches
  /// this DualView's device type, else t_host.
  ///
  /// For example, suppose you create a DualView on Cuda, like this:
  /// \code
  /// using dual_view_type =
  /// Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda>;
  /// dual_view_type DV ("my dual view", 100);
  /// \endcode
  /// If you want to get the CUDA device View, do this:
  /// \code
  /// typename dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> ();
  /// \endcode
  /// and if you want to get the host mirror of that View, do this:
  /// \code
  /// using host_device_type = typename Kokkos::HostSpace::execution_space;
  /// typename dual_view_type::t_host hostView = DV.view<host_device_type> ();
  /// \endcode
  template <class Device>
  KOKKOS_INLINE_FUNCTION const typename std::conditional_t<
      impl_device_matches_tdev_device<Device>::value, t_dev,
      typename std::conditional_t<
          impl_device_matches_thost_device<Device>::value, t_host,
          typename std::conditional_t<
              impl_device_matches_thost_exec<Device>::value, t_host,
              typename std::conditional_t<
                  impl_device_matches_tdev_exec<Device>::value, t_dev,
                  typename std::conditional_t<
                      impl_device_matches_tdev_memory_space<Device>::value,
                      t_dev, t_host> > > > >
  view() const {
    // The constants below only feed the static_assert diagnostic; the actual
    // runtime/type selection happens in the return statement (memory-space
    // equality via Impl::if_c) and in the return type above.
    constexpr bool device_is_memspace =
        std::is_same<Device, typename Device::memory_space>::value;
    constexpr bool device_is_execspace =
        std::is_same<Device, typename Device::execution_space>::value;
    constexpr bool device_exec_is_t_dev_exec =
        std::is_same<typename Device::execution_space,
                     typename t_dev::execution_space>::value;
    constexpr bool device_mem_is_t_dev_mem =
        std::is_same<typename Device::memory_space,
                     typename t_dev::memory_space>::value;
    constexpr bool device_exec_is_t_host_exec =
        std::is_same<typename Device::execution_space,
                     typename t_host::execution_space>::value;
    constexpr bool device_mem_is_t_host_mem =
        std::is_same<typename Device::memory_space,
                     typename t_host::memory_space>::value;
    // NOTE(review): the next two compare an execution_space (resp.
    // memory_space) against t_host::device_type, i.e. a full
    // Kokkos::Device<Exec,Mem>; that comparison can never be true, and
    // device_is_t_dev_device tests against t_host (not t_dev).  Presumably
    // both were meant to compare Device itself against
    // t_host/t_dev::device_type -- confirm against upstream Kokkos.
    constexpr bool device_is_t_host_device =
        std::is_same<typename
Device::execution_space,
                     typename t_host::device_type>::value;
    constexpr bool device_is_t_dev_device =
        std::is_same<typename Device::memory_space,
                     typename t_host::device_type>::value;

    static_assert(
        device_is_t_dev_device || device_is_t_host_device ||
            (device_is_memspace &&
             (device_mem_is_t_dev_mem || device_mem_is_t_host_mem)) ||
            (device_is_execspace &&
             (device_exec_is_t_dev_exec || device_exec_is_t_host_exec)) ||
            ((!device_is_execspace && !device_is_memspace) &&
             ((device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ||
              (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))),
        "Template parameter to .view() must exactly match one of the "
        "DualView's device types or one of the execution or memory spaces");

    // Runtime selection is by memory-space equality only.
    return Impl::if_c<std::is_same<typename t_dev::memory_space,
                                   typename Device::memory_space>::value,
                      t_dev, t_host>::select(d_view, h_view);
  }

  //! Return the host View (no synchronization check).
  KOKKOS_INLINE_FUNCTION
  t_host view_host() const { return h_view; }

  //! Return the device View (no synchronization check).
  KOKKOS_INLINE_FUNCTION
  t_dev view_device() const { return d_view; }

  //! True only if both the device and the host View have allocations.
  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
    return (d_view.is_allocated() && h_view.is_allocated());
  }

  /// \brief Map the template parameter of sync()/modify()/need_sync() onto a
  /// side: 1 = device, 0 = host, -1 = ambiguous (both sides match).
  /// Mirrors the trait logic used by view() above.
  template <class Device>
  static int get_device_side() {
    constexpr bool device_is_memspace =
        std::is_same<Device, typename Device::memory_space>::value;
    constexpr bool device_is_execspace =
        std::is_same<Device, typename Device::execution_space>::value;
    constexpr bool device_exec_is_t_dev_exec =
        std::is_same<typename Device::execution_space,
                     typename t_dev::execution_space>::value;
    constexpr bool device_mem_is_t_dev_mem =
        std::is_same<typename Device::memory_space,
                     typename t_dev::memory_space>::value;
    constexpr bool device_exec_is_t_host_exec =
        std::is_same<typename Device::execution_space,
                     typename t_host::execution_space>::value;
    constexpr bool device_mem_is_t_host_mem =
        std::is_same<typename Device::memory_space,
                     typename t_host::memory_space>::value;
    constexpr bool
device_is_t_host_device =
        std::is_same<typename Device::execution_space,
                     typename t_host::device_type>::value;
    // NOTE(review): as in view(), these two device_type comparisons pit an
    // execution_space / memory_space against a Kokkos::Device<Exec,Mem> and
    // also test against t_host in both cases, so they can never be true; the
    // fast path "if (device_is_t_dev_device) dev = 1;" below is then dead
    // and resolution always falls through to the space-based branches.
    // Confirm intended form against upstream Kokkos before changing.
    constexpr bool device_is_t_dev_device =
        std::is_same<typename Device::memory_space,
                     typename t_host::device_type>::value;

    static_assert(
        device_is_t_dev_device || device_is_t_host_device ||
            (device_is_memspace &&
             (device_mem_is_t_dev_mem || device_mem_is_t_host_mem)) ||
            (device_is_execspace &&
             (device_exec_is_t_dev_exec || device_exec_is_t_host_exec)) ||
            ((!device_is_execspace && !device_is_memspace) &&
             ((device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ||
              (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))),
        "Template parameter to .sync() must exactly match one of the "
        "DualView's device types or one of the execution or memory spaces");

    // 1 = device side, 0 = host side, -1 = ambiguous (matches both).
    int dev = -1;
    if (device_is_t_dev_device)
      dev = 1;
    else if (device_is_t_host_device)
      dev = 0;
    else {
      if (device_is_memspace) {
        if (device_mem_is_t_dev_mem) dev = 1;
        if (device_mem_is_t_host_mem) dev = 0;
        if (device_mem_is_t_host_mem && device_mem_is_t_dev_mem) dev = -1;
      }
      if (device_is_execspace) {
        if (device_exec_is_t_dev_exec) dev = 1;
        if (device_exec_is_t_host_exec) dev = 0;
        if (device_exec_is_t_host_exec && device_exec_is_t_dev_exec) dev = -1;
      }
      if (!device_is_execspace && !device_is_memspace) {
        // Device is a full Device<Exec,Mem>: memory space takes precedence,
        // then execution space (later assignments overwrite earlier ones).
        if (device_mem_is_t_dev_mem) dev = 1;
        if (device_mem_is_t_host_mem) dev = 0;
        if (device_mem_is_t_host_mem && device_mem_is_t_dev_mem) dev = -1;
        if (device_exec_is_t_dev_exec) dev = 1;
        if (device_exec_is_t_host_exec) dev = 0;
        if (device_exec_is_t_host_exec && device_exec_is_t_dev_exec) dev = -1;
      }
    }
    return dev;
  }
  // Offset subtracted from data() when reporting to profiling tools, so the
  // reported pointer identifies the allocation header rather than the data.
  static constexpr const int view_header_size = 128;
  //! Notify Kokkos profiling tools (if a callback is installed) that the
  //! host copy was brought up to date (to_device == false).
  void impl_report_host_sync() const noexcept {
    if (Kokkos::Tools::Experimental::get_callbacks().sync_dual_view !=
        nullptr) {
      Kokkos::Tools::syncDualView(
          h_view.label(),
          reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(h_view.data()) -
                                  view_header_size),
          false);
    }
  }
  //! Notify Kokkos profiling tools (if a callback is installed) that the
  //! device copy was brought up to date (to_device == true).
  void impl_report_device_sync() const noexcept {
    if (Kokkos::Tools::Experimental::get_callbacks().sync_dual_view !=
        nullptr) {
      Kokkos::Tools::syncDualView(
          d_view.label(),
          reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(d_view.data()) -
                                  view_header_size),
          true);
    }
  }

  /// \brief Update data on device or host only if data in the other
  /// space has been marked as modified.
  ///
  /// If \c Device is the same as this DualView's device type, then
  /// copy data from host to device. Otherwise, copy data from device
  /// to host. In either case, only copy if the source of the copy
  /// has been modified.
  ///
  /// This is a one-way synchronization only. If the target of the
  /// copy has been modified, this operation will discard those
  /// modifications. It will also reset both device and host modified
  /// flags.
  ///
  /// \note This method doesn't know on its own whether you modified
  /// the data in either View. You must manually mark modified data
  /// as modified, by calling the modify() method with the
  /// appropriate template parameter.
  // deliberately passing args by cref as they're used multiple times
  template <class Device, class... Args>
  void sync_impl(std::true_type, Args const&...
args) {
    // Unallocated flags View means "never tracked"; nothing to do.
    if (modified_flags.data() == nullptr) return;

    int dev = get_device_side<Device>();

    if (dev == 1) {  // if Device is the same as DualView's device type
      if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
#ifdef KOKKOS_ENABLE_CUDA
        // UVM: device and host share the allocation, so instead of a copy,
        // ask CUDA to prefetch the pages toward the device.
        if (std::is_same<typename t_dev::memory_space,
                         Kokkos::CudaUVMSpace>::value) {
          if (d_view.data() == h_view.data())
            Kokkos::Impl::cuda_prefetch_pointer(
                Impl::get_cuda_space(args...), d_view.data(),
                sizeof(typename t_dev::value_type) * d_view.span(), true);
        }
#endif

        deep_copy(args..., d_view, h_view);
        modified_flags(0) = modified_flags(1) = 0;
        impl_report_device_sync();
      }
    }
    if (dev == 0) {  // hopefully Device is the same as DualView's host type
      if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
#ifdef KOKKOS_ENABLE_CUDA
        // UVM case: prefetch pages toward the host instead of copying.
        if (std::is_same<typename t_dev::memory_space,
                         Kokkos::CudaUVMSpace>::value) {
          if (d_view.data() == h_view.data())
            Kokkos::Impl::cuda_prefetch_pointer(
                Impl::get_cuda_space(args...), d_view.data(),
                sizeof(typename t_dev::value_type) * d_view.span(), false);
        }
#endif

        deep_copy(args..., h_view, d_view);
        modified_flags(0) = modified_flags(1) = 0;
        impl_report_host_sync();
      }
    }
    // Shared memory space: no copy was needed, but fence both execution
    // spaces -- presumably to preserve sync()'s barrier semantics in the
    // aliased case (confirm intent upstream).
    if (std::is_same<typename t_host::memory_space,
                     typename t_dev::memory_space>::value) {
      typename t_dev::execution_space().fence();
      typename t_host::execution_space().fence();
    }
  }

  /// sync() for non-const data types: performs the actual deep_copy.
  template <class Device>
  void sync(const typename std::enable_if<
            (std::is_same<typename traits::data_type,
                          typename traits::non_const_data_type>::value) ||
                (std::is_same<Device, int>::value),
            int>::type& = 0) {
    sync_impl<Device>(std::true_type{});
  }

  /// sync() overload taking an execution-space instance that is forwarded to
  /// deep_copy (asynchronous/stream-ordered copy where supported).
  template <class Device, class ExecutionSpace>
  void sync(const ExecutionSpace& exec,
            const typename std::enable_if<
                (std::is_same<typename traits::data_type,
                              typename traits::non_const_data_type>::value) ||
                    (std::is_same<Device, int>::value),
                int>::type& = 0) {
    sync_impl<Device>(std::true_type{}, exec);
  }

  // Const-data variant: a sync that would require a copy is an error,
  // because a DualView of const data cannot be written.
  // deliberately passing args by cref as they're used multiple times
  template <class Device, class... Args>
  void sync_impl(std::false_type, Args const&...) {
    if (modified_flags.data() == nullptr) return;

    int dev = get_device_side<Device>();

    if (dev == 1) {  // if Device is the same as DualView's device type
      if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
        Impl::throw_runtime_exception(
            "Calling sync on a DualView with a const datatype.");
      }
      impl_report_device_sync();
    }
    if (dev == 0) {  // hopefully Device is the same as DualView's host type
      if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
        Impl::throw_runtime_exception(
            "Calling sync on a DualView with a const datatype.");
      }
      impl_report_host_sync();
    }
  }

  /// sync() for const data types: only checks that no copy would be needed.
  template <class Device>
  void sync(const typename std::enable_if<
            (!std::is_same<typename traits::data_type,
                           typename traits::non_const_data_type>::value) ||
                (std::is_same<Device, int>::value),
            int>::type& = 0) {
    sync_impl<Device>(std::false_type{});
  }
  template <class Device, class ExecutionSpace>
  void sync(const ExecutionSpace& exec,
            const typename std::enable_if<
                (!std::is_same<typename traits::data_type,
                               typename traits::non_const_data_type>::value) ||
                    (std::is_same<Device, int>::value),
                int>::type& = 0) {
    sync_impl<Device>(std::false_type{}, exec);
  }

  // deliberately passing args by cref as they're used multiple times
  template <typename... Args>
  void sync_host_impl(Args const&...
args) {
    // Const data cannot be written, so a host sync (a copy into h_view)
    // is always an error for const DualViews.
    if (!std::is_same<typename traits::data_type,
                      typename traits::non_const_data_type>::value)
      Impl::throw_runtime_exception(
          "Calling sync_host on a DualView with a const datatype.");
    if (modified_flags.data() == nullptr) return;
    // Strict '>' here: equal counters mean host is already up to date.
    if (modified_flags(1) > modified_flags(0)) {
#ifdef KOKKOS_ENABLE_CUDA
      // UVM shares one allocation; prefetch pages toward the host instead.
      if (std::is_same<typename t_dev::memory_space,
                       Kokkos::CudaUVMSpace>::value) {
        if (d_view.data() == h_view.data())
          Kokkos::Impl::cuda_prefetch_pointer(
              Impl::get_cuda_space(args...), d_view.data(),
              sizeof(typename t_dev::value_type) * d_view.span(), false);
      }
#endif

      deep_copy(args..., h_view, d_view);
      modified_flags(1) = modified_flags(0) = 0;
      impl_report_host_sync();
    }
  }

  //! Bring the host copy up to date, issuing the copy on \c exec.
  template <class ExecSpace>
  void sync_host(const ExecSpace& exec) {
    sync_host_impl(exec);
  }
  //! Bring the host copy up to date (default execution space).
  void sync_host() { sync_host_impl(); }

  // deliberately passing args by cref as they're used multiple times
  template <typename... Args>
  void sync_device_impl(Args const&... args) {
    if (!std::is_same<typename traits::data_type,
                      typename traits::non_const_data_type>::value)
      Impl::throw_runtime_exception(
          "Calling sync_device on a DualView with a const datatype.");
    if (modified_flags.data() == nullptr) return;
    // Strict '>' here: equal counters mean device is already up to date.
    if (modified_flags(0) > modified_flags(1)) {
#ifdef KOKKOS_ENABLE_CUDA
      // UVM shares one allocation; prefetch pages toward the device instead.
      if (std::is_same<typename t_dev::memory_space,
                       Kokkos::CudaUVMSpace>::value) {
        if (d_view.data() == h_view.data())
          Kokkos::Impl::cuda_prefetch_pointer(
              Impl::get_cuda_space(args...), d_view.data(),
              sizeof(typename t_dev::value_type) * d_view.span(), true);
      }
#endif

      deep_copy(args..., d_view, h_view);
      modified_flags(1) = modified_flags(0) = 0;
      impl_report_device_sync();
    }
  }

  //! Bring the device copy up to date, issuing the copy on \c exec.
  template <class ExecSpace>
  void sync_device(const ExecSpace& exec) {
    sync_device_impl(exec);
  }
  //! Bring the device copy up to date (default execution space).
  void sync_device() { sync_device_impl(); }

  /// \brief Would sync<Device>() perform a copy toward the given side?
  /// Note the '>=' mirrors sync_impl's condition (ties count as needing
  /// sync here, unlike need_sync_host/need_sync_device below).
  template <class Device>
  bool need_sync() const {
    if (modified_flags.data() == nullptr) return false;
    int dev = get_device_side<Device>();

    if (dev == 1) {  // if Device is the same as DualView's device type
      if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
        return true;
      }
    }
    if (dev == 0) {  // hopefully Device is the same as DualView's host type
      if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
        return true;
      }
    }
    return false;
  }

  //! True if the device copy is strictly newer than the host copy.
  inline bool need_sync_host() const {
    if (modified_flags.data() == nullptr) return false;
    return modified_flags(0) < modified_flags(1);
  }

  //! True if the host copy is strictly newer than the device copy.
  inline bool need_sync_device() const {
    if (modified_flags.data() == nullptr) return false;
    return modified_flags(1) < modified_flags(0);
  }
  //! Notify profiling tools (if a callback is installed) that the device
  //! copy was marked modified.
  void impl_report_device_modification() {
    if (Kokkos::Tools::Experimental::get_callbacks().modify_dual_view !=
        nullptr) {
      Kokkos::Tools::modifyDualView(
          d_view.label(),
          reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(d_view.data()) -
                                  view_header_size),
          true);
    }
  }
  void
impl_report_host_modification() {
    // Notify profiling tools (if a callback is installed) that the host
    // copy was marked modified.
    if (Kokkos::Tools::Experimental::get_callbacks().modify_dual_view !=
        nullptr) {
      Kokkos::Tools::modifyDualView(
          h_view.label(),
          reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(h_view.data()) -
                                  view_header_size),
          false);
    }
  }
  /// \brief Mark data as modified on the given device \c Device.
  ///
  /// If \c Device is the same as this DualView's device type, then
  /// mark the device's data as modified. Otherwise, mark the host's
  /// data as modified.
  template <class Device>
  void modify() {
    if (modified_flags.data() == nullptr) return;
    // Single-device DualViews share storage, so there is nothing to track.
    if (impl_dualview_is_single_device::value) return;
    int dev = get_device_side<Device>();

    if (dev == 1) {  // if Device is the same as DualView's device type
      // Increment the device's modified count.
      // (set to max(host,device)+1 so the newer side always wins)
      modified_flags(1) =
          (modified_flags(1) > modified_flags(0) ? modified_flags(1)
                                                 : modified_flags(0)) +
          1;
      impl_report_device_modification();
    }
    if (dev == 0) {  // hopefully Device is the same as DualView's host type
      // Increment the host's modified count.
      modified_flags(0) =
          (modified_flags(1) > modified_flags(0) ? modified_flags(1)
                                                 : modified_flags(0)) +
          1;
      impl_report_host_modification();
    }

#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
    // Debug aid: both flags nonzero means both copies were written without
    // an intervening sync -- one side's changes would be lost.
    if (modified_flags(0) && modified_flags(1)) {
      std::string msg = "Kokkos::DualView::modify ERROR: ";
      msg += "Concurrent modification of host and device views ";
      msg += "in DualView \"";
      msg += d_view.label();
      msg += "\"\n";
      Kokkos::abort(msg.c_str());
    }
#endif
  }

  //! Mark the host copy as modified (no-op for single-device DualViews).
  inline void modify_host() {
    if (impl_dualview_is_single_device::value) return;
    if (modified_flags.data() != nullptr) {
      // Bump host counter past whichever side is currently newest.
      modified_flags(0) =
          (modified_flags(1) > modified_flags(0) ?
modified_flags(1)
                                                 : modified_flags(0)) +
          1;
      impl_report_host_modification();
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
      if (modified_flags(0) && modified_flags(1)) {
        std::string msg = "Kokkos::DualView::modify_host ERROR: ";
        msg += "Concurrent modification of host and device views ";
        msg += "in DualView \"";
        msg += d_view.label();
        msg += "\"\n";
        Kokkos::abort(msg.c_str());
      }
#endif
    }
  }

  //! Mark the device copy as modified (no-op for single-device DualViews).
  inline void modify_device() {
    if (impl_dualview_is_single_device::value) return;
    if (modified_flags.data() != nullptr) {
      // Bump device counter past whichever side is currently newest.
      modified_flags(1) =
          (modified_flags(1) > modified_flags(0) ? modified_flags(1)
                                                 : modified_flags(0)) +
          1;
      impl_report_device_modification();
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
      if (modified_flags(0) && modified_flags(1)) {
        std::string msg = "Kokkos::DualView::modify_device ERROR: ";
        msg += "Concurrent modification of host and device views ";
        msg += "in DualView \"";
        msg += d_view.label();
        msg += "\"\n";
        Kokkos::abort(msg.c_str());
      }
#endif
    }
  }

  //! Reset both modified counters: declare host and device synchronized.
  inline void clear_sync_state() {
    if (modified_flags.data() != nullptr)
      modified_flags(1) = modified_flags(0) = 0;
  }

  //@}
  //! \name Methods for reallocating or resizing the View objects.
  //@{

  /// \brief Reallocate both View objects.
  ///
  /// This discards any existing contents of the objects, and resets
  /// their modified flags. It does <i>not</i> copy the old contents
  /// of either View into the new View objects.
  void realloc(const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
    ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
    h_view = create_mirror_view(d_view);

    /* Reset dirty flags */
    if (modified_flags.data() == nullptr) {
      modified_flags = t_modified_flags("DualView::modified_flags");
    } else
      modified_flags(1) = modified_flags(0) = 0;
  }

  /// \brief Resize both views, copying old contents into new if necessary.
  ///
  /// This method only copies the old contents into the new View
  /// objects for the device which was last marked as modified.
  void resize(const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
              const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
              const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
              const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
              const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
              const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
              const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
              const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
    if (modified_flags.data() == nullptr) {
      modified_flags = t_modified_flags("DualView::modified_flags");
    }
    if (modified_flags(1) >= modified_flags(0)) {
      /* Resize on Device */
      // Device side is newest (or tied): resize d_view preserving its
      // contents, and rebuild the host mirror (old host data is dropped).
      ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
      h_view = create_mirror_view(d_view);

      /* Mark Device copy as modified */
      modified_flags(1) = modified_flags(1) + 1;

    } else {
      /* Realloc on Device */
      // Host side is newest: reallocate the device view (contents
      // discarded), then preserve the host data by copying it into a
      // freshly-shaped mirror.

      ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);

      const bool sizeMismatch =
          (h_view.extent(0) != n0) || (h_view.extent(1) != n1) ||
          (h_view.extent(2) != n2) || (h_view.extent(3) != n3) ||
          (h_view.extent(4) != n4) || (h_view.extent(5) != n5) ||
          (h_view.extent(6) != n6) || (h_view.extent(7) != n7);
      if (sizeMismatch)
        ::Kokkos::resize(h_view, n0, n1, n2, n3, n4, n5, n6, n7);

      t_host temp_view = create_mirror_view(d_view);

      /* Remap on Host */
      Kokkos::deep_copy(temp_view, h_view);

      h_view = temp_view;

      d_view = create_mirror_view(typename t_dev::execution_space(), h_view);

      /* Mark Host copy as modified */
      // Device data is left stale here; the bumped host counter makes a
      // later sync_device()/sync<Device>() perform the copy.
      modified_flags(0) = modified_flags(0) + 1;
    }
  }

  //@}
  //! \name Methods for getting capacity, stride, or dimension(s).
  //@{

  //! The allocation size (same as Kokkos::View::span).
  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return d_view.span(); }

  KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const {
    return d_view.span_is_contiguous();
  }

  //! Get stride(s) for each dimension.
  template <typename iType>
  void stride(iType* stride_) const {
    d_view.stride(stride_);
  }

  //! Extent of dimension r (queried from the device View).
  template <typename iType>
  KOKKOS_INLINE_FUNCTION constexpr
      typename std::enable_if<std::is_integral<iType>::value, size_t>::type
      extent(const iType& r) const {
    return d_view.extent(r);
  }

  //! Extent of dimension r as int (narrowing cast from size_t).
  template <typename iType>
  KOKKOS_INLINE_FUNCTION constexpr
      typename std::enable_if<std::is_integral<iType>::value, int>::type
      extent_int(const iType& r) const {
    return static_cast<int>(d_view.extent(r));
  }

  //@}
};

}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//
// Partial specializations of Kokkos::subview() for DualView objects.
//

namespace Kokkos {
namespace Impl {

// Computes the DualView type produced by taking a subview of
// DualView<D,A1,A2,A3> with the given subview arguments.
template <class D, class A1, class A2, class A3, class...
Args>
struct DualViewSubview {
  using dst_traits = typename Kokkos::Impl::ViewMapping<
      void, Kokkos::ViewTraits<D, A1, A2, A3>, Args...>::traits_type;

  using type = Kokkos::DualView<
      typename dst_traits::data_type, typename dst_traits::array_layout,
      typename dst_traits::device_type, typename dst_traits::memory_traits>;
};

} /* namespace Impl */

//! Take matching subviews of both the device and host Views (shares the
//! parent's modified flags -- see the DualView subview constructor).
template <class D, class A1, class A2, class A3, class... Args>
typename Impl::DualViewSubview<D, A1, A2, A3, Args...>::type subview(
    const DualView<D, A1, A2, A3>& src, Args... args) {
  return typename Impl::DualViewSubview<D, A1, A2, A3, Args...>::type(src,
                                                                      args...);
}

} /* namespace Kokkos */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {

//
// Partial specialization of Kokkos::deep_copy() for DualView objects.
//

// Copies only the up-to-date side of src into dst and marks the matching
// side of dst modified; dst is taken by value because only its (shared)
// views/flags are touched, not the handle itself.
template <class DT, class DL, class DD, class DM, class ST, class SL, class SD,
          class SM>
void deep_copy(
    DualView<DT, DL, DD, DM> dst,  // trust me, this must not be a reference
    const DualView<ST, SL, SD, SM>& src) {
  if (src.need_sync_device()) {
    // src's host side is newest: copy host-to-host.
    deep_copy(dst.h_view, src.h_view);
    dst.modify_host();
  } else {
    deep_copy(dst.d_view, src.d_view);
    dst.modify_device();
  }
}

// Same as above, issuing the copy on the provided execution-space instance.
template <class ExecutionSpace, class DT, class DL, class DD, class DM,
          class ST, class SL, class SD, class SM>
void deep_copy(
    const ExecutionSpace& exec,
    DualView<DT, DL, DD, DM> dst,  // trust me, this must not be a reference
    const DualView<ST, SL, SD, SM>& src) {
  if (src.need_sync_device()) {
    deep_copy(exec, dst.h_view, src.h_view);
    dst.modify_host();
  } else {
    deep_copy(exec, dst.d_view, src.d_view);
    dst.modify_device();
  }
}

}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos
{ + +// +// Non-member resize and realloc +// + +template <class... Properties, class... Args> +void resize(DualView<Properties...>& dv, Args&&... args) noexcept( + noexcept(dv.resize(std::forward<Args>(args)...))) { + dv.resize(std::forward<Args>(args)...); +} + +template <class... Properties, class... Args> +void realloc(DualView<Properties...>& dv, Args&&... args) noexcept( + noexcept(dv.realloc(std::forward<Args>(args)...))) { + dv.realloc(std::forward<Args>(args)...); +} + +} // end namespace Kokkos + +#endif diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c6323fef93694de1ee39d5784141bf6991f78bd7 --- /dev/null +++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -0,0 +1,2074 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_DynRankView.hpp +/// \brief Declaration and definition of Kokkos::DynRankView. +/// +/// This header file declares and defines Kokkos::DynRankView and its +/// related nonmember functions. + +#ifndef KOKKOS_DYNRANKVIEW_HPP +#define KOKKOS_DYNRANKVIEW_HPP + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Error.hpp> +#include <type_traits> + +namespace Kokkos { + +template <typename DataType, class... Properties> +class DynRankView; // forward declare + +namespace Impl { + +template <typename Specialize> +struct DynRankDimTraits { + enum : size_t { unspecified = KOKKOS_INVALID_INDEX }; + + // Compute the rank of the view from the nonzero dimension arguments. + KOKKOS_INLINE_FUNCTION + static size_t computeRank(const size_t N0, const size_t N1, const size_t N2, + const size_t N3, const size_t N4, const size_t N5, + const size_t N6, const size_t /* N7 */) { + return ( + (N6 == unspecified && N5 == unspecified && N4 == unspecified && + N3 == unspecified && N2 == unspecified && N1 == unspecified && + N0 == unspecified) + ? 
0 + : ((N6 == unspecified && N5 == unspecified && N4 == unspecified && + N3 == unspecified && N2 == unspecified && N1 == unspecified) + ? 1 + : ((N6 == unspecified && N5 == unspecified && + N4 == unspecified && N3 == unspecified && + N2 == unspecified) + ? 2 + : ((N6 == unspecified && N5 == unspecified && + N4 == unspecified && N3 == unspecified) + ? 3 + : ((N6 == unspecified && N5 == unspecified && + N4 == unspecified) + ? 4 + : ((N6 == unspecified && + N5 == unspecified) + ? 5 + : ((N6 == unspecified) + ? 6 + : 7))))))); + } + + // Compute the rank of the view from the nonzero layout arguments. + template <typename Layout> + KOKKOS_INLINE_FUNCTION static size_t computeRank(const Layout& layout) { + return computeRank(layout.dimension[0], layout.dimension[1], + layout.dimension[2], layout.dimension[3], + layout.dimension[4], layout.dimension[5], + layout.dimension[6], layout.dimension[7]); + } + + // Extra overload to match that for specialize types v2 + template <typename Layout, typename... P> + KOKKOS_INLINE_FUNCTION static size_t computeRank( + const Kokkos::Impl::ViewCtorProp<P...>& /* prop */, + const Layout& layout) { + return computeRank(layout); + } + + // Create the layout for the rank-7 view. + // Non-strided Layout + template <typename Layout> + KOKKOS_INLINE_FUNCTION static typename std::enable_if< + (std::is_same<Layout, Kokkos::LayoutRight>::value || + std::is_same<Layout, Kokkos::LayoutLeft>::value), + Layout>::type + createLayout(const Layout& layout) { + return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.dimension[7] != unspecified ? 
layout.dimension[7] : 1); + } + + // LayoutStride + template <typename Layout> + KOKKOS_INLINE_FUNCTION static typename std::enable_if< + (std::is_same<Layout, Kokkos::LayoutStride>::value), Layout>::type + createLayout(const Layout& layout) { + return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.stride[0], + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.stride[1], + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.stride[2], + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.stride[3], + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.stride[4], + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.stride[5], + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.stride[6], + layout.dimension[7] != unspecified ? layout.dimension[7] : 1, + layout.stride[7]); + } + + // Extra overload to match that for specialize types + template <typename Traits, typename... P> + KOKKOS_INLINE_FUNCTION static typename std::enable_if< + (std::is_same<typename Traits::array_layout, + Kokkos::LayoutRight>::value || + std::is_same<typename Traits::array_layout, Kokkos::LayoutLeft>::value || + std::is_same<typename Traits::array_layout, + Kokkos::LayoutStride>::value), + typename Traits::array_layout>::type + createLayout(const Kokkos::Impl::ViewCtorProp<P...>& /* prop */, + const typename Traits::array_layout& layout) { + return createLayout(layout); + } + + // Create a view from the given dimension arguments. + // This is only necessary because the shmem constructor doesn't take a layout. 
+ // NDE shmem View's are not compatible with the added view_alloc value_type + // / fad_dim deduction functionality + template <typename ViewType, typename ViewArg> + static ViewType createView(const ViewArg& arg, const size_t N0, + const size_t N1, const size_t N2, const size_t N3, + const size_t N4, const size_t N5, const size_t N6, + const size_t N7) { + return ViewType(arg, N0 != unspecified ? N0 : 1, N1 != unspecified ? N1 : 1, + N2 != unspecified ? N2 : 1, N3 != unspecified ? N3 : 1, + N4 != unspecified ? N4 : 1, N5 != unspecified ? N5 : 1, + N6 != unspecified ? N6 : 1, N7 != unspecified ? N7 : 1); + } +}; + +// Non-strided Layout +template <typename Layout, typename iType> +KOKKOS_INLINE_FUNCTION static + typename std::enable_if<(std::is_same<Layout, Kokkos::LayoutRight>::value || + std::is_same<Layout, Kokkos::LayoutLeft>::value) && + std::is_integral<iType>::value, + Layout>::type + reconstructLayout(const Layout& layout, iType dynrank) { + return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, + dynrank > 1 ? layout.dimension[1] : KOKKOS_INVALID_INDEX, + dynrank > 2 ? layout.dimension[2] : KOKKOS_INVALID_INDEX, + dynrank > 3 ? layout.dimension[3] : KOKKOS_INVALID_INDEX, + dynrank > 4 ? layout.dimension[4] : KOKKOS_INVALID_INDEX, + dynrank > 5 ? layout.dimension[5] : KOKKOS_INVALID_INDEX, + dynrank > 6 ? layout.dimension[6] : KOKKOS_INVALID_INDEX, + dynrank > 7 ? layout.dimension[7] : KOKKOS_INVALID_INDEX); +} + +// LayoutStride +template <typename Layout, typename iType> +KOKKOS_INLINE_FUNCTION static typename std::enable_if< + (std::is_same<Layout, Kokkos::LayoutStride>::value) && + std::is_integral<iType>::value, + Layout>::type +reconstructLayout(const Layout& layout, iType dynrank) { + return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, + dynrank > 0 ? layout.stride[0] : (0), + dynrank > 1 ? layout.dimension[1] : KOKKOS_INVALID_INDEX, + dynrank > 1 ? layout.stride[1] : (0), + dynrank > 2 ? 
layout.dimension[2] : KOKKOS_INVALID_INDEX, + dynrank > 2 ? layout.stride[2] : (0), + dynrank > 3 ? layout.dimension[3] : KOKKOS_INVALID_INDEX, + dynrank > 3 ? layout.stride[3] : (0), + dynrank > 4 ? layout.dimension[4] : KOKKOS_INVALID_INDEX, + dynrank > 4 ? layout.stride[4] : (0), + dynrank > 5 ? layout.dimension[5] : KOKKOS_INVALID_INDEX, + dynrank > 5 ? layout.stride[5] : (0), + dynrank > 6 ? layout.dimension[6] : KOKKOS_INVALID_INDEX, + dynrank > 6 ? layout.stride[6] : (0), + dynrank > 7 ? layout.dimension[7] : KOKKOS_INVALID_INDEX, + dynrank > 7 ? layout.stride[7] : (0)); +} + +/** \brief Debug bounds-checking routines */ +// Enhanced debug checking - most infrastructure matches that of functions in +// Kokkos_ViewMapping; additional checks for extra arguments beyond rank are 0 +template <unsigned, typename iType0, class MapType> +KOKKOS_INLINE_FUNCTION bool dyn_rank_view_verify_operator_bounds( + const iType0&, const MapType&) { + return true; +} + +template <unsigned R, typename iType0, class MapType, typename iType1, + class... Args> +KOKKOS_INLINE_FUNCTION bool dyn_rank_view_verify_operator_bounds( + const iType0& rank, const MapType& map, const iType1& i, Args... args) { + if (static_cast<iType0>(R) < rank) { + return (size_t(i) < map.extent(R)) && + dyn_rank_view_verify_operator_bounds<R + 1>(rank, map, args...); + } else if (i != 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "DynRankView Debug Bounds Checking Error: at rank %u\n Extra " + "arguments beyond the rank must be zero \n", + R); + return (false) && + dyn_rank_view_verify_operator_bounds<R + 1>(rank, map, args...); + } else { + return (true) && + dyn_rank_view_verify_operator_bounds<R + 1>(rank, map, args...); + } +} + +template <unsigned, class MapType> +inline void dyn_rank_view_error_operator_bounds(char*, int, const MapType&) {} + +template <unsigned R, class MapType, class iType, class... 
Args> +inline void dyn_rank_view_error_operator_bounds(char* buf, int len, + const MapType& map, + const iType& i, Args... args) { + const int n = snprintf( + buf, len, " %ld < %ld %c", static_cast<unsigned long>(i), + static_cast<unsigned long>(map.extent(R)), (sizeof...(Args) ? ',' : ')')); + dyn_rank_view_error_operator_bounds<R + 1>(buf + n, len - n, map, args...); +} + +// op_rank = rank of the operator version that was called +template <typename MemorySpace, typename iType0, typename iType1, class MapType, + class... Args> +KOKKOS_INLINE_FUNCTION void dyn_rank_view_verify_operator_bounds( + const iType0& op_rank, const iType1& rank, + const Kokkos::Impl::SharedAllocationTracker& tracker, const MapType& map, + Args... args) { + if (static_cast<iType0>(rank) > op_rank) { + Kokkos::abort( + "DynRankView Bounds Checking Error: Need at least rank arguments to " + "the operator()"); + } + + if (!dyn_rank_view_verify_operator_bounds<0>(rank, map, args...)) { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + enum { LEN = 1024 }; + char buffer[LEN]; + const std::string label = tracker.template get_label<MemorySpace>(); + int n = snprintf(buffer, LEN, "DynRankView bounds error of view %s (", + label.c_str()); + dyn_rank_view_error_operator_bounds<0>(buffer + n, LEN - n, map, args...); + Kokkos::Impl::throw_runtime_exception(std::string(buffer)); +#else + (void)tracker; + Kokkos::abort("DynRankView bounds error"); +#endif + } +} + +/** \brief Assign compatible default mappings */ +struct ViewToDynRankViewTag {}; + +} // namespace Impl + +namespace Impl { + +template <class DstTraits, class SrcTraits> +class ViewMapping< + DstTraits, SrcTraits, + typename std::enable_if< + (std::is_same<typename DstTraits::memory_space, + typename SrcTraits::memory_space>::value && + std::is_same<typename DstTraits::specialize, void>::value && + std::is_same<typename SrcTraits::specialize, void>::value && + (std::is_same<typename DstTraits::array_layout, + typename 
SrcTraits::array_layout>::value || + ((std::is_same<typename DstTraits::array_layout, + Kokkos::LayoutLeft>::value || + std::is_same<typename DstTraits::array_layout, + Kokkos::LayoutRight>::value || + std::is_same<typename DstTraits::array_layout, + Kokkos::LayoutStride>::value) && + (std::is_same<typename SrcTraits::array_layout, + Kokkos::LayoutLeft>::value || + std::is_same<typename SrcTraits::array_layout, + Kokkos::LayoutRight>::value || + std::is_same<typename SrcTraits::array_layout, + Kokkos::LayoutStride>::value)))), + Kokkos::Impl::ViewToDynRankViewTag>::type> { + private: + enum { + is_assignable_value_type = + std::is_same<typename DstTraits::value_type, + typename SrcTraits::value_type>::value || + std::is_same<typename DstTraits::value_type, + typename SrcTraits::const_value_type>::value + }; + + enum { + is_assignable_layout = + std::is_same<typename DstTraits::array_layout, + typename SrcTraits::array_layout>::value || + std::is_same<typename DstTraits::array_layout, + Kokkos::LayoutStride>::value + }; + + public: + enum { is_assignable = is_assignable_value_type && is_assignable_layout }; + + using DstType = ViewMapping<DstTraits, typename DstTraits::specialize>; + using SrcType = ViewMapping<SrcTraits, typename SrcTraits::specialize>; + + template <typename DT, typename... DP, typename ST, typename... SP> + KOKKOS_INLINE_FUNCTION static void assign( + Kokkos::DynRankView<DT, DP...>& dst, const Kokkos::View<ST, SP...>& src) { + static_assert( + is_assignable_value_type, + "View assignment must have same value type or const = non-const"); + + static_assert( + is_assignable_layout, + "View assignment must have compatible layout or have rank <= 1"); + + // Removed dimension checks... 
+ + using dst_offset_type = typename DstType::offset_type; + dst.m_map.m_impl_offset = dst_offset_type( + std::integral_constant<unsigned, 0>(), + src.layout()); // Check this for integer input1 for padding, etc + dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle<DstTraits>::assign( + src.m_map.m_impl_handle, src.m_track.m_tracker); + dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed); + dst.m_rank = src.Rank; + } +}; + +} // namespace Impl + +/* \class DynRankView + * \brief Container that creates a Kokkos view with rank determined at runtime. + * Essentially this is a rank 7 view + * + * Changes from View + * 1. The rank of the DynRankView is returned by the method rank() + * 2. Max rank of a DynRankView is 7 + * 3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward + * compatibility) + * 4. Every subview is returned with LayoutStride + * 5. Copy and Copy-Assign View to DynRankView + * 6. deep_copy between Views and DynRankViews + * 7. rank( view ); returns the rank of View or DynRankView + * + */ + +template <class> +struct is_dyn_rank_view : public std::false_type {}; + +template <class D, class... P> +struct is_dyn_rank_view<Kokkos::DynRankView<D, P...> > : public std::true_type { +}; + +template <typename DataType, class... 
Properties> +class DynRankView : public ViewTraits<DataType, Properties...> { + static_assert(!std::is_array<DataType>::value && + !std::is_pointer<DataType>::value, + "Cannot template DynRankView with array or pointer datatype - " + "must be pod"); + + private: + template <class, class...> + friend class DynRankView; + template <class, class...> + friend class Kokkos::Impl::ViewMapping; + + public: + using drvtraits = ViewTraits<DataType, Properties...>; + + using view_type = View<DataType*******, Properties...>; + + using traits = ViewTraits<DataType*******, Properties...>; + + private: + using map_type = + Kokkos::Impl::ViewMapping<traits, typename traits::specialize>; + using track_type = Kokkos::Impl::SharedAllocationTracker; + + track_type m_track; + map_type m_map; + unsigned m_rank; + + public: + KOKKOS_INLINE_FUNCTION + view_type& DownCast() const { return (view_type&)(*this); } + KOKKOS_INLINE_FUNCTION + const view_type& ConstDownCast() const { return (const view_type&)(*this); } + + // Types below - at least the HostMirror requires the value_type, NOT the rank + // 7 data_type of the traits + + /** \brief Compatible view of array of scalar types */ + using array_type = DynRankView< + typename drvtraits::scalar_array_type, typename drvtraits::array_layout, + typename drvtraits::device_type, typename drvtraits::memory_traits>; + + /** \brief Compatible view of const data type */ + using const_type = DynRankView< + typename drvtraits::const_data_type, typename drvtraits::array_layout, + typename drvtraits::device_type, typename drvtraits::memory_traits>; + + /** \brief Compatible view of non-const data type */ + using non_const_type = DynRankView< + typename drvtraits::non_const_data_type, typename drvtraits::array_layout, + typename drvtraits::device_type, typename drvtraits::memory_traits>; + + /** \brief Compatible HostMirror view */ + using HostMirror = DynRankView<typename drvtraits::non_const_data_type, + typename drvtraits::array_layout, + typename 
drvtraits::host_mirror_space>; + + //---------------------------------------- + // Domain rank and extents + + // enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the + // enum? + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<std::is_integral<iType>::value, size_t>::type + extent(const iType& r) const { + return m_map.extent(r); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<std::is_integral<iType>::value, int>::type + extent_int(const iType& r) const { + return static_cast<int>(m_map.extent(r)); + } + + KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() + const { + return m_map.layout(); + } + + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. + */ + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { + return m_map.extent(0) * m_map.extent(1) * m_map.extent(2) * + m_map.extent(3) * m_map.extent(4) * m_map.extent(5) * + m_map.extent(6) * m_map.extent(7); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_map.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_map.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_map.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_map.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_map.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_map.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_map.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_map.stride_7(); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + m_map.stride(s); + } + + //---------------------------------------- + // Range 
  // span is the span which contains all members.

  using reference_type = typename map_type::reference_type;
  using pointer_type   = typename map_type::pointer_type;

  enum {
    reference_type_is_lvalue_reference =
        std::is_lvalue_reference<reference_type>::value
  };

  // Number of elements spanned by the underlying rank-7 mapping.
  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const {
    return m_map.span_is_contiguous();
  }
  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const {
    return m_map.data();
  }
  // True when this view references an allocation (non-null data pointer).
  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
    return (m_map.data() != nullptr);
  }

  //----------------------------------------
  // Allow specializations to query their specialized map
  KOKKOS_INLINE_FUNCTION
  const Kokkos::Impl::ViewMapping<traits, typename traits::specialize>&
  impl_map() const {
    return m_map;
  }

  //----------------------------------------

 private:
  enum {
    is_layout_left =
        std::is_same<typename traits::array_layout, Kokkos::LayoutLeft>::value,

    is_layout_right =
        std::is_same<typename traits::array_layout, Kokkos::LayoutRight>::value,

    is_layout_stride = std::is_same<typename traits::array_layout,
                                    Kokkos::LayoutStride>::value,

    is_default_map = std::is_same<typename traits::specialize, void>::value &&
                     (is_layout_left || is_layout_right || is_layout_stride)
  };

// Bounds checking macros
#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)

// rank of the calling operator - included as first argument in ARG
#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)                          \
  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
                             typename traits::memory_space>::check();  \
  Kokkos::Impl::dyn_rank_view_verify_operator_bounds<                  \
      typename traits::memory_space>                                   \
      ARG;

#else

// Without debug bounds checking only the memory-space access check remains.
#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG)                          \
  Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \
                             typename traits::memory_space>::check();

#endif

 public:
  // Runtime rank of this view (0..7).
  KOKKOS_INLINE_FUNCTION
  constexpr unsigned rank() const { return m_rank; }

  // operators ()
  // Rank 0
  KOKKOS_INLINE_FUNCTION
  reference_type operator()() const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map))
    return impl_map().reference();
    // return m_map.reference(0,0,0,0,0,0,0);
  }

  // Rank 1
  // This assumes a contiguous underlying memory (i.e. no padding, no
  // striding...)
  // Enabled only for plain scalar value types (value_type ==
  // scalar_array_type) and integral index types; indexes the raw allocation
  // directly.
  template <typename iType>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      std::is_same<typename drvtraits::value_type,
                   typename drvtraits::scalar_array_type>::value &&
          std::is_integral<iType>::value,
      reference_type>::type
  operator[](const iType& i0) const {
    // Phalanx is violating this, since they use the operator to access ALL
    // elements in the allocation KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 ,
    // this->rank(), m_track, m_map) )
    return data()[i0];
  }

  // This assumes a contiguous underlying memory (i.e. no padding, no
  // striding...
  // AND a Trilinos/Sacado scalar type )
  template <typename iType>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !std::is_same<typename drvtraits::value_type,
                    typename drvtraits::scalar_array_type>::value &&
          std::is_integral<iType>::value,
      reference_type>::type
  operator[](const iType& i0) const {
    // auto map = impl_map();
    // Re-view the allocation as a rank-1 array of value_type
    // (span / scalars-per-value entries) and index into that.
    const size_t dim_scalar = m_map.dimension_scalar();
    const size_t bytes      = this->span() / dim_scalar;

    using tmp_view_type = Kokkos::View<
        DataType*, typename traits::array_layout, typename traits::device_type,
        Kokkos::MemoryTraits<traits::memory_traits::is_unmanaged |
                             traits::memory_traits::is_random_access |
                             traits::memory_traits::is_atomic> >;
    tmp_view_type rankone_view(this->data(), bytes, dim_scalar);
    return rankone_view(i0);
  }

  // Rank 1 parenthesis
  //
  // For each rank there are two overloads: the first (specialize == void)
  // passes exactly the given indices to the map; the second pads the missing
  // indices with zeros up to the rank-7 map.
  // NOTE(review): the negated overloads test drvtraits::specialize for ranks
  // 2-7 but traits::specialize for rank 1 — looks inconsistent; confirm which
  // trait is intended.
  template <typename iType>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType>::value),
      reference_type>::type
  operator()(const iType& i0) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0))
    return m_map.reference(i0);
  }

  template <typename iType>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename traits::specialize, void>::value &&
        std::is_integral<iType>::value),
      reference_type>::type
  operator()(const iType& i0) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0))
    return m_map.reference(i0, 0, 0, 0, 0, 0, 0);
  }

  // Rank 2
  template <typename iType0, typename iType1>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1))
    return m_map.reference(i0, i1);
  }

  template <typename iType0, typename iType1>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1))
    return m_map.reference(i0, i1, 0, 0, 0, 0, 0);
  }

  // Rank 3
  template <typename iType0, typename iType1, typename iType2>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1, const iType2& i2) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (3, this->rank(), m_track, m_map, i0, i1, i2))
    return m_map.reference(i0, i1, i2);
  }

  template <typename iType0, typename iType1, typename iType2>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1, const iType2& i2) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (3, this->rank(), m_track, m_map, i0, i1, i2))
    return m_map.reference(i0, i1, i2, 0, 0, 0, 0);
  }

  // Rank 4
  template <typename iType0, typename iType1, typename iType2, typename iType3>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value && std::is_integral<iType3>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
             const iType3& i3) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (4, this->rank(), m_track, m_map, i0, i1, i2, i3))
    return m_map.reference(i0, i1, i2, i3);
  }

  template <typename iType0, typename iType1, typename iType2, typename iType3>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
             const iType3& i3) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (4, this->rank(), m_track, m_map, i0, i1, i2, i3))
    return m_map.reference(i0, i1, i2, i3, 0, 0, 0);
  }

  // Rank 5
  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
       std::is_integral<iType4>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
             const iType3& i3, const iType4& i4) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4))
    return m_map.reference(i0, i1, i2, i3, i4);
  }

  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
             const iType3& i3, const iType4& i4) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4))
    return m_map.reference(i0, i1, i2, i3, i4, 0, 0);
  }

  // Rank 6
  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4, typename iType5>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
       std::is_integral<iType4>::value && std::is_integral<iType5>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
             const iType3& i3, const iType4& i4, const iType5& i5) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5))
    return m_map.reference(i0, i1, i2, i3, i4, i5);
  }

  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4, typename iType5>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
             const iType3& i3, const iType4& i4, const iType5& i5) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5))
    return m_map.reference(i0, i1, i2, i3, i4, i5, 0);
  }

  // Rank 7 : all seven indices are passed through, so a single overload
  // suffices.
  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4, typename iType5, typename iType6>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
       std::is_integral<iType4>::value && std::is_integral<iType5>::value &&
       std::is_integral<iType6>::value),
      reference_type>::type
  operator()(const iType0& i0, const iType1& i1, const iType2& i2,
             const iType3& i3, const iType4& i4, const iType5& i5,
             const iType6& i6) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6))
    return m_map.reference(i0, i1, i2, i3, i4, i5, i6);
  }

  // access(...) mirrors operator()(...) rank by rank.
  // Rank 0
  KOKKOS_INLINE_FUNCTION
  reference_type access() const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map))
    return impl_map().reference();
    // return m_map.reference(0,0,0,0,0,0,0);
  }

  // Rank 1
  // Rank 1 parenthesis
  template <typename iType>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType>::value),
      reference_type>::type
  access(const iType& i0) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0))
    return m_map.reference(i0);
  }

  template <typename iType>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename traits::specialize, void>::value &&
        std::is_integral<iType>::value),
      reference_type>::type
  access(const iType& i0) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0))
    return m_map.reference(i0, 0, 0, 0, 0, 0, 0);
  }

  // Rank 2
  template <typename iType0, typename iType1>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1))
    return m_map.reference(i0, i1);
  }

  template <typename iType0, typename iType1>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1))
    return m_map.reference(i0, i1, 0, 0, 0, 0, 0);
  }

  // Rank 3
  template <typename iType0, typename iType1, typename iType2>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1, const iType2& i2) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (3, this->rank(), m_track, m_map, i0, i1, i2))
    return m_map.reference(i0, i1, i2);
  }

  template <typename iType0, typename iType1, typename iType2>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1, const iType2& i2) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (3, this->rank(), m_track, m_map, i0, i1, i2))
    return m_map.reference(i0, i1, i2, 0, 0, 0, 0);
  }

  // Rank 4
  template <typename iType0, typename iType1, typename iType2, typename iType3>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value && std::is_integral<iType3>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1, const iType2& i2,
         const iType3& i3) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (4, this->rank(), m_track, m_map, i0, i1, i2, i3))
    return m_map.reference(i0, i1, i2, i3);
  }

  template <typename iType0, typename iType1, typename iType2, typename iType3>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1, const iType2& i2,
         const iType3& i3) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (4, this->rank(), m_track, m_map, i0, i1, i2, i3))
    return m_map.reference(i0, i1, i2, i3, 0, 0, 0);
  }

  // Rank 5
  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
       std::is_integral<iType4>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
         const iType4& i4) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4))
    return m_map.reference(i0, i1, i2, i3, i4);
  }

  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
         const iType4& i4) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4))
    return m_map.reference(i0, i1, i2, i3, i4, 0, 0);
  }

  // Rank 6
  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4, typename iType5>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_same<typename traits::specialize, void>::value &&
       std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
       std::is_integral<iType4>::value && std::is_integral<iType5>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
         const iType4& i4, const iType5& i5) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5))
    return m_map.reference(i0, i1, i2, i3, i4, i5);
  }

  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4, typename iType5>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      !(std::is_same<typename drvtraits::specialize, void>::value &&
        std::is_integral<iType0>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
         const iType4& i4, const iType5& i5) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5))
    return m_map.reference(i0, i1, i2, i3, i4, i5, 0);
  }

  // Rank 7
  template <typename iType0, typename iType1, typename iType2, typename iType3,
            typename iType4, typename iType5, typename iType6>
  KOKKOS_INLINE_FUNCTION typename std::enable_if<
      (std::is_integral<iType0>::value && std::is_integral<iType1>::value &&
       std::is_integral<iType2>::value && std::is_integral<iType3>::value &&
       std::is_integral<iType4>::value && std::is_integral<iType5>::value &&
       std::is_integral<iType6>::value),
      reference_type>::type
  access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3,
         const iType4& i4, const iType5& i5, const iType6& i6) const {
    KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(
        (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6))
    return m_map.reference(i0, i1, i2, i3, i4, i5, i6);
  }

#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY

  //----------------------------------------
  // Standard constructor, destructor, and assignment operators...
+ + KOKKOS_DEFAULTED_FUNCTION + ~DynRankView() = default; + + KOKKOS_INLINE_FUNCTION + DynRankView() : m_track(), m_map(), m_rank() {} // Default ctor + + KOKKOS_INLINE_FUNCTION + DynRankView(const DynRankView& rhs) + : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} + + KOKKOS_INLINE_FUNCTION + DynRankView(DynRankView&& rhs) + : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} + + KOKKOS_INLINE_FUNCTION + DynRankView& operator=(const DynRankView& rhs) { + m_track = rhs.m_track; + m_map = rhs.m_map; + m_rank = rhs.m_rank; + return *this; + } + + KOKKOS_INLINE_FUNCTION + DynRankView& operator=(DynRankView&& rhs) { + m_track = rhs.m_track; + m_map = rhs.m_map; + m_rank = rhs.m_rank; + return *this; + } + + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + template <class RT, class... RP> + KOKKOS_INLINE_FUNCTION DynRankView(const DynRankView<RT, RP...>& rhs) + : m_track(rhs.m_track, traits::is_managed), m_map(), m_rank(rhs.m_rank) { + using SrcTraits = typename DynRankView<RT, RP...>::traits; + using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, + typename traits::specialize>; + static_assert(Mapping::is_assignable, + "Incompatible DynRankView copy construction"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track); + } + + template <class RT, class... RP> + KOKKOS_INLINE_FUNCTION DynRankView& operator=( + const DynRankView<RT, RP...>& rhs) { + using SrcTraits = typename DynRankView<RT, RP...>::traits; + using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, + typename traits::specialize>; + static_assert(Mapping::is_assignable, + "Incompatible DynRankView copy construction"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track); + m_track.assign(rhs.m_track, traits::is_managed); + m_rank = rhs.rank(); + return *this; + } + + // Copy/Assign View to DynRankView + template <class RT, class... 
RP> + KOKKOS_INLINE_FUNCTION DynRankView(const View<RT, RP...>& rhs) + : m_track(), m_map(), m_rank(rhs.Rank) { + using SrcTraits = typename View<RT, RP...>::traits; + using Mapping = + Kokkos::Impl::ViewMapping<traits, SrcTraits, + Kokkos::Impl::ViewToDynRankViewTag>; + static_assert(Mapping::is_assignable, + "Incompatible View to DynRankView copy construction"); + Mapping::assign(*this, rhs); + } + + template <class RT, class... RP> + KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View<RT, RP...>& rhs) { + using SrcTraits = typename View<RT, RP...>::traits; + using Mapping = + Kokkos::Impl::ViewMapping<traits, SrcTraits, + Kokkos::Impl::ViewToDynRankViewTag>; + static_assert(Mapping::is_assignable, + "Incompatible View to DynRankView copy assignment"); + Mapping::assign(*this, rhs); + return *this; + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const { return m_track.use_count(); } + + inline const std::string label() const { + return m_track.template get_label<typename traits::memory_space>(); + } + + //---------------------------------------- + // Allocation according to allocation properties and array layout + // unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that + // rank deduction can properly take place + template <class... P> + explicit inline DynRankView( + const Kokkos::Impl::ViewCtorProp<P...>& arg_prop, + typename std::enable_if<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer, + typename traits::array_layout>::type const& + arg_layout) + : m_track(), + m_map(), + m_rank(Impl::DynRankDimTraits<typename traits::specialize>:: + template computeRank<typename traits::array_layout, P...>( + arg_prop, arg_layout)) { + // Append layout and spaces if not input + using alloc_prop_input = Kokkos::Impl::ViewCtorProp<P...>; + + // use 'std::integral_constant<unsigned,I>' for non-types + // to avoid duplicate class error. 
+ + using alloc_prop = Kokkos::Impl::ViewCtorProp< + P..., + typename std::conditional<alloc_prop_input::has_label, + std::integral_constant<unsigned, 0>, + typename std::string>::type, + typename std::conditional< + alloc_prop_input::has_memory_space, + std::integral_constant<unsigned, 1>, + typename traits::device_type::memory_space>::type, + typename std::conditional< + alloc_prop_input::has_execution_space, + std::integral_constant<unsigned, 2>, + typename traits::device_type::execution_space>::type>; + + static_assert(traits::is_managed, + "View allocation constructor requires managed memory"); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. + Kokkos::Impl::throw_runtime_exception( + "Constructing DynRankView and initializing data with uninitialized " + "execution space"); + } + + // Copy the input allocation properties with possibly defaulted properties + alloc_prop prop_copy(arg_prop); + +//------------------------------------------------------------ +#if defined(KOKKOS_ENABLE_CUDA) + // If allocating in CudaUVMSpace must fence before and after + // the allocation to protect against possible concurrent access + // on the CPU and the GPU. + // Fence using the trait's execution space (which will be Kokkos::Cuda) + // to avoid incomplete type errors from using Kokkos::Cuda directly. 
+ if (std::is_same<Kokkos::CudaUVMSpace, + typename traits::device_type::memory_space>::value) { + typename traits::device_type::memory_space::execution_space().fence(); + } +#endif + //------------------------------------------------------------ + + Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( + prop_copy, + Impl::DynRankDimTraits<typename traits::specialize>:: + template createLayout<traits, P...>(arg_prop, arg_layout)); + +//------------------------------------------------------------ +#if defined(KOKKOS_ENABLE_CUDA) + if (std::is_same<Kokkos::CudaUVMSpace, + typename traits::device_type::memory_space>::value) { + typename traits::device_type::memory_space::execution_space().fence(); + } +#endif + //------------------------------------------------------------ + + // Setup and initialization complete, start tracking + m_track.assign_allocated_record_to_uninitialized(record); + } + + // Wrappers + template <class... P> + explicit KOKKOS_INLINE_FUNCTION DynRankView( + const Kokkos::Impl::ViewCtorProp<P...>& arg_prop, + typename std::enable_if<Kokkos::Impl::ViewCtorProp<P...>::has_pointer, + typename traits::array_layout>::type const& + arg_layout) + : m_track() // No memory tracking + , + m_map(arg_prop, + Impl::DynRankDimTraits<typename traits::specialize>:: + template createLayout<traits, P...>(arg_prop, arg_layout)), + m_rank(Impl::DynRankDimTraits<typename traits::specialize>:: + template computeRank<typename traits::array_layout, P...>( + arg_prop, arg_layout)) { + static_assert( + std::is_same<pointer_type, + typename Impl::ViewCtorProp<P...>::pointer_type>::value, + "Constructing DynRankView to wrap user memory must supply matching " + "pointer type"); + } + + //---------------------------------------- + // Constructor(s) + + // Simple dimension-only layout + template <class... 
P> + explicit inline DynRankView( + const Kokkos::Impl::ViewCtorProp<P...>& arg_prop, + typename std::enable_if<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer, + size_t>::type const arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) + : DynRankView(arg_prop, typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, + arg_N5, arg_N6, arg_N7)) {} + + template <class... P> + explicit KOKKOS_INLINE_FUNCTION DynRankView( + const Kokkos::Impl::ViewCtorProp<P...>& arg_prop, + typename std::enable_if<Kokkos::Impl::ViewCtorProp<P...>::has_pointer, + size_t>::type const arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) + : DynRankView(arg_prop, typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, + arg_N5, arg_N6, arg_N7)) {} + + // Allocate with label and layout + template <typename Label> + explicit inline DynRankView( + const Label& arg_label, + typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value, + typename traits::array_layout>::type const& + arg_layout) + : DynRankView(Kokkos::Impl::ViewCtorProp<std::string>(arg_label), + arg_layout) {} + + // Allocate label and layout, must disambiguate from subview constructor + template <typename Label> + explicit inline DynRankView( + const Label& arg_label, + typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value, + const size_t>::type arg_N0 = KOKKOS_INVALID_INDEX, + const 
size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) + : DynRankView( + Kokkos::Impl::ViewCtorProp<std::string>(arg_label), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) {} + + //---------------------------------------- + // Memory span required to wrap these dimensions. + static constexpr size_t required_allocation_size( + const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, + const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, + const size_t arg_N6 = 0, const size_t arg_N7 = 0) { + return map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + explicit KOKKOS_INLINE_FUNCTION DynRankView( + pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) + : DynRankView(Kokkos::Impl::ViewCtorProp<pointer_type>(arg_ptr), arg_N0, + arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} + + explicit KOKKOS_INLINE_FUNCTION DynRankView( + pointer_type arg_ptr, typename traits::array_layout& arg_layout) + : DynRankView(Kokkos::Impl::ViewCtorProp<pointer_type>(arg_ptr), + arg_layout) {} + + //---------------------------------------- + // Shared scratch memory constructor + + static inline size_t shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = 
KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + const size_t num_passed_args = + (arg_N0 != KOKKOS_INVALID_INDEX) + (arg_N1 != KOKKOS_INVALID_INDEX) + + (arg_N2 != KOKKOS_INVALID_INDEX) + (arg_N3 != KOKKOS_INVALID_INDEX) + + (arg_N4 != KOKKOS_INVALID_INDEX) + (arg_N5 != KOKKOS_INVALID_INDEX) + + (arg_N6 != KOKKOS_INVALID_INDEX) + (arg_N7 != KOKKOS_INVALID_INDEX); + + if (std::is_same<typename traits::specialize, void>::value && + num_passed_args != traits::rank_dynamic) { + Kokkos::abort( + "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); + } + {} + + return map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + explicit KOKKOS_INLINE_FUNCTION DynRankView( + const typename traits::execution_space::scratch_memory_space& arg_space, + const typename traits::array_layout& arg_layout) + : DynRankView( + Kokkos::Impl::ViewCtorProp<pointer_type>( + reinterpret_cast<pointer_type>( + arg_space.get_shmem(map_type::memory_span( + Impl::DynRankDimTraits<typename traits::specialize>:: + createLayout(arg_layout) // is this correct? 
+ )))), + arg_layout) {} + + explicit KOKKOS_INLINE_FUNCTION DynRankView( + const typename traits::execution_space::scratch_memory_space& arg_space, + const size_t arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) + + : DynRankView( + Kokkos::Impl::ViewCtorProp<pointer_type>( + reinterpret_cast<pointer_type>( + arg_space.get_shmem(map_type::memory_span( + Impl::DynRankDimTraits<typename traits::specialize>:: + createLayout(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, + arg_N6, arg_N7)))))), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) {} +}; + +template <typename D, class... P> +KOKKOS_INLINE_FUNCTION constexpr unsigned rank( + const DynRankView<D, P...>& DRV) { + return DRV.rank(); +} // needed for transition to common constexpr method in view and dynrankview + // to return rank + +//---------------------------------------------------------------------------- +// Subview mapping. +// Deduce destination view type from source view traits and subview arguments + +namespace Impl { + +struct DynRankSubviewTag {}; + +} // namespace Impl + +namespace Impl { + +template <class SrcTraits, class... 
Args> +class ViewMapping< + typename std::enable_if< + (std::is_same<typename SrcTraits::specialize, void>::value && + (std::is_same<typename SrcTraits::array_layout, + Kokkos::LayoutLeft>::value || + std::is_same<typename SrcTraits::array_layout, + Kokkos::LayoutRight>::value || + std::is_same<typename SrcTraits::array_layout, + Kokkos::LayoutStride>::value)), + Kokkos::Impl::DynRankSubviewTag>::type, + SrcTraits, Args...> { + private: + enum { + RZ = false, + R0 = bool(is_integral_extent<0, Args...>::value), + R1 = bool(is_integral_extent<1, Args...>::value), + R2 = bool(is_integral_extent<2, Args...>::value), + R3 = bool(is_integral_extent<3, Args...>::value), + R4 = bool(is_integral_extent<4, Args...>::value), + R5 = bool(is_integral_extent<5, Args...>::value), + R6 = bool(is_integral_extent<6, Args...>::value) + }; + + enum { + rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + + unsigned(R4) + unsigned(R5) + unsigned(R6) + }; + + using array_layout = Kokkos::LayoutStride; + + using value_type = typename SrcTraits::value_type; + + using data_type = value_type*******; + + public: + using traits_type = Kokkos::ViewTraits<data_type, array_layout, + typename SrcTraits::device_type, + typename SrcTraits::memory_traits>; + + using type = + Kokkos::View<data_type, array_layout, typename SrcTraits::device_type, + typename SrcTraits::memory_traits>; + + template <class MemoryTraits> + struct apply { + static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, ""); + + using traits_type = + Kokkos::ViewTraits<data_type, array_layout, + typename SrcTraits::device_type, MemoryTraits>; + + using type = Kokkos::View<data_type, array_layout, + typename SrcTraits::device_type, MemoryTraits>; + }; + + using dimension = typename SrcTraits::dimension; + + template <class Arg0 = int, class Arg1 = int, class Arg2 = int, + class Arg3 = int, class Arg4 = int, class Arg5 = int, + class Arg6 = int> + struct ExtentGenerator { + KOKKOS_INLINE_FUNCTION + static 
SubviewExtents<7, rank> generator( + const dimension& dim, Arg0 arg0 = Arg0(), Arg1 arg1 = Arg1(), + Arg2 arg2 = Arg2(), Arg3 arg3 = Arg3(), Arg4 arg4 = Arg4(), + Arg5 arg5 = Arg5(), Arg6 arg6 = Arg6()) { + return SubviewExtents<7, rank>(dim, arg0, arg1, arg2, arg3, arg4, arg5, + arg6); + } + }; + + using ret_type = Kokkos::DynRankView<value_type, array_layout, + typename SrcTraits::device_type, + typename SrcTraits::memory_traits>; + + template <typename T, class... P> + KOKKOS_INLINE_FUNCTION static ret_type subview( + const unsigned src_rank, Kokkos::DynRankView<T, P...> const& src, + Args... args) { + using DstType = ViewMapping<traits_type, typename traits_type::specialize>; + + using DstDimType = typename std::conditional< + (rank == 0), ViewDimension<>, + typename std::conditional< + (rank == 1), ViewDimension<0>, + typename std::conditional< + (rank == 2), ViewDimension<0, 0>, + typename std::conditional< + (rank == 3), ViewDimension<0, 0, 0>, + typename std::conditional< + (rank == 4), ViewDimension<0, 0, 0, 0>, + typename std::conditional< + (rank == 5), ViewDimension<0, 0, 0, 0, 0>, + typename std::conditional< + (rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>, + ViewDimension<0, 0, 0, 0, 0, 0, 0> >::type>:: + type>::type>::type>::type>::type>::type; + + using dst_offset_type = ViewOffset<DstDimType, Kokkos::LayoutStride>; + using dst_handle_type = typename DstType::handle_type; + + ret_type dst; + + const SubviewExtents<7, rank> extents = ExtentGenerator<Args...>::generator( + src.m_map.m_impl_offset.m_dim, args...); + + dst_offset_type tempdst(src.m_map.m_impl_offset, extents); + + dst.m_track = src.m_track; + + dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0; + dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1; + dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2; + dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3; + dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4; + dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5; + 
dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6; + + dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0; + dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1; + dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2; + dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3; + dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4; + dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5; + dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6; + + dst.m_map.m_impl_handle = + dst_handle_type(src.m_map.m_impl_handle + + src.m_map.m_impl_offset( + extents.domain_offset(0), extents.domain_offset(1), + extents.domain_offset(2), extents.domain_offset(3), + extents.domain_offset(4), extents.domain_offset(5), + extents.domain_offset(6))); + + dst.m_rank = + (src_rank > 0 ? unsigned(R0) : 0) + (src_rank > 1 ? unsigned(R1) : 0) + + (src_rank > 2 ? unsigned(R2) : 0) + (src_rank > 3 ? unsigned(R3) : 0) + + (src_rank > 4 ? unsigned(R4) : 0) + (src_rank > 5 ? unsigned(R5) : 0) + + (src_rank > 6 ? unsigned(R6) : 0); + + return dst; + } +}; + +} // namespace Impl + +template <class V, class... Args> +using Subdynrankview = + typename Kokkos::Impl::ViewMapping<Kokkos::Impl::DynRankSubviewTag, V, + Args...>::ret_type; + +template <class D, class... P, class... Args> +KOKKOS_INLINE_FUNCTION Subdynrankview<ViewTraits<D*******, P...>, Args...> +subdynrankview(const Kokkos::DynRankView<D, P...>& src, Args... args) { + if (src.rank() > sizeof...(Args)) // allow sizeof...(Args) >= src.rank(), + // ignore the remaining args + { + Kokkos::abort( + "subdynrankview: num of args must be >= rank of the source " + "DynRankView"); + } + + using metafcn = + Kokkos::Impl::ViewMapping<Kokkos::Impl::DynRankSubviewTag, + Kokkos::ViewTraits<D*******, P...>, Args...>; + + return metafcn::subview(src.rank(), src, args...); +} + +// Wrapper to allow subview function name +template <class D, class... P, class... 
Args> +KOKKOS_INLINE_FUNCTION Subdynrankview<ViewTraits<D*******, P...>, Args...> +subview(const Kokkos::DynRankView<D, P...>& src, Args... args) { + return subdynrankview(src, args...); +} + +} // namespace Kokkos + +namespace Kokkos { + +// overload == and != +template <class LT, class... LP, class RT, class... RP> +KOKKOS_INLINE_FUNCTION bool operator==(const DynRankView<LT, LP...>& lhs, + const DynRankView<RT, RP...>& rhs) { + // Same data, layout, dimensions + using lhs_traits = ViewTraits<LT, LP...>; + using rhs_traits = ViewTraits<RT, RP...>; + + return std::is_same<typename lhs_traits::const_value_type, + typename rhs_traits::const_value_type>::value && + std::is_same<typename lhs_traits::array_layout, + typename rhs_traits::array_layout>::value && + std::is_same<typename lhs_traits::memory_space, + typename rhs_traits::memory_space>::value && + lhs.rank() == rhs.rank() && lhs.data() == rhs.data() && + lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && + lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && + lhs.extent(3) == rhs.extent(3) && lhs.extent(4) == rhs.extent(4) && + lhs.extent(5) == rhs.extent(5) && lhs.extent(6) == rhs.extent(6) && + lhs.extent(7) == rhs.extent(7); +} + +template <class LT, class... LP, class RT, class... 
RP> +KOKKOS_INLINE_FUNCTION bool operator!=(const DynRankView<LT, LP...>& lhs, + const DynRankView<RT, RP...>& rhs) { + return !(operator==(lhs, rhs)); +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +namespace Kokkos { +namespace Impl { + +template <class OutputView, typename Enable = void> +struct DynRankViewFill { + using const_value_type = typename OutputView::traits::const_value_type; + + const OutputView output; + const_value_type input; + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i0) const { + const size_t n1 = output.extent(1); + const size_t n2 = output.extent(2); + const size_t n3 = output.extent(3); + const size_t n4 = output.extent(4); + const size_t n5 = output.extent(5); + const size_t n6 = output.extent(6); + + for (size_t i1 = 0; i1 < n1; ++i1) { + for (size_t i2 = 0; i2 < n2; ++i2) { + for (size_t i3 = 0; i3 < n3; ++i3) { + for (size_t i4 = 0; i4 < n4; ++i4) { + for (size_t i5 = 0; i5 < n5; ++i5) { + for (size_t i6 = 0; i6 < n6; ++i6) { + output.access(i0, i1, i2, i3, i4, i5, i6) = input; + } + } + } + } + } + } + } + + DynRankViewFill(const OutputView& arg_out, const_value_type& arg_in) + : output(arg_out), input(arg_in) { + using execution_space = typename OutputView::execution_space; + using Policy = Kokkos::RangePolicy<execution_space>; + + Kokkos::parallel_for("Kokkos::DynRankViewFill", Policy(0, output.extent(0)), + *this); + } +}; + +template <class OutputView> +struct DynRankViewFill<OutputView, + typename std::enable_if<OutputView::Rank == 0>::type> { + DynRankViewFill(const OutputView& dst, + const typename OutputView::const_value_type& src) { + Kokkos::Impl::DeepCopy<typename OutputView::memory_space, + Kokkos::HostSpace>( + dst.data(), &src, sizeof(typename OutputView::const_value_type)); + } +}; + +template <class OutputView, class InputView, + class ExecSpace = typename 
OutputView::execution_space> +struct DynRankViewRemap { + const OutputView output; + const InputView input; + const size_t n0; + const size_t n1; + const size_t n2; + const size_t n3; + const size_t n4; + const size_t n5; + const size_t n6; + const size_t n7; + + DynRankViewRemap(const OutputView& arg_out, const InputView& arg_in) + : output(arg_out), + input(arg_in), + n0(std::min((size_t)arg_out.extent(0), (size_t)arg_in.extent(0))), + n1(std::min((size_t)arg_out.extent(1), (size_t)arg_in.extent(1))), + n2(std::min((size_t)arg_out.extent(2), (size_t)arg_in.extent(2))), + n3(std::min((size_t)arg_out.extent(3), (size_t)arg_in.extent(3))), + n4(std::min((size_t)arg_out.extent(4), (size_t)arg_in.extent(4))), + n5(std::min((size_t)arg_out.extent(5), (size_t)arg_in.extent(5))), + n6(std::min((size_t)arg_out.extent(6), (size_t)arg_in.extent(6))), + n7(std::min((size_t)arg_out.extent(7), (size_t)arg_in.extent(7))) { + using Policy = Kokkos::RangePolicy<ExecSpace>; + + Kokkos::parallel_for("Kokkos::DynRankViewRemap", Policy(0, n0), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i0) const { + for (size_t i1 = 0; i1 < n1; ++i1) { + for (size_t i2 = 0; i2 < n2; ++i2) { + for (size_t i3 = 0; i3 < n3; ++i3) { + for (size_t i4 = 0; i4 < n4; ++i4) { + for (size_t i5 = 0; i5 < n5; ++i5) { + for (size_t i6 = 0; i6 < n6; ++i6) { + output.access(i0, i1, i2, i3, i4, i5, i6) = + input.access(i0, i1, i2, i3, i4, i5, i6); + } + } + } + } + } + } + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +namespace Kokkos { + +/** \brief Deep copy a value from Host memory into a view. */ +template <class DT, class... 
DP> +inline void deep_copy( + const DynRankView<DT, DP...>& dst, + typename ViewTraits<DT, DP...>::const_value_type& value, + typename std::enable_if<std::is_same< + typename ViewTraits<DT, DP...>::specialize, void>::value>::type* = + nullptr) { + static_assert( + std::is_same<typename ViewTraits<DT, DP...>::non_const_value_type, + typename ViewTraits<DT, DP...>::value_type>::value, + "deep_copy requires non-const type"); + + Kokkos::fence(); + Kokkos::Impl::DynRankViewFill<DynRankView<DT, DP...> >(dst, value); + Kokkos::fence(); +} + +/** \brief Deep copy into a value in Host memory from a view. */ +template <class ST, class... SP> +inline void deep_copy( + typename ViewTraits<ST, SP...>::non_const_value_type& dst, + const DynRankView<ST, SP...>& src, + typename std::enable_if<std::is_same< + typename ViewTraits<ST, SP...>::specialize, void>::value>::type* = 0) { + if (src.rank() != 0) { + Kokkos::abort(""); + } + + using src_traits = ViewTraits<ST, SP...>; + using src_memory_space = typename src_traits::memory_space; + Kokkos::fence(); + Kokkos::Impl::DeepCopy<HostSpace, src_memory_space>(&dst, src.data(), + sizeof(ST)); + Kokkos::fence(); +} + +//---------------------------------------------------------------------------- +/** \brief A deep copy between views of the default specialization, compatible + * type, same rank, same contiguous layout. 
+ */ +template <class DstType, class SrcType> +inline void deep_copy( + const DstType& dst, const SrcType& src, + typename std::enable_if< + (std::is_same<typename DstType::traits::specialize, void>::value && + std::is_same<typename SrcType::traits::specialize, void>::value && + (Kokkos::is_dyn_rank_view<DstType>::value || + Kokkos::is_dyn_rank_view<SrcType>::value))>::type* = nullptr) { + static_assert( + std::is_same<typename DstType::traits::value_type, + typename DstType::traits::non_const_value_type>::value, + "deep_copy requires non-const destination type"); + + using dst_type = DstType; + using src_type = SrcType; + + using dst_execution_space = typename dst_type::execution_space; + using src_execution_space = typename src_type::execution_space; + using dst_memory_space = typename dst_type::memory_space; + using src_memory_space = typename src_type::memory_space; + + enum { + DstExecCanAccessSrc = + Kokkos::Impl::SpaceAccessibility<dst_execution_space, + src_memory_space>::accessible + }; + + enum { + SrcExecCanAccessDst = + Kokkos::Impl::SpaceAccessibility<src_execution_space, + dst_memory_space>::accessible + }; + + if ((void*)dst.data() != (void*)src.data()) { + // Concern: If overlapping views then a parallel copy will be erroneous. + // ... 
+ + // If same type, equal layout, equal dimensions, equal span, and contiguous + // memory then can byte-wise copy + if (rank(src) == 0 && rank(dst) == 0) { + using value_type = typename dst_type::value_type; + Kokkos::fence(); + Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>( + dst.data(), src.data(), sizeof(value_type)); + Kokkos::fence(); + } else if (std::is_same< + typename DstType::traits::value_type, + typename SrcType::traits::non_const_value_type>::value && + ((std::is_same<typename DstType::traits::array_layout, + typename SrcType::traits::array_layout>::value && + (std::is_same<typename DstType::traits::array_layout, + typename Kokkos::LayoutLeft>::value || + std::is_same<typename DstType::traits::array_layout, + typename Kokkos::LayoutRight>::value)) || + (rank(dst) == 1 && rank(src) == 1)) && + dst.span_is_contiguous() && src.span_is_contiguous() && + dst.span() == src.span() && dst.extent(0) == src.extent(0) && + + dst.extent(1) == src.extent(1) && + dst.extent(2) == src.extent(2) && + dst.extent(3) == src.extent(3) && + dst.extent(4) == src.extent(4) && + dst.extent(5) == src.extent(5) && + dst.extent(6) == src.extent(6) && + dst.extent(7) == src.extent(7)) { + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + Kokkos::fence(); + Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>( + dst.data(), src.data(), nbytes); + Kokkos::fence(); + } else if (std::is_same< + typename DstType::traits::value_type, + typename SrcType::traits::non_const_value_type>::value && + ((std::is_same<typename DstType::traits::array_layout, + typename SrcType::traits::array_layout>::value && + std::is_same<typename DstType::traits::array_layout, + typename Kokkos::LayoutStride>::value) || + (rank(dst) == 1 && rank(src) == 1)) && + dst.span_is_contiguous() && src.span_is_contiguous() && + dst.span() == src.span() && dst.extent(0) == src.extent(0) && + dst.extent(1) == src.extent(1) && + dst.extent(2) == src.extent(2) && + 
dst.extent(3) == src.extent(3) && + dst.extent(4) == src.extent(4) && + dst.extent(5) == src.extent(5) && + dst.extent(6) == src.extent(6) && + dst.extent(7) == src.extent(7) && + dst.stride_0() == src.stride_0() && + dst.stride_1() == src.stride_1() && + dst.stride_2() == src.stride_2() && + dst.stride_3() == src.stride_3() && + dst.stride_4() == src.stride_4() && + dst.stride_5() == src.stride_5() && + dst.stride_6() == src.stride_6() && + dst.stride_7() == src.stride_7()) { + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + Kokkos::fence(); + Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>( + dst.data(), src.data(), nbytes); + Kokkos::fence(); + } else if (DstExecCanAccessSrc) { + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::fence(); + Kokkos::Impl::DynRankViewRemap<dst_type, src_type>(dst, src); + Kokkos::fence(); + } else if (SrcExecCanAccessDst) { + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::fence(); + Kokkos::Impl::DynRankViewRemap<dst_type, src_type, src_execution_space>( + dst, src); + Kokkos::fence(); + } else { + Kokkos::Impl::throw_runtime_exception( + "deep_copy given views that would require a temporary allocation"); + } + } else { + Kokkos::fence(); + } +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// Deduce Mirror Types +template <class Space, class T, class... 
P> +struct MirrorDRViewType { + // The incoming view_type + using src_view_type = typename Kokkos::DynRankView<T, P...>; + // The memory space for the mirror view + using memory_space = typename Space::memory_space; + // Check whether it is the same memory space + enum { + is_same_memspace = + std::is_same<memory_space, typename src_view_type::memory_space>::value + }; + // The array_layout + using array_layout = typename src_view_type::array_layout; + // The data type (we probably want it non-const since otherwise we can't even + // deep_copy to it.) + using data_type = typename src_view_type::non_const_data_type; + // The destination view type if it is not the same memory space + using dest_view_type = Kokkos::DynRankView<data_type, array_layout, Space>; + // If it is the same memory_space return the existing view_type + // This will also keep the unmanaged trait if necessary + using view_type = typename std::conditional<is_same_memspace, src_view_type, + dest_view_type>::type; +}; + +template <class Space, class T, class... P> +struct MirrorDRVType { + // The incoming view_type + using src_view_type = typename Kokkos::DynRankView<T, P...>; + // The memory space for the mirror view + using memory_space = typename Space::memory_space; + // Check whether it is the same memory space + enum { + is_same_memspace = + std::is_same<memory_space, typename src_view_type::memory_space>::value + }; + // The array_layout + using array_layout = typename src_view_type::array_layout; + // The data type (we probably want it non-const since otherwise we can't even + // deep_copy to it.) + using data_type = typename src_view_type::non_const_data_type; + // The destination view type if it is not the same memory space + using view_type = Kokkos::DynRankView<data_type, array_layout, Space>; +}; + +} // namespace Impl + +template <class T, class... 
P> +inline typename DynRankView<T, P...>::HostMirror create_mirror( + const DynRankView<T, P...>& src, + typename std::enable_if< + std::is_same<typename ViewTraits<T, P...>::specialize, void>::value && + !std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout, + Kokkos::LayoutStride>::value>::type* = nullptr) { + using src_type = DynRankView<T, P...>; + using dst_type = typename src_type::HostMirror; + + return dst_type(std::string(src.label()).append("_mirror"), + Impl::reconstructLayout(src.layout(), src.rank())); +} + +template <class T, class... P> +inline typename DynRankView<T, P...>::HostMirror create_mirror( + const DynRankView<T, P...>& src, + typename std::enable_if< + std::is_same<typename ViewTraits<T, P...>::specialize, void>::value && + std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout, + Kokkos::LayoutStride>::value>::type* = 0) { + using src_type = DynRankView<T, P...>; + using dst_type = typename src_type::HostMirror; + + return dst_type(std::string(src.label()).append("_mirror"), + Impl::reconstructLayout(src.layout(), src.rank())); +} + +// Create a mirror in a new space (specialization for different space) +template <class Space, class T, class... P> +typename Impl::MirrorDRVType<Space, T, P...>::view_type create_mirror( + const Space&, const Kokkos::DynRankView<T, P...>& src, + typename std::enable_if<std::is_same< + typename ViewTraits<T, P...>::specialize, void>::value>::type* = + nullptr) { + return typename Impl::MirrorDRVType<Space, T, P...>::view_type( + src.label(), Impl::reconstructLayout(src.layout(), src.rank())); +} + +template <class T, class... 
P> +inline typename DynRankView<T, P...>::HostMirror create_mirror_view( + const DynRankView<T, P...>& src, + typename std::enable_if< + (std::is_same< + typename DynRankView<T, P...>::memory_space, + typename DynRankView<T, P...>::HostMirror::memory_space>::value && + std::is_same<typename DynRankView<T, P...>::data_type, + typename DynRankView<T, P...>::HostMirror::data_type>:: + value)>::type* = nullptr) { + return src; +} + +template <class T, class... P> +inline typename DynRankView<T, P...>::HostMirror create_mirror_view( + const DynRankView<T, P...>& src, + typename std::enable_if< + !(std::is_same< + typename DynRankView<T, P...>::memory_space, + typename DynRankView<T, P...>::HostMirror::memory_space>::value && + std::is_same<typename DynRankView<T, P...>::data_type, + typename DynRankView<T, P...>::HostMirror::data_type>:: + value)>::type* = nullptr) { + return Kokkos::create_mirror(src); +} + +// Create a mirror view in a new space (specialization for same space) +template <class Space, class T, class... P> +typename Impl::MirrorDRViewType<Space, T, P...>::view_type create_mirror_view( + const Space&, const Kokkos::DynRankView<T, P...>& src, + typename std::enable_if< + Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* = + nullptr) { + return src; +} + +// Create a mirror view in a new space (specialization for different space) +template <class Space, class T, class... P> +typename Impl::MirrorDRViewType<Space, T, P...>::view_type create_mirror_view( + const Space&, const Kokkos::DynRankView<T, P...>& src, + typename std::enable_if< + !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* = + nullptr) { + return typename Impl::MirrorDRViewType<Space, T, P...>::view_type( + src.label(), Impl::reconstructLayout(src.layout(), src.rank())); +} + +// Create a mirror view and deep_copy in a new space (specialization for same +// space) +template <class Space, class T, class... 
P>
// Same memory space: the source already lives in Space, so no allocation
// and no copy are needed; return it directly.
typename Impl::MirrorDRViewType<Space, T, P...>::view_type
create_mirror_view_and_copy(
    const Space&, const Kokkos::DynRankView<T, P...>& src,
    std::string const& name = "",
    typename std::enable_if<
        Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
        nullptr) {
  (void)name;  // silence unused-parameter; name only matters when allocating
  return src;
}

// Create a mirror view and deep_copy in a new space (specialization for
// different space)
// Allocates an uninitialized mirror in Space (label defaults to src's label
// when no name is given) and deep-copies the contents into it.
template <class Space, class T, class... P>
typename Impl::MirrorDRViewType<Space, T, P...>::view_type
create_mirror_view_and_copy(
    const Space&, const Kokkos::DynRankView<T, P...>& src,
    std::string const& name = "",
    typename std::enable_if<
        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
        nullptr) {
  using Mirror = typename Impl::MirrorDRViewType<Space, T, P...>::view_type;
  std::string label = name.empty() ? src.label() : name;
  // WithoutInitializing: the deep_copy below overwrites every element.
  auto mirror = Mirror(view_alloc(WithoutInitializing, label),
                       Impl::reconstructLayout(src.layout(), src.rank()));
  deep_copy(mirror, src);
  return mirror;
}

}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
/** \brief Resize a view with copying old data to new data at the corresponding
 *  indices. */
template <class T, class...
P>
inline void resize(DynRankView<T, P...>& v,
                   const size_t n0 = KOKKOS_INVALID_INDEX,
                   const size_t n1 = KOKKOS_INVALID_INDEX,
                   const size_t n2 = KOKKOS_INVALID_INDEX,
                   const size_t n3 = KOKKOS_INVALID_INDEX,
                   const size_t n4 = KOKKOS_INVALID_INDEX,
                   const size_t n5 = KOKKOS_INVALID_INDEX,
                   const size_t n6 = KOKKOS_INVALID_INDEX,
                   const size_t n7 = KOKKOS_INVALID_INDEX) {
  using drview_type = DynRankView<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only resize managed views");

  // Allocate the new extents under the same label, remap the overlapping
  // index range element-by-element, then retarget v at the new allocation.
  drview_type v_resized(v.label(), n0, n1, n2, n3, n4, n5, n6, n7);

  Kokkos::Impl::DynRankViewRemap<drview_type, drview_type>(v_resized, v);

  v = v_resized;
}

/** \brief Reallocate a view to new dimensions WITHOUT preserving old data.
 *  (Previous comment wrongly described this as a copying resize; unlike
 *  resize(), realloc() discards the old contents.) */
template <class T, class... P>
inline void realloc(DynRankView<T, P...>& v,
                    const size_t n0 = KOKKOS_INVALID_INDEX,
                    const size_t n1 = KOKKOS_INVALID_INDEX,
                    const size_t n2 = KOKKOS_INVALID_INDEX,
                    const size_t n3 = KOKKOS_INVALID_INDEX,
                    const size_t n4 = KOKKOS_INVALID_INDEX,
                    const size_t n5 = KOKKOS_INVALID_INDEX,
                    const size_t n6 = KOKKOS_INVALID_INDEX,
                    const size_t n7 = KOKKOS_INVALID_INDEX) {
  using drview_type = DynRankView<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only realloc managed views");

  const std::string label = v.label();

  v = drview_type();  // Deallocate first, if the only view to allocation
  v = drview_type(label, n0, n1, n2, n3, n4, n5, n6, n7);
}

}  // namespace Kokkos

#endif
diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc949d4c556ab4abd982ea5334fee870c42ef305
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -0,0 +1,611 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v.
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DYNAMIC_VIEW_HPP +#define KOKKOS_DYNAMIC_VIEW_HPP + +#include <cstdio> + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Error.hpp> + +namespace Kokkos { +namespace Experimental { + +// Simple metafunction for choosing memory space +// In the current implementation, if memory_space == CudaSpace, +// use CudaUVMSpace for the chunk 'array' allocation, which +// contains will contain pointers to chunks of memory allocated +// in CudaSpace +namespace Impl { +template <class MemSpace> +struct ChunkArraySpace { + using memory_space = MemSpace; +}; + +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct ChunkArraySpace<Kokkos::CudaSpace> { + using memory_space = typename Kokkos::CudaUVMSpace; +}; +#endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct ChunkArraySpace<Kokkos::Experimental::HIPSpace> { + using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace; +}; +#endif +#ifdef KOKKOS_ENABLE_SYCL +template <> +struct ChunkArraySpace<Kokkos::Experimental::SYCLDeviceUSMSpace> { + using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace; +}; +#endif +} // end namespace Impl + +/** \brief Dynamic views are restricted to rank-one and no layout. + * Resize only occurs on host outside of parallel_regions. + * Subviews are not allowed. + */ +template <typename DataType, typename... P> +class DynamicView : public Kokkos::ViewTraits<DataType, P...> { + public: + using traits = Kokkos::ViewTraits<DataType, P...>; + + private: + template <class, class...> + friend class DynamicView; + + using track_type = Kokkos::Impl::SharedAllocationTracker; + + static_assert(traits::rank == 1 && traits::rank_dynamic == 1, + "DynamicView must be rank-one"); + + // It is assumed that the value_type is trivially copyable; + // when this is not the case, potential problems can occur. 
+ static_assert(std::is_same<typename traits::specialize, void>::value, + "DynamicView only implemented for non-specialized View type"); + + template <class Space, bool = Kokkos::Impl::MemorySpaceAccess< + Space, typename traits::memory_space>::accessible> + struct verify_space { + KOKKOS_FORCEINLINE_FUNCTION static void check() {} + }; + + template <class Space> + struct verify_space<Space, false> { + KOKKOS_FORCEINLINE_FUNCTION static void check() { + Kokkos::abort( + "Kokkos::DynamicView ERROR: attempt to access inaccessible memory " + "space"); + }; + }; + + private: + track_type m_track; + typename traits::value_type** m_chunks = + nullptr; // array of pointers to 'chunks' of memory + unsigned m_chunk_shift; // ceil(log2(m_chunk_size)) + unsigned m_chunk_mask; // m_chunk_size - 1 + unsigned m_chunk_max; // number of entries in the chunk array - each pointing + // to a chunk of extent == m_chunk_size entries + unsigned m_chunk_size; // 2 << (m_chunk_shift - 1) + + public: + //---------------------------------------------------------------------- + + /** \brief Compatible view of array of scalar types */ + using array_type = + DynamicView<typename traits::data_type, typename traits::device_type>; + + /** \brief Compatible view of const data type */ + using const_type = DynamicView<typename traits::const_data_type, + typename traits::device_type>; + + /** \brief Compatible view of non-const data type */ + using non_const_type = DynamicView<typename traits::non_const_data_type, + typename traits::device_type>; + + /** \brief Must be accessible everywhere */ + using HostMirror = DynamicView; + + /** \brief Unified types */ + using uniform_device = + Kokkos::Device<typename traits::device_type::execution_space, + Kokkos::AnonymousSpace>; + using uniform_type = array_type; + using uniform_const_type = const_type; + using uniform_runtime_type = array_type; + using uniform_runtime_const_type = const_type; + using uniform_nomemspace_type = + DynamicView<typename 
traits::data_type, uniform_device>; + using uniform_const_nomemspace_type = + DynamicView<typename traits::const_data_type, uniform_device>; + using uniform_runtime_nomemspace_type = + DynamicView<typename traits::data_type, uniform_device>; + using uniform_runtime_const_nomemspace_type = + DynamicView<typename traits::const_data_type, uniform_device>; + + //---------------------------------------------------------------------- + + enum { Rank = 1 }; + + KOKKOS_INLINE_FUNCTION + size_t allocation_extent() const noexcept { + uintptr_t n = *reinterpret_cast<const uintptr_t*>(m_chunks + m_chunk_max); + return (n << m_chunk_shift); + } + + KOKKOS_INLINE_FUNCTION + size_t chunk_size() const noexcept { return m_chunk_size; } + + KOKKOS_INLINE_FUNCTION + size_t size() const noexcept { + size_t extent_0 = + *reinterpret_cast<const size_t*>(m_chunks + m_chunk_max + 1); + return extent_0; + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION size_t extent(const iType& r) const { + return r == 0 ? size() : 1; + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION size_t extent_int(const iType& r) const { + return r == 0 ? 
size() : 1; + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return 0; } + + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + *s = 0; + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const { return m_track.use_count(); } + + inline const std::string label() const { + return m_track.template get_label<typename traits::memory_space>(); + } + + //---------------------------------------------------------------------- + // Range span is the span which contains all members. + + using reference_type = typename traits::value_type&; + using pointer_type = typename traits::value_type*; + + enum { + reference_type_is_lvalue_reference = + std::is_lvalue_reference<reference_type>::value + }; + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { + return false; + } + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return 0; } + + //---------------------------------------- + + template <typename I0, class... Args> + KOKKOS_INLINE_FUNCTION reference_type + operator()(const I0& i0, const Args&... /*args*/) const { + static_assert(Kokkos::Impl::are_integral<I0, Args...>::value, + "Indices must be integral type"); + + DynamicView::template verify_space< + Kokkos::Impl::ActiveExecutionMemorySpace>::check(); + + // Which chunk is being indexed. 
+ const uintptr_t ic = uintptr_t(i0 >> m_chunk_shift); + + typename traits::value_type* volatile* const ch = m_chunks + ic; + + // Do bounds checking if enabled or if the chunk pointer is zero. + // If not bounds checking then we assume a non-zero pointer is valid. + +#if !defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + if (nullptr == *ch) +#endif + { + // Verify that allocation of the requested chunk in in progress. + + // The allocated chunk counter is m_chunks[ m_chunk_max ] + const uintptr_t n = + *reinterpret_cast<uintptr_t volatile*>(m_chunks + m_chunk_max); + + if (n <= ic) { + Kokkos::abort("Kokkos::DynamicView array bounds error"); + } + + // Allocation of this chunk is in progress + // so wait for allocation to complete. + while (nullptr == *ch) + ; + } + + return (*ch)[i0 & m_chunk_mask]; + } + + //---------------------------------------- + /** \brief Resizing in serial can grow or shrink the array size + * up to the maximum number of chunks + * */ + template <typename IntType> + inline typename std::enable_if< + std::is_integral<IntType>::value && + Kokkos::Impl::MemorySpaceAccess< + Kokkos::HostSpace, + typename Impl::ChunkArraySpace< + typename traits::memory_space>::memory_space>::accessible>::type + resize_serial(IntType const& n) { + using local_value_type = typename traits::value_type; + using value_pointer_type = local_value_type*; + + const uintptr_t NC = + (n + m_chunk_mask) >> + m_chunk_shift; // New total number of chunks needed for resize + + if (m_chunk_max < NC) { + Kokkos::abort("DynamicView::resize_serial exceeded maximum size"); + } + + // *m_chunks[m_chunk_max] stores the current number of chunks being used + uintptr_t* const pc = reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max); + std::string _label = + m_track.template get_label<typename traits::memory_space>(); + if (*pc < NC) { + while (*pc < NC) { + m_chunks[*pc] = reinterpret_cast<value_pointer_type>( + typename traits::memory_space().allocate( + _label.c_str(), 
sizeof(local_value_type) << m_chunk_shift)); + ++*pc; + } + } else { + while (NC + 1 <= *pc) { + --*pc; + typename traits::memory_space().deallocate( + _label.c_str(), m_chunks[*pc], + sizeof(local_value_type) << m_chunk_shift); + m_chunks[*pc] = nullptr; + } + } + // *m_chunks[m_chunk_max+1] stores the 'extent' requested by resize + *(pc + 1) = n; + } + + KOKKOS_INLINE_FUNCTION bool is_allocated() const { + if (m_chunks == nullptr) { + return false; + } else { + // *m_chunks[m_chunk_max] stores the current number of chunks being used + uintptr_t* const pc = + reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max); + return (*(pc + 1) > 0); + } + } + + //---------------------------------------------------------------------- + + ~DynamicView() = default; + DynamicView() = default; + DynamicView(DynamicView&&) = default; + DynamicView(const DynamicView&) = default; + DynamicView& operator=(DynamicView&&) = default; + DynamicView& operator=(const DynamicView&) = default; + + template <class RT, class... RP> + DynamicView(const DynamicView<RT, RP...>& rhs) + : m_track(rhs.m_track), + m_chunks((typename traits::value_type**)rhs.m_chunks), + m_chunk_shift(rhs.m_chunk_shift), + m_chunk_mask(rhs.m_chunk_mask), + m_chunk_max(rhs.m_chunk_max), + m_chunk_size(rhs.m_chunk_size) { + using SrcTraits = typename DynamicView<RT, RP...>::traits; + using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>; + static_assert(Mapping::is_assignable, + "Incompatible DynamicView copy construction"); + } + + //---------------------------------------------------------------------- + + struct Destroy { + using local_value_type = typename traits::value_type; + std::string m_label; + local_value_type** m_chunks; + unsigned m_chunk_max; + bool m_destroy; + unsigned m_chunk_size; + + // Initialize or destroy array of chunk pointers. + // Two entries beyond the max chunks are allocation counters. 
+ inline void operator()(unsigned i) const { + if (m_destroy && i < m_chunk_max && nullptr != m_chunks[i]) { + typename traits::memory_space().deallocate( + m_label.c_str(), m_chunks[i], + sizeof(local_value_type) * m_chunk_size); + } + m_chunks[i] = nullptr; + } + + void execute(bool arg_destroy) { + using Range = Kokkos::RangePolicy<typename HostSpace::execution_space>; + + m_destroy = arg_destroy; + + Kokkos::Impl::ParallelFor<Destroy, Range> closure( + *this, + Range(0, m_chunk_max + 2)); // Add 2 to 'destroy' extra slots storing + // num_chunks and extent; previously + 1 + + closure.execute(); + + typename traits::execution_space().fence(); + // Impl::ChunkArraySpace< typename traits::memory_space + // >::memory_space::execution_space().fence(); + } + + void construct_shared_allocation() { execute(false); } + + void destroy_shared_allocation() { execute(true); } + + Destroy() = default; + Destroy(Destroy&&) = default; + Destroy(const Destroy&) = default; + Destroy& operator=(Destroy&&) = default; + Destroy& operator=(const Destroy&) = default; + + Destroy(std::string label, typename traits::value_type** arg_chunk, + const unsigned arg_chunk_max, const unsigned arg_chunk_size) + : m_label(label), + m_chunks(arg_chunk), + m_chunk_max(arg_chunk_max), + m_destroy(false), + m_chunk_size(arg_chunk_size) {} + }; + + /**\brief Allocation constructor + * + * Memory is allocated in chunks + * A maximum size is required in order to allocate a + * chunk-pointer array. 
+ */ + explicit inline DynamicView(const std::string& arg_label, + const unsigned min_chunk_size, + const unsigned max_extent) + : m_track(), + m_chunks(nullptr) + // The chunk size is guaranteed to be a power of two + , + m_chunk_shift(Kokkos::Impl::integral_power_of_two_that_contains( + min_chunk_size)) // div ceil(log2(min_chunk_size)) + , + m_chunk_mask((1 << m_chunk_shift) - 1) // mod + , + m_chunk_max((max_extent + m_chunk_mask) >> + m_chunk_shift) // max num pointers-to-chunks in array + , + m_chunk_size(2 << (m_chunk_shift - 1)) { + using chunk_array_memory_space = typename Impl::ChunkArraySpace< + typename traits::memory_space>::memory_space; + // A functor to deallocate all of the chunks upon final destruction + using record_type = + Kokkos::Impl::SharedAllocationRecord<chunk_array_memory_space, Destroy>; + + // Allocate chunk pointers and allocation counter + record_type* const record = + record_type::allocate(chunk_array_memory_space(), arg_label, + (sizeof(pointer_type) * (m_chunk_max + 2))); + // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] == + // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent This must match in + // Destroy's execute(...) method + + m_chunks = reinterpret_cast<pointer_type*>(record->data()); + + record->m_destroy = Destroy(arg_label, m_chunks, m_chunk_max, m_chunk_size); + + // Initialize to zero + record->m_destroy.construct_shared_allocation(); + + m_track.assign_allocated_record_to_uninitialized(record); + } +}; + +} // namespace Experimental +} // namespace Kokkos + +namespace Kokkos { + +template <class T, class... P> +inline typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror +create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src) { + return src; +} + +template <class T, class... DP, class... 
SP> +inline void deep_copy(const View<T, DP...>& dst, + const Kokkos::Experimental::DynamicView<T, SP...>& src) { + using dst_type = View<T, DP...>; + using src_type = Kokkos::Experimental::DynamicView<T, SP...>; + + using dst_execution_space = typename ViewTraits<T, DP...>::execution_space; + using src_memory_space = typename ViewTraits<T, SP...>::memory_space; + + enum { + DstExecCanAccessSrc = + Kokkos::Impl::SpaceAccessibility<dst_execution_space, + src_memory_space>::accessible + }; + + if (DstExecCanAccessSrc) { + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::Impl::ViewRemap<dst_type, src_type>(dst, src); + } else { + Kokkos::Impl::throw_runtime_exception( + "deep_copy given views that would require a temporary allocation"); + } +} + +template <class T, class... DP, class... SP> +inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst, + const View<T, SP...>& src) { + using dst_type = Kokkos::Experimental::DynamicView<T, SP...>; + using src_type = View<T, DP...>; + + using dst_execution_space = typename ViewTraits<T, DP...>::execution_space; + using src_memory_space = typename ViewTraits<T, SP...>::memory_space; + + enum { + DstExecCanAccessSrc = + Kokkos::Impl::SpaceAccessibility<dst_execution_space, + src_memory_space>::accessible + }; + + if (DstExecCanAccessSrc) { + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::Impl::ViewRemap<dst_type, src_type>(dst, src); + } else { + Kokkos::Impl::throw_runtime_exception( + "deep_copy given views that would require a temporary allocation"); + } +} + +namespace Impl { +template <class Arg0, class... DP, class... 
SP>
// CommonSubview for two rank-1 DynamicViews: DynamicView does not support
// subviews, so both "subviews" are just the full views themselves and the
// range argument is ignored.
struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>,
                     Kokkos::Experimental::DynamicView<SP...>, 1, Arg0> {
  using DstType          = Kokkos::Experimental::DynamicView<DP...>;
  using SrcType          = Kokkos::Experimental::DynamicView<SP...>;
  using dst_subview_type = DstType;
  using src_subview_type = SrcType;
  dst_subview_type dst_sub;
  src_subview_type src_sub;
  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& /*arg0*/)
      : dst_sub(dst), src_sub(src) {}
};

// DynamicView destination, ordinary View source: only the source side can be
// (and is) subviewed by the range argument.
template <class... DP, class SrcType, class Arg0>
struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>, SrcType, 1,
                     Arg0> {
  using DstType          = Kokkos::Experimental::DynamicView<DP...>;
  using dst_subview_type = DstType;
  using src_subview_type = typename Kokkos::Subview<SrcType, Arg0>;
  dst_subview_type dst_sub;
  src_subview_type src_sub;
  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0)
      : dst_sub(dst), src_sub(src, arg0) {}
};

// Ordinary View destination, DynamicView source: only the destination side
// is subviewed by the range argument.
template <class DstType, class... SP, class Arg0>
struct CommonSubview<DstType, Kokkos::Experimental::DynamicView<SP...>, 1,
                     Arg0> {
  using SrcType          = Kokkos::Experimental::DynamicView<SP...>;
  using dst_subview_type = typename Kokkos::Subview<DstType, Arg0>;
  using src_subview_type = SrcType;
  dst_subview_type dst_sub;
  src_subview_type src_sub;
  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0)
      : dst_sub(dst, arg0), src_sub(src) {}
};

template <class...
DP, class ViewTypeB, class Layout, class ExecSpace,
          typename iType>
// ViewCopy specialization: rank-1 copy from an ordinary View into a
// DynamicView. The constructor itself launches the parallel_for; the
// iteration range is the SOURCE extent b.extent(0).
struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>, ViewTypeB, Layout,
                ExecSpace, 1, iType> {
  Kokkos::Experimental::DynamicView<DP...> a;
  ViewTypeB b;

  using policy_type = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>>;

  ViewCopy(const Kokkos::Experimental::DynamicView<DP...>& a_,
           const ViewTypeB& b_)
      : a(a_), b(b_) {
    Kokkos::parallel_for("Kokkos::ViewCopy-1D", policy_type(0, b.extent(0)),
                         *this);
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const iType& i0) const { a(i0) = b(i0); };
};

// ViewCopy specialization: rank-1 copy between two DynamicViews. Copies only
// the overlap min(a.extent(0), b.extent(0)), since either side may have been
// resized independently.
template <class... DP, class... SP, class Layout, class ExecSpace,
          typename iType>
struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>,
                Kokkos::Experimental::DynamicView<SP...>, Layout, ExecSpace, 1,
                iType> {
  Kokkos::Experimental::DynamicView<DP...> a;
  Kokkos::Experimental::DynamicView<SP...> b;

  using policy_type = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>>;

  ViewCopy(const Kokkos::Experimental::DynamicView<DP...>& a_,
           const Kokkos::Experimental::DynamicView<SP...>& b_)
      : a(a_), b(b_) {
    const iType n = std::min(a.extent(0), b.extent(0));
    Kokkos::parallel_for("Kokkos::ViewCopy-1D", policy_type(0, n), *this);
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const iType& i0) const { a(i0) = b(i0); };
};

}  // namespace Impl
}  // namespace Kokkos

#endif /* #ifndef KOKKOS_DYNAMIC_VIEW_HPP */
diff --git a/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fbfaed9b1bcda2d22077947532f3abe303ea5533
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
@@ -0,0 +1,196 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP
#define KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP

#include <vector>
#include <Kokkos_Core.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_DualView.hpp>

namespace Kokkos {
namespace Experimental {

// Fixed-capacity, device-side error collector: kernels record (reporter_id,
// report) pairs via add_report(); the host retrieves them via getReports().
// Capacity is max_results; attempts past capacity are counted but dropped.
template <typename ReportType, typename DeviceType>
class ErrorReporter {
 public:
  using report_type     = ReportType;
  using device_type     = DeviceType;
  using execution_space = typename device_type::execution_space;

  ErrorReporter(int max_results)
      : m_numReportsAttempted(""),
        m_reports("", max_results),
        m_reporters("", max_results) {
    clear();
  }

  // Maximum number of reports that can be stored.
  int getCapacity() const { return m_reports.h_view.extent(0); }

  // Number of reports actually stored (attempts clamped to capacity).
  int getNumReports();

  // Total add_report() calls, including those dropped for lack of space.
  int getNumReportAttempts();

  void getReports(std::vector<int> &reporters_out,
                  std::vector<report_type> &reports_out);
  void getReports(
      typename Kokkos::View<int *,
                            typename DeviceType::execution_space>::HostMirror
          &reporters_out,
      typename Kokkos::View<report_type *,
                            typename DeviceType::execution_space>::HostMirror
          &reports_out);

  void clear();

  void resize(const size_t new_size);

  bool full() { return (getNumReportAttempts() >= getCapacity()); }

  // Device-callable. Atomically claims a slot; returns false when the
  // buffer was already full (the attempt is still counted).
  KOKKOS_INLINE_FUNCTION
  bool add_report(int reporter_id, report_type report) const {
    int idx = Kokkos::atomic_fetch_add(&m_numReportsAttempted(), 1);

    if (idx >= 0 && (idx < static_cast<int>(m_reports.d_view.extent(0)))) {
      m_reporters.d_view(idx) = reporter_id;
      m_reports.d_view(idx)   = report;
      return true;
    } else {
      return false;
    }
  }

 private:
  using reports_view_t     = Kokkos::View<report_type *, execution_space>;
  using reports_dualview_t = Kokkos::DualView<report_type *, execution_space>;

  using host_mirror_space = typename reports_dualview_t::host_mirror_space;
  // Scalar attempt counter (device resident; host reads via deep_copy).
  Kokkos::View<int, execution_space> m_numReportsAttempted;
  reports_dualview_t m_reports;
  Kokkos::DualView<int *, execution_space> m_reporters;
};

template <typename ReportType, typename DeviceType>
inline int ErrorReporter<ReportType, DeviceType>::getNumReports() {
  int num_reports = 0;
  Kokkos::deep_copy(num_reports, m_numReportsAttempted);
  // Attempts can exceed capacity; clamp to what was actually stored.
  if (num_reports > static_cast<int>(m_reports.h_view.extent(0))) {
    num_reports = m_reports.h_view.extent(0);
  }
  return num_reports;
}

template <typename ReportType, typename DeviceType>
inline int ErrorReporter<ReportType, DeviceType>::getNumReportAttempts() {
  int num_reports = 0;
  Kokkos::deep_copy(num_reports, m_numReportsAttempted);
  return num_reports;
}

template <typename ReportType, typename DeviceType>
void ErrorReporter<ReportType, DeviceType>::getReports(
    std::vector<int> &reporters_out, std::vector<report_type> &reports_out) {
  int num_reports = getNumReports();
  reporters_out.clear();
  reporters_out.reserve(num_reports);
  reports_out.clear();
  reports_out.reserve(num_reports);

  if (num_reports > 0) {
    // Sync device-side writes to the host mirrors before reading.
    m_reports.template sync<host_mirror_space>();
    m_reporters.template sync<host_mirror_space>();

    for (int i = 0; i < num_reports; ++i) {
      reporters_out.push_back(m_reporters.h_view(i));
      reports_out.push_back(m_reports.h_view(i));
    }
  }
}

template <typename ReportType, typename DeviceType>
void ErrorReporter<ReportType, DeviceType>::getReports(
    typename Kokkos::View<
        int *, typename DeviceType::execution_space>::HostMirror &reporters_out,
    typename Kokkos::View<report_type *,
                          typename DeviceType::execution_space>::HostMirror
        &reports_out) {
  int num_reports = getNumReports();
  // (Re)allocate the output mirrors to exactly num_reports entries.
  reporters_out =
      typename Kokkos::View<int *, typename DeviceType::execution_space>::
          HostMirror("ErrorReport::reporters_out", num_reports);
  reports_out = typename Kokkos::
      View<report_type *, typename DeviceType::execution_space>::HostMirror(
          "ErrorReport::reports_out", num_reports);

  if (num_reports > 0) {
    m_reports.template sync<host_mirror_space>();
    m_reporters.template sync<host_mirror_space>();

    for (int i = 0; i < num_reports; ++i) {
      reporters_out(i) = m_reporters.h_view(i);
      reports_out(i)   = m_reports.h_view(i);
    }
  }
}

template <typename ReportType, typename DeviceType>
void ErrorReporter<ReportType, DeviceType>::clear() {
  int num_reports = 0;
  // Reset the attempt counter; mark device side as most-recently modified so
  // stale host copies are not synced over future device writes.
  Kokkos::deep_copy(m_numReportsAttempted, num_reports);
  m_reports.template modify<execution_space>();
  m_reporters.template modify<execution_space>();
}

template <typename ReportType, typename DeviceType>
void ErrorReporter<ReportType, DeviceType>::resize(const size_t new_size) {
  m_reports.resize(new_size);
  m_reporters.resize(new_size);
  typename DeviceType::execution_space().fence();
}

}  // namespace Experimental
}  // namespace Kokkos

#endif
diff --git a/packages/kokkos/containers/src/Kokkos_Functional.hpp b/packages/kokkos/containers/src/Kokkos_Functional.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2e1fa336f7bc062cbfce346b5a1bb39e4354a15a
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_Functional.hpp
@@ -0,0 +1,157 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
+// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_FUNCTIONAL_HPP
+#define KOKKOS_FUNCTIONAL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Functional_impl.hpp>
+
+namespace Kokkos {
+
+// These should work for most types
+
+// Hash functor that hashes the object's raw bytes with MurmurHash3.
+// Only meaningful for types whose value is fully determined by their byte
+// representation (trivially copyable, no padding surprises, no pointers to
+// out-of-line state) -- hence "pod".
+template <typename T>
+struct pod_hash {
+  using argument_type = T;
+  using first_argument_type = T;
+  using second_argument_type = uint32_t;
+  using result_type = uint32_t;
+
+  // Hash with the default seed (0).
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t operator()(T const& t) const {
+    return Impl::MurmurHash3_x86_32(&t, sizeof(T), 0);
+  }
+
+  // Hash with an explicit seed (e.g. for rehashing / double hashing).
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t operator()(T const& t, uint32_t seed) const {
+    return Impl::MurmurHash3_x86_32(&t, sizeof(T), seed);
+  }
+};
+
+// Bytewise (memcmp-style) equality for trivially copyable types.
+template <typename T>
+struct pod_equal_to {
+  using first_argument_type = T;
+  using second_argument_type = T;
+  using result_type = bool;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const& a, T const& b) const {
+    return Impl::bitwise_equal(&a, &b);
+  }
+};
+
+// Bytewise inequality; negation of pod_equal_to.
+template <typename T>
+struct pod_not_equal_to {
+  using first_argument_type = T;
+  using second_argument_type = T;
+  using result_type = bool;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const& a, T const& b) const {
+    return !Impl::bitwise_equal(&a, &b);
+  }
+};
+
+// Device-callable analogue of std::equal_to (uses T's operator==).
+template <typename T>
+struct equal_to {
+  using first_argument_type = T;
+  using second_argument_type = T;
+  using result_type = bool;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const& a, T const& b) const { return a == b; }
+};
+
+// Device-callable analogue of std::not_equal_to.
+template <typename T>
+struct not_equal_to {
+  using first_argument_type = T;
+  using second_argument_type = T;
+  using result_type = bool;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const& a, T const& b) const { return a != b; }
+};
+
+// Device-callable analogue of std::greater.
+template <typename T>
+struct greater {
+  using first_argument_type = T;
+  using second_argument_type = T;
+  using result_type = bool;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const& a, T const& b) const { return a > b; }
+};
+
+// Device-callable analogue of std::less.
+template <typename T>
+struct less {
+  using first_argument_type = T;
+  using second_argument_type = T;
+  using result_type = bool;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const& a, T const& b) const { return a < b; }
+};
+
+// Device-callable analogue of std::greater_equal.
+template <typename T>
+struct greater_equal {
+  using first_argument_type = T;
+  using second_argument_type = T;
+  using result_type = bool;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const& a, T const& b) const { return a >= b; }
+};
+
+// Device-callable analogue of std::less_equal.
+template <typename T>
+struct less_equal {
+  using first_argument_type = T;
+  using second_argument_type = T;
+  using result_type = bool;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const& a, T const& b) const { return a <= b; }
+};
+
+}  // namespace Kokkos
+
+#endif  // KOKKOS_FUNCTIONAL_HPP
diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f21a08ba3ba86ed176dc4c4535ef76c960e90bc
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
@@ -0,0 +1,2082 @@
+/*
+ * Kokkos_OffsetView.hpp
+ *
+ * Created on: Apr 23, 2018
+ * Author: swbova
+ */
+
+#ifndef KOKKOS_OFFSETVIEW_HPP_
+#define KOKKOS_OFFSETVIEW_HPP_
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_View.hpp>
+
+namespace Kokkos {
+
+namespace Experimental {
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template <class DataType, class... Properties>
+class OffsetView;
+
+// Trait: true iff the type is an OffsetView (possibly const-qualified).
+template <class>
+struct is_offset_view : public std::false_type {};
+
+template <class D, class... P>
+struct is_offset_view<OffsetView<D, P...> > : public std::true_type {};
+
+template <class D, class...
P>
+struct is_offset_view<const OffsetView<D, P...> > : public std::true_type {};
+
+// Sentinel marking a begin-offset slot that the caller did not set.
+#define KOKKOS_INVALID_OFFSET int64_t(0x7FFFFFFFFFFFFFFFLL)
+#define KOKKOS_INVALID_INDEX_RANGE \
+  { KOKKOS_INVALID_OFFSET, KOKKOS_INVALID_OFFSET }
+
+// A {begin, end} pair for one dimension; restricted to signed integral types.
+template <typename iType,
+          typename std::enable_if<std::is_integral<iType>::value &&
+                                      std::is_signed<iType>::value,
+                                  iType>::type = 0>
+using IndexRange = Kokkos::Array<iType, 2>;
+
+using index_list_type = std::initializer_list<int64_t>;
+
+// template <typename iType,
+//           typename std::enable_if< std::is_integral<iType>::value &&
+// std::is_signed<iType>::value, iType >::type = 0> using min_index_type =
+// std::initializer_list<iType>;
+
+namespace Impl {
+
+// Maps a Kokkos::View type to the OffsetView type with the same traits.
+template <class ViewType>
+struct GetOffsetViewTypeFromViewType {
+  using type =
+      OffsetView<typename ViewType::data_type, typename ViewType::array_layout,
+                 typename ViewType::device_type,
+                 typename ViewType::memory_traits>;
+};
+
+// Recursion terminator: all indices checked, everything was in bounds.
+template <unsigned, class MapType, class BeginsType>
+KOKKOS_INLINE_FUNCTION bool offsetview_verify_operator_bounds(
+    const MapType&, const BeginsType&) {
+  return true;
+}
+
+// Checks index i of dimension R against [begins[R], begins[R]+extent(R)-1],
+// then recurses on the remaining indices.
+template <unsigned R, class MapType, class BeginsType, class iType,
+          class... Args>
+KOKKOS_INLINE_FUNCTION bool offsetview_verify_operator_bounds(
+    const MapType& map, const BeginsType& begins, const iType& i,
+    Args... args) {
+  const bool legalIndex =
+      (int64_t(i) >= begins[R]) &&
+      (int64_t(i) <= int64_t(begins[R] + map.extent(R) - 1));
+  return legalIndex &&
+         offsetview_verify_operator_bounds<R + 1>(map, begins, args...);
+}
+// Recursion terminator for the error-message formatter below.
+template <unsigned, class MapType, class BeginsType>
+inline void offsetview_error_operator_bounds(char*, int, const MapType&,
+                                             const BeginsType&) {}
+
+// Appends " begin <= index <= end," for dimension R to buf, then recurses
+// over the remaining indices.
+template <unsigned R, class MapType, class BeginsType, class iType,
+          class... Args>
+inline void offsetview_error_operator_bounds(char* buf, int len,
+                                             const MapType& map,
+                                             const BeginsType begins,
+                                             const iType& i, Args...
args) { + const int64_t b = begins[R]; + const int64_t e = b + map.extent(R) - 1; + const int n = + snprintf(buf, len, " %ld <= %ld <= %ld %c", static_cast<unsigned long>(b), + static_cast<unsigned long>(i), static_cast<unsigned long>(e), + (sizeof...(Args) ? ',' : ')')); + offsetview_error_operator_bounds<R + 1>(buf + n, len - n, map, begins, + args...); +} + +template <class MemorySpace, class MapType, class BeginsType, class... Args> +KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( + Kokkos::Impl::SharedAllocationTracker const& tracker, const MapType& map, + const BeginsType& begins, Args... args) { + if (!offsetview_verify_operator_bounds<0>(map, begins, args...)) { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + enum { LEN = 1024 }; + char buffer[LEN]; + const std::string label = tracker.template get_label<MemorySpace>(); + int n = + snprintf(buffer, LEN, "OffsetView bounds error of view labeled %s (", + label.c_str()); + offsetview_error_operator_bounds<0>(buffer + n, LEN - n, map, begins, + args...); + Kokkos::Impl::throw_runtime_exception(std::string(buffer)); +#else + /* Check #1: is there a SharedAllocationRecord? + (we won't use it, but if its not there then there isn't + a corresponding SharedAllocationHeader containing a label). + This check should cover the case of Views that don't + have the Unmanaged trait but were initialized by pointer. 
*/
+    if (tracker.has_record()) {
+      Kokkos::Impl::operator_bounds_error_on_device<MapType>(
+          map, Kokkos::Impl::has_printable_label_typedef<MapType>());
+    } else {
+      Kokkos::abort("OffsetView bounds error");
+    }
+#endif
+  }
+}
+
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+// Host-side constructor-argument validation: an OffsetView must be fully
+// dynamic (rank_dynamic == rank) and must receive exactly one begin-offset
+// per dynamic rank.  Aborts with a message that names the view's label.
+KOKKOS_INLINE_FUNCTION
+void runtime_check_rank_host(const size_t rank_dynamic, const size_t rank,
+                             const index_list_type minIndices,
+                             const std::string& label) {
+  bool isBad = false;
+  std::string message =
+      "Kokkos::Experimental::OffsetView ERROR: for OffsetView labeled '" +
+      label + "':";
+  if (rank_dynamic != rank) {
+    message +=
+        "The full rank must be the same as the dynamic rank. full rank = ";
+    message += std::to_string(rank) +
+               " dynamic rank = " + std::to_string(rank_dynamic) + "\n";
+    isBad = true;
+  }
+
+  // Count only the offsets the caller actually provided; unset slots carry
+  // the KOKKOS_INVALID_OFFSET sentinel.
+  size_t numOffsets = 0;
+  for (size_t i = 0; i < minIndices.size(); ++i) {
+    if (minIndices.begin()[i] != KOKKOS_INVALID_OFFSET) numOffsets++;
+  }
+  if (numOffsets != rank_dynamic) {
+    message += "The number of offsets provided ( " +
+               std::to_string(numOffsets) +
+               " ) must equal the dynamic rank ( " +
+               std::to_string(rank_dynamic) + " ).";
+    isBad = true;
+  }
+
+  if (isBad) Kokkos::abort(message.c_str());
+}
+#endif
+
+// Device-side variant of the check above; uses fixed abort messages since
+// std::string is not available in device code.  NOTE(review): this compares
+// the offset count against `rank` while the host variant compares against
+// `rank_dynamic` -- equivalent only because the first check aborts unless
+// the two are equal; confirm the asymmetry is intentional.
+KOKKOS_INLINE_FUNCTION
+void runtime_check_rank_device(const size_t rank_dynamic, const size_t rank,
+                               const index_list_type minIndices) {
+  if (rank_dynamic != rank) {
+    Kokkos::abort(
+        "The full rank of an OffsetView must be the same as the dynamic rank.");
+  }
+  size_t numOffsets = 0;
+  for (size_t i = 0; i < minIndices.size(); ++i) {
+    if (minIndices.begin()[i] != KOKKOS_INVALID_OFFSET) numOffsets++;
+  }
+  if (numOffsets != rank) {
+    Kokkos::abort(
+        "The number of offsets provided to an OffsetView constructor must "
+        "equal the dynamic rank.");
+  }
+}
+}  // namespace Impl
+
+template <class DataType, class...
Properties> +class OffsetView : public ViewTraits<DataType, Properties...> { + public: + using traits = ViewTraits<DataType, Properties...>; + + private: + template <class, class...> + friend class OffsetView; + template <class, class...> + friend class View; // FIXME delete this line + template <class, class...> + friend class Kokkos::Impl::ViewMapping; + + using map_type = Kokkos::Impl::ViewMapping<traits, void>; + using track_type = Kokkos::Impl::SharedAllocationTracker; + + public: + enum { Rank = map_type::Rank }; + using begins_type = Kokkos::Array<int64_t, Rank>; + + template < + typename iType, + typename std::enable_if<std::is_integral<iType>::value, iType>::type = 0> + KOKKOS_INLINE_FUNCTION int64_t begin(const iType local_dimension) const { + return local_dimension < Rank ? m_begins[local_dimension] + : KOKKOS_INVALID_OFFSET; + } + + KOKKOS_INLINE_FUNCTION + begins_type begins() const { return m_begins; } + + template < + typename iType, + typename std::enable_if<std::is_integral<iType>::value, iType>::type = 0> + KOKKOS_INLINE_FUNCTION int64_t end(const iType local_dimension) const { + return begin(local_dimension) + m_map.extent(local_dimension); + } + + private: + track_type m_track; + map_type m_map; + begins_type m_begins; + + public: + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + using array_type = + OffsetView<typename traits::scalar_array_type, + typename traits::array_layout, typename traits::device_type, + typename traits::memory_traits>; + + /** \brief Compatible view of const data type */ + using const_type = + OffsetView<typename traits::const_data_type, + typename traits::array_layout, typename traits::device_type, + typename traits::memory_traits>; + + /** \brief Compatible view of non-const data type */ + using non_const_type = + OffsetView<typename traits::non_const_data_type, + typename traits::array_layout, typename traits::device_type, + typename traits::memory_traits>; + + /** 
\brief Compatible HostMirror view */ + using HostMirror = OffsetView<typename traits::non_const_data_type, + typename traits::array_layout, + typename traits::host_mirror_space>; + + //---------------------------------------- + // Domain rank and extents + + /** \brief rank() to be implemented + */ + // KOKKOS_INLINE_FUNCTION + // static + // constexpr unsigned rank() { return map_type::Rank; } + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<std::is_integral<iType>::value, size_t>::type + extent(const iType& r) const { + return m_map.extent(r); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<std::is_integral<iType>::value, int>::type + extent_int(const iType& r) const { + return static_cast<int>(m_map.extent(r)); + } + + KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() + const { + return m_map.layout(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { + return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * + m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * + m_map.dimension_6() * m_map.dimension_7(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_map.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_map.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_map.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_map.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_map.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_map.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_map.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_map.stride_7(); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr + typename 
std::enable_if<std::is_integral<iType>::value, size_t>::type + stride(iType r) const { + return ( + r == 0 + ? m_map.stride_0() + : (r == 1 + ? m_map.stride_1() + : (r == 2 + ? m_map.stride_2() + : (r == 3 + ? m_map.stride_3() + : (r == 4 + ? m_map.stride_4() + : (r == 5 + ? m_map.stride_5() + : (r == 6 + ? m_map.stride_6() + : m_map.stride_7()))))))); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + m_map.stride(s); + } + + //---------------------------------------- + // Range span is the span which contains all members. + + using reference_type = typename map_type::reference_type; + using pointer_type = typename map_type::pointer_type; + + enum { + reference_type_is_lvalue_reference = + std::is_lvalue_reference<reference_type>::value + }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { + return m_map.span_is_contiguous(); + } + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return m_map.data() != nullptr; + } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { + return m_map.data(); + } + + //---------------------------------------- + // Allow specializations to query their specialized map + + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::ViewMapping<traits, void>& implementation_map() const { + return m_map; + } + + //---------------------------------------- + + private: + static constexpr bool is_layout_left = + std::is_same<typename traits::array_layout, Kokkos::LayoutLeft>::value; + + static constexpr bool is_layout_right = + std::is_same<typename traits::array_layout, Kokkos::LayoutRight>::value; + + static constexpr bool is_layout_stride = + std::is_same<typename traits::array_layout, Kokkos::LayoutStride>::value; + + static constexpr bool is_default_map = + std::is_same<typename traits::specialize, void>::value && + (is_layout_left || is_layout_right || is_layout_stride); + +#if 
defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + +#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \ + typename traits::memory_space>::check(); \ + Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ + typename traits::memory_space> \ + ARG; + +#else + +#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \ + typename traits::memory_space>::check(); + +#endif + public: + //------------------------------ + // Rank 0 operator() + + KOKKOS_FORCEINLINE_FUNCTION + reference_type operator()() const { return m_map.reference(); } + //------------------------------ + // Rank 1 operator() + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) + const size_t j0 = i0 - m_begins[0]; + return m_map.reference(j0); + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && is_default_map && + !is_layout_stride), + reference_type>::type + operator()(const I0& i0) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) + const size_t j0 = i0 - m_begins[0]; + return m_map.m_impl_handle[j0]; + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && is_default_map && + is_layout_stride), + reference_type>::type + operator()(const I0& i0) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) + const size_t j0 = i0 - m_begins[0]; + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; + } + //------------------------------ + // Rank 1 operator[] + + template <typename 
I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && !is_default_map), + reference_type>::type + operator[](const I0& i0) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) + const size_t j0 = i0 - m_begins[0]; + return m_map.reference(j0); + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && is_default_map && + !is_layout_stride), + reference_type>::type + operator[](const I0& i0) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) + const size_t j0 = i0 - m_begins[0]; + return m_map.m_impl_handle[j0]; + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && is_default_map && + is_layout_stride), + reference_type>::type + operator[](const I0& i0) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) + const size_t j0 = i0 - m_begins[0]; + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; + } + + //------------------------------ + // Rank 2 + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + return m_map.reference(j0, j1); + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_left && (traits::rank_dynamic == 0)), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, 
m_map, m_begins, i0, i1)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_dim.N0 * j1]; + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_left && (traits::rank_dynamic != 0)), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_stride * j1]; + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_right && (traits::rank_dynamic == 0)), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_dim.N1 * j0]; + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_right && (traits::rank_dynamic != 0)), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_stride * j0]; + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_stride), + reference_type>::type + operator()(const 
I0& i0, const I1& i1) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + return m_map.m_impl_handle[j0 * m_map.m_impl_offset.m_stride.S0 + + j1 * m_map.m_impl_offset.m_stride.S1]; + } + + //------------------------------ + // Rank 3 + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1, I2>::value && + (3 == Rank) && is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2)]; + } + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1, I2>::value && + (3 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + return m_map.reference(j0, j1, j2); + } + + //------------------------------ + // Rank 4 + + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && (4 == Rank) && + is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2, i3)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const size_t j3 = i3 - 
m_begins[3]; + return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3)]; + } + + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && (4 == Rank) && + !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2, i3)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const size_t j3 = i3 - m_begins[3]; + return m_map.reference(j0, j1, j2, j3); + } + + //------------------------------ + // Rank 5 + + template <typename I0, typename I1, typename I2, typename I3, typename I4> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && (5 == Rank) && + is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const size_t j3 = i3 - m_begins[3]; + const size_t j4 = i4 - m_begins[4]; + return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && (5 == Rank) && + !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const 
size_t j3 = i3 - m_begins[3]; + const size_t j4 = i4 - m_begins[4]; + return m_map.reference(j0, j1, j2, j3, j4); + } + + //------------------------------ + // Rank 6 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value && + (6 == Rank) && is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const size_t j3 = i3 - m_begins[3]; + const size_t j4 = i4 - m_begins[4]; + const size_t j5 = i5 - m_begins[5]; + return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value && + (6 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const size_t j3 = i3 - m_begins[3]; + const size_t j4 = i4 - m_begins[4]; + const size_t j5 = i5 - m_begins[5]; + return m_map.reference(j0, j1, j2, j3, j4, j5); + } + + //------------------------------ + // Rank 7 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value && + (7 == Rank) 
&& is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5, const I6& i6) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const size_t j3 = i3 - m_begins[3]; + const size_t j4 = i4 - m_begins[4]; + const size_t j5 = i5 - m_begins[5]; + const size_t j6 = i6 - m_begins[6]; + return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value && + (7 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5, const I6& i6) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const size_t j3 = i3 - m_begins[3]; + const size_t j4 = i4 - m_begins[4]; + const size_t j5 = i5 - m_begins[5]; + const size_t j6 = i6 - m_begins[6]; + return m_map.reference(j0, j1, j2, j3, j4, j5, j6); + } + + //------------------------------ + // Rank 8 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value && + (8 == Rank) && is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, 
m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const size_t j3 = i3 - m_begins[3]; + const size_t j4 = i4 - m_begins[4]; + const size_t j5 = i5 - m_begins[5]; + const size_t j6 = i6 - m_begins[6]; + const size_t j7 = i7 - m_begins[7]; + return m_map + .m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6, j7)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value && + (8 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { + KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( + (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) + const size_t j0 = i0 - m_begins[0]; + const size_t j1 = i1 - m_begins[1]; + const size_t j2 = i2 - m_begins[2]; + const size_t j3 = i3 - m_begins[3]; + const size_t j4 = i4 - m_begins[4]; + const size_t j5 = i5 - m_begins[5]; + const size_t j6 = i6 - m_begins[6]; + const size_t j7 = i7 - m_begins[7]; + return m_map.reference(j0, j1, j2, j3, j4, j5, j6, j7); + } + +#undef KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY + + //---------------------------------------- + // Standard destructor, constructors, and assignment operators + + KOKKOS_DEFAULTED_FUNCTION + ~OffsetView() = default; + + KOKKOS_INLINE_FUNCTION + OffsetView() : m_track(), m_map() { + for (size_t i = 0; i < Rank; ++i) m_begins[i] = KOKKOS_INVALID_OFFSET; + } + + KOKKOS_INLINE_FUNCTION + OffsetView(const OffsetView& rhs) + : m_track(rhs.m_track, traits::is_managed), + m_map(rhs.m_map), + m_begins(rhs.m_begins) {} + + KOKKOS_INLINE_FUNCTION + OffsetView(OffsetView&& rhs) + : m_track(std::move(rhs.m_track)), + m_map(std::move(rhs.m_map)), + 
        m_begins(std::move(rhs.m_begins)) {}

  // Copy assignment: share rhs's allocation and take over its begins.
  KOKKOS_INLINE_FUNCTION
  OffsetView& operator=(const OffsetView& rhs) {
    m_track  = rhs.m_track;
    m_map    = rhs.m_map;
    m_begins = rhs.m_begins;
    return *this;
  }

  // Move assignment: steal rhs's tracker, map, and begins.
  KOKKOS_INLINE_FUNCTION
  OffsetView& operator=(OffsetView&& rhs) {
    m_track  = std::move(rhs.m_track);
    m_map    = std::move(rhs.m_map);
    m_begins = std::move(rhs.m_begins);
    return *this;
  }

  // interoperability with View
 private:
  using view_type =
      View<typename traits::scalar_array_type, typename traits::array_layout,
           typename traits::device_type, typename traits::memory_traits>;

 public:
  // Return a plain View aliasing the same allocation; the begin offsets are
  // dropped (the View is indexed from zero).
  KOKKOS_INLINE_FUNCTION
  view_type view() const {
    view_type v(m_track, m_map);
    return v;
  }

  // Convert from a View: alias its allocation with all begins set to zero.
  template <class RT, class... RP>
  KOKKOS_INLINE_FUNCTION OffsetView(const View<RT, RP...>& aview)
      : m_track(aview.impl_track()), m_map() {
    using SrcTraits = typename OffsetView<RT, RP...>::traits;
    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible OffsetView copy construction");
    Mapping::assign(m_map, aview.impl_map(), m_track);

    for (int i = 0; i < aview.Rank; ++i) {
      m_begins[i] = 0;
    }
  }

  // Convert from a View with caller-supplied minimum indices (one entry per
  // rank); rank consistency is checked at runtime.
  template <class RT, class... RP>
  KOKKOS_INLINE_FUNCTION OffsetView(const View<RT, RP...>& aview,
                                    const index_list_type& minIndices)
      : m_track(aview.impl_track()), m_map() {
    using SrcTraits = typename OffsetView<RT, RP...>::traits;
    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible OffsetView copy construction");
    Mapping::assign(m_map, aview.impl_map(), m_track);

#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
    Kokkos::Experimental::Impl::runtime_check_rank_host(
        traits::rank_dynamic, Rank, minIndices, label());
#else
    Kokkos::Experimental::Impl::runtime_check_rank_device(traits::rank_dynamic,
                                                          Rank, minIndices);

#endif

    for (size_t i = 0; i < minIndices.size(); ++i) {
      m_begins[i] = minIndices.begin()[i];
    }
  }
  // Convert from a View with begins given as a Kokkos::Array; no runtime rank
  // check is performed for this overload (see the commented-out block below).
  template <class RT, class... RP>
  KOKKOS_INLINE_FUNCTION OffsetView(const View<RT, RP...>& aview,
                                    const begins_type& beg)
      : m_track(aview.impl_track()), m_map(), m_begins(beg) {
    using SrcTraits = typename OffsetView<RT, RP...>::traits;
    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible OffsetView copy construction");
    Mapping::assign(m_map, aview.impl_map(), m_track);

    //#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
    //    Kokkos::Experimental::Impl::runtime_check_rank_host(traits::rank_dynamic,
    //    Rank, minIndices, label());
    //#else
    //    Kokkos::Experimental::Impl::runtime_check_rank_device(traits::rank_dynamic,
    //    Rank, minIndices);
    //
    //#endif
  }

  // may assign unmanaged from managed.

  // Converting copy constructor from a compatible OffsetView (e.g. const from
  // non-const, unmanaged from managed).
  template <class RT, class... RP>
  KOKKOS_INLINE_FUNCTION OffsetView(const OffsetView<RT, RP...>& rhs)
      : m_track(rhs.m_track, traits::is_managed),
        m_map(),
        m_begins(rhs.m_begins) {
    using SrcTraits = typename OffsetView<RT, RP...>::traits;
    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible OffsetView copy construction");
    Mapping::assign(m_map, rhs.m_map, rhs.m_track);  // swb what about assign?
  }

 private:
  // Classification of the (end - begin) extent computation.
  enum class subtraction_failure {
    none,
    negative,
    overflow,
  };

  // Subtraction should return a non-negative number and not overflow
  KOKKOS_INLINE_FUNCTION static subtraction_failure check_subtraction(
      int64_t lhs, int64_t rhs) {
    if (lhs < rhs) return subtraction_failure::negative;

    // The (non-negative) difference must fit in int64_t, i.e. not exceed
    // UINT64_MAX / 2.
    if (static_cast<uint64_t>(-1) / static_cast<uint64_t>(2) <
        static_cast<uint64_t>(lhs) - static_cast<uint64_t>(rhs))
      return subtraction_failure::overflow;

    return subtraction_failure::none;
  }

  // Need a way to get at an element from both begins_type (aka Kokkos::Array
  // which doesn't have iterators) and index_list_type (aka
  // std::initializer_list which doesn't have .data() or operator[]).
+ // Returns by value + KOKKOS_INLINE_FUNCTION + static int64_t at(const begins_type& a, size_t pos) { return a[pos]; } + + KOKKOS_INLINE_FUNCTION + static int64_t at(index_list_type a, size_t pos) { + return *(a.begin() + pos); + } + +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + // Check that begins < ends for all elements + // B, E can be begins_type and/or index_list_type + template <typename B, typename E> + KOKKOS_INLINE_FUNCTION static subtraction_failure + runtime_check_begins_ends_host(const B& begins, const E& ends) { + std::string message; + if (begins.size() != Rank) + message += + "begins.size() " + "(" + + std::to_string(begins.size()) + + ")" + " != Rank " + "(" + + std::to_string(Rank) + + ")" + "\n"; + + if (ends.size() != Rank) + message += + "ends.size() " + "(" + + std::to_string(begins.size()) + + ")" + " != Rank " + "(" + + std::to_string(Rank) + + ")" + "\n"; + + // If there are no errors so far, then rank == Rank + // Otherwise, check as much as possible + size_t rank = begins.size() < ends.size() ? 
begins.size() : ends.size(); + for (size_t i = 0; i != rank; ++i) { + subtraction_failure sf = check_subtraction(at(ends, i), at(begins, i)); + if (sf != subtraction_failure::none) { + message += + "(" + "ends[" + + std::to_string(i) + + "]" + " " + "(" + + std::to_string(at(ends, i)) + + ")" + " - " + "begins[" + + std::to_string(i) + + "]" + " " + "(" + + std::to_string(at(begins, i)) + + ")" + ")"; + switch (sf) { + case subtraction_failure::negative: + message += " must be non-negative\n"; + break; + case subtraction_failure::overflow: message += " overflows\n"; break; + default: break; + } + } + } + + if (!message.empty()) { + message = + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView\n" + + message; + Kokkos::Impl::throw_runtime_exception(message); + } + + return subtraction_failure::none; + } +#endif // KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + + // Check the begins < ends for all elements + template <typename B, typename E> + KOKKOS_INLINE_FUNCTION static subtraction_failure + runtime_check_begins_ends_device(const B& begins, const E& ends) { + if (begins.size() != Rank) + Kokkos::abort( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged " + "OffsetView: begins has bad Rank"); + if (ends.size() != Rank) + Kokkos::abort( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged " + "OffsetView: ends has bad Rank"); + + for (size_t i = 0; i != begins.size(); ++i) { + switch (check_subtraction(at(ends, i), at(begins, i))) { + case subtraction_failure::negative: + Kokkos::abort( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged " + "OffsetView: bad range"); + break; + case subtraction_failure::overflow: + Kokkos::abort( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged " + "OffsetView: range overflows"); + break; + default: break; + } + } + + return subtraction_failure::none; + } + + // Constructor around unmanaged data after checking begins < ends for all + // elements + // Each of B, E can be begins_type and/or 
  // index_list_type
  // Precondition: begins.size() == ends.size() == m_begins.size() == Rank
  template <typename B, typename E>
  KOKKOS_INLINE_FUNCTION OffsetView(const pointer_type& p, const B& begins_,
                                    const E& ends_, subtraction_failure)
      : m_track()  // no tracking
        ,
        m_map(Kokkos::Impl::ViewCtorProp<pointer_type>(p),
              // Each extent is end - begin; unused ranks get extent 0.
              typename traits::array_layout(
                  Rank > 0 ? at(ends_, 0) - at(begins_, 0) : 0,
                  Rank > 1 ? at(ends_, 1) - at(begins_, 1) : 0,
                  Rank > 2 ? at(ends_, 2) - at(begins_, 2) : 0,
                  Rank > 3 ? at(ends_, 3) - at(begins_, 3) : 0,
                  Rank > 4 ? at(ends_, 4) - at(begins_, 4) : 0,
                  Rank > 5 ? at(ends_, 5) - at(begins_, 5) : 0,
                  Rank > 6 ? at(ends_, 6) - at(begins_, 6) : 0,
                  Rank > 7 ? at(ends_, 7) - at(begins_, 7) : 0)) {
    for (size_t i = 0; i != m_begins.size(); ++i) {
      m_begins[i] = at(begins_, i);
    };
  }

 public:
  // Constructor around unmanaged data
  // Four overloads, as both begins and ends can be either
  // begins_type or index_list_type
  // Each delegates to the private constructor above after range validation.
  KOKKOS_INLINE_FUNCTION
  OffsetView(const pointer_type& p, const begins_type& begins_,
             const begins_type& ends_)
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
      : OffsetView(p, begins_, ends_,
                   runtime_check_begins_ends_host(begins_, ends_))
#else
      : OffsetView(p, begins_, ends_,
                   runtime_check_begins_ends_device(begins_, ends_))
#endif
  {
  }

  KOKKOS_INLINE_FUNCTION
  OffsetView(const pointer_type& p, const begins_type& begins_,
             index_list_type ends_)
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
      : OffsetView(p, begins_, ends_,
                   runtime_check_begins_ends_host(begins_, ends_))
#else
      : OffsetView(p, begins_, ends_,
                   runtime_check_begins_ends_device(begins_, ends_))
#endif
  {
  }

  KOKKOS_INLINE_FUNCTION
  OffsetView(const pointer_type& p, index_list_type begins_,
             const begins_type& ends_)
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
      : OffsetView(p, begins_, ends_,
                   runtime_check_begins_ends_host(begins_, ends_))
#else
      : OffsetView(p, begins_, ends_,
                   runtime_check_begins_ends_device(begins_, ends_))
#endif
  {
  }

  KOKKOS_INLINE_FUNCTION
  OffsetView(const pointer_type& p, index_list_type begins_,
             index_list_type ends_)
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
      : OffsetView(p, begins_, ends_,
                   runtime_check_begins_ends_host(begins_, ends_))
#else
      : OffsetView(p, begins_, ends_,
                   runtime_check_begins_ends_device(begins_, ends_))
#endif
  {
  }

  //----------------------------------------
  // Allocation tracking properties
  KOKKOS_INLINE_FUNCTION
  int use_count() const { return m_track.use_count(); }

  // Label of the tracked allocation (host only).
  inline const std::string label() const {
    return m_track.template get_label<typename traits::memory_space>();
  }

  // Allocating constructor from a label and per-rank {begin, end} index
  // ranges; each range contributes extent end - begin + 1 (ranges are
  // inclusive) and begin offset range[0].
  template <typename Label>
  explicit inline OffsetView(
      const Label& arg_label,
      typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value,
                              const index_list_type>::type range0,
      const index_list_type range1 = KOKKOS_INVALID_INDEX_RANGE,
      const index_list_type range2 = KOKKOS_INVALID_INDEX_RANGE,
      const index_list_type range3 = KOKKOS_INVALID_INDEX_RANGE,
      const index_list_type range4 = KOKKOS_INVALID_INDEX_RANGE,
      const index_list_type range5 = KOKKOS_INVALID_INDEX_RANGE,
      const index_list_type range6 = KOKKOS_INVALID_INDEX_RANGE,
      const index_list_type range7 = KOKKOS_INVALID_INDEX_RANGE

      )
      : OffsetView(Kokkos::Impl::ViewCtorProp<std::string>(arg_label),
                   typename traits::array_layout(
                       range0.begin()[1] - range0.begin()[0] + 1,
                       range1.begin()[1] - range1.begin()[0] + 1,
                       range2.begin()[1] - range2.begin()[0] + 1,
                       range3.begin()[1] - range3.begin()[0] + 1,
                       range4.begin()[1] - range4.begin()[0] + 1,
                       range5.begin()[1] - range5.begin()[0] + 1,
                       range6.begin()[1] - range6.begin()[0] + 1,
                       range7.begin()[1] - range7.begin()[0] + 1),
                   {range0.begin()[0], range1.begin()[0], range2.begin()[0],
                    range3.begin()[0], range4.begin()[0], range5.begin()[0],
                    range6.begin()[0], range7.begin()[0]}) {}

  template <class...
            P>
  // Wrap user-provided memory (ViewCtorProp carries a pointer): no tracking,
  // begins taken from minIndices.
  explicit KOKKOS_INLINE_FUNCTION OffsetView(
      const Kokkos::Impl::ViewCtorProp<P...>& arg_prop,
      typename std::enable_if<Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
                              typename traits::array_layout>::type const&
          arg_layout,
      const index_list_type minIndices)
      : m_track()  // No memory tracking
        ,
        m_map(arg_prop, arg_layout) {
    for (size_t i = 0; i < minIndices.size(); ++i) {
      m_begins[i] = minIndices.begin()[i];
    }
    static_assert(
        std::is_same<pointer_type, typename Kokkos::Impl::ViewCtorProp<
                                       P...>::pointer_type>::value,
        "When constructing OffsetView to wrap user memory, you must supply "
        "matching pointer type");
  }

  // Allocating constructor (ViewCtorProp without a pointer): allocates
  // managed memory, appending defaulted label/memory/execution-space
  // properties where the caller did not supply them.
  template <class... P>
  explicit inline OffsetView(
      const Kokkos::Impl::ViewCtorProp<P...>& arg_prop,
      typename std::enable_if<!Kokkos::Impl::ViewCtorProp<P...>::has_pointer,
                              typename traits::array_layout>::type const&
          arg_layout,
      const index_list_type minIndices)
      : m_track(),
        m_map()

  {
    for (size_t i = 0; i < Rank; ++i) m_begins[i] = minIndices.begin()[i];

    // Append layout and spaces if not input
    using alloc_prop_input = Kokkos::Impl::ViewCtorProp<P...>;

    // use 'std::integral_constant<unsigned,I>' for non-types
    // to avoid duplicate class error.
    using alloc_prop = Kokkos::Impl::ViewCtorProp<
        P...,
        typename std::conditional<alloc_prop_input::has_label,
                                  std::integral_constant<unsigned, 0>,
                                  typename std::string>::type,
        typename std::conditional<
            alloc_prop_input::has_memory_space,
            std::integral_constant<unsigned, 1>,
            typename traits::device_type::memory_space>::type,
        typename std::conditional<
            alloc_prop_input::has_execution_space,
            std::integral_constant<unsigned, 2>,
            typename traits::device_type::execution_space>::type>;

    static_assert(traits::is_managed,
                  "OffsetView allocation constructor requires managed memory");

    if (alloc_prop::initialize &&
        !alloc_prop::execution_space::impl_is_initialized()) {
      // If initializing view data then
      // the execution space must be initialized.
      Kokkos::Impl::throw_runtime_exception(
          "Constructing OffsetView and initializing data with uninitialized "
          "execution space");
    }

    // Copy the input allocation properties with possibly defaulted properties
    alloc_prop prop_copy(arg_prop);

    //------------------------------------------------------------
#if defined(KOKKOS_ENABLE_CUDA)
    // If allocating in CudaUVMSpace must fence before and after
    // the allocation to protect against possible concurrent access
    // on the CPU and the GPU.
    // Fence using the trait's execution space (which will be Kokkos::Cuda)
    // to avoid incomplete type errors from using Kokkos::Cuda directly.
    if (std::is_same<Kokkos::CudaUVMSpace,
                     typename traits::device_type::memory_space>::value) {
      typename traits::device_type::memory_space::execution_space().fence();
    }
#endif
    //------------------------------------------------------------

    Kokkos::Impl::SharedAllocationRecord<>* record =
        m_map.allocate_shared(prop_copy, arg_layout);

    //------------------------------------------------------------
#if defined(KOKKOS_ENABLE_CUDA)
    if (std::is_same<Kokkos::CudaUVMSpace,
                     typename traits::device_type::memory_space>::value) {
      typename traits::device_type::memory_space::execution_space().fence();
    }
#endif
    //------------------------------------------------------------

    // Setup and initialization complete, start tracking
    m_track.assign_allocated_record_to_uninitialized(record);

#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
    Kokkos::Experimental::Impl::runtime_check_rank_host(
        traits::rank_dynamic, Rank, minIndices, label());
#else
    Kokkos::Experimental::Impl::runtime_check_rank_device(traits::rank_dynamic,
                                                          Rank, minIndices);

#endif
  }
};

/** \brief Temporary free function rank()
 *  until rank() is implemented
 *  in the View
 */
template <typename D, class...
P> +KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView<D, P...>& V) { + return V.Rank; +} // Temporary until added to view + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +namespace Impl { + +template <class T> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<std::is_integral<T>::value, T>::type + shift_input(const T arg, const int64_t offset) { + return arg - offset; +} + +KOKKOS_INLINE_FUNCTION +Kokkos::Impl::ALL_t shift_input(const Kokkos::Impl::ALL_t arg, + const int64_t /*offset*/) { + return arg; +} + +template <class T> +KOKKOS_INLINE_FUNCTION typename std::enable_if<std::is_integral<T>::value, + Kokkos::pair<T, T> >::type +shift_input(const Kokkos::pair<T, T> arg, const int64_t offset) { + return Kokkos::make_pair<T, T>(arg.first - offset, arg.second - offset); +} +template <class T> +inline + typename std::enable_if<std::is_integral<T>::value, std::pair<T, T> >::type + shift_input(const std::pair<T, T> arg, const int64_t offset) { + return std::make_pair<T, T>(arg.first - offset, arg.second - offset); +} + +template <size_t N, class Arg, class A> +KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin( + const size_t i, Kokkos::Array<int64_t, N>& subviewBegins, + typename std::enable_if<N != 0, const Arg>::type shiftedArg, const Arg arg, + const A viewBegins, size_t& counter) { + if (!std::is_integral<Arg>::value) { + subviewBegins[counter] = shiftedArg == arg ? viewBegins[i] : 0; + counter++; + } +} + +template <size_t N, class Arg, class A> +KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin( + const size_t /*i*/, Kokkos::Array<int64_t, N>& /*subviewBegins*/, + typename std::enable_if<N == 0, const Arg>::type /*shiftedArg*/, + const Arg /*arg*/, const A /*viewBegins*/, size_t& /*counter*/) {} + +template <class D, class... 
          P, class T>
KOKKOS_INLINE_FUNCTION
    typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
        typename Kokkos::Impl::ViewMapping<void /* deduce subview type from
                                                   source view traits */
                                           ,
                                           ViewTraits<D, P...>, T>::type>::type
    subview_offset(const OffsetView<D, P...>& src, T arg) {
  // Shift the argument to the zero-based frame, take the View subview, then
  // rebuild the begins for the (possibly rank-reduced) result.
  auto theView = src.view();
  auto begins  = src.begins();

  T shiftedArg = shift_input(arg, begins[0]);

  constexpr size_t rank =
      Kokkos::Impl::ViewMapping<void /* deduce subview type from source view
                                        traits */
                                ,
                                ViewTraits<D, P...>, T>::type::Rank;

  auto theSubview = Kokkos::subview(theView, shiftedArg);

  Kokkos::Array<int64_t, rank> subviewBegins;
  size_t counter = 0;
  Kokkos::Experimental::Impl::map_arg_to_new_begin(0, subviewBegins, shiftedArg,
                                                   arg, begins, counter);

  typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
      typename Kokkos::Impl::ViewMapping<void /* deduce subview type from source
                                                 view traits */
                                         ,
                                         ViewTraits<D, P...>, T>::type>::type
      offsetView(theSubview, subviewBegins);

  return offsetView;
}

// 2-argument subview of an OffsetView.
template <class D, class... P, class T0, class T1>
KOKKOS_INLINE_FUNCTION
    typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
        typename Kokkos::Impl::ViewMapping<
            void /* deduce subview type from source view traits */
            ,
            ViewTraits<D, P...>, T0, T1>::type>::type
    subview_offset(const Kokkos::Experimental::OffsetView<D, P...>& src,
                   T0 arg0, T1 arg1) {
  auto theView = src.view();
  auto begins  = src.begins();

  T0 shiftedArg0 = shift_input(arg0, begins[0]);
  T1 shiftedArg1 = shift_input(arg1, begins[1]);

  auto theSubview = Kokkos::subview(theView, shiftedArg0, shiftedArg1);
  constexpr size_t rank =
      Kokkos::Impl::ViewMapping<void /* deduce subview type from source view
                                        traits */
                                ,
                                ViewTraits<D, P...>, T0, T1>::type::Rank;

  Kokkos::Array<int64_t, rank> subviewBegins;
  size_t counter = 0;
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      0, subviewBegins, shiftedArg0, arg0, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      1, subviewBegins, shiftedArg1, arg1, begins, counter);

  typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
      typename Kokkos::Impl::ViewMapping<
          void /* deduce subview type from source view traits */
          ,
          ViewTraits<D, P...>, T0, T1>::type>::type offsetView(theSubview,
                                                               subviewBegins);

  return offsetView;
}

// 3-argument subview of an OffsetView.
template <class D, class... P, class T0, class T1, class T2>
KOKKOS_INLINE_FUNCTION
    typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
        typename Kokkos::Impl::ViewMapping<
            void /* deduce subview type from source view traits */
            ,
            ViewTraits<D, P...>, T0, T1, T2>::type>::type
    subview_offset(const OffsetView<D, P...>& src, T0 arg0, T1 arg1, T2 arg2) {
  auto theView = src.view();
  auto begins  = src.begins();

  T0 shiftedArg0 = shift_input(arg0, begins[0]);
  T1 shiftedArg1 = shift_input(arg1, begins[1]);
  T2 shiftedArg2 = shift_input(arg2, begins[2]);

  auto theSubview =
      Kokkos::subview(theView, shiftedArg0, shiftedArg1, shiftedArg2);

  constexpr size_t rank =
      Kokkos::Impl::ViewMapping<void /* deduce subview type from source view
                                        traits */
                                ,
                                ViewTraits<D, P...>, T0, T1, T2>::type::Rank;

  Kokkos::Array<int64_t, rank> subviewBegins;

  size_t counter = 0;
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      0, subviewBegins, shiftedArg0, arg0, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      1, subviewBegins, shiftedArg1, arg1, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      2, subviewBegins, shiftedArg2, arg2, begins, counter);

  typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
      typename Kokkos::Impl::ViewMapping<
          void /* deduce subview type from source view traits */
          ,
          ViewTraits<D, P...>, T0, T1, T2>::type>::type
      offsetView(theSubview, subviewBegins);

  return offsetView;
}

// 4-argument subview of an OffsetView.
template <class D, class...
          P, class T0, class T1, class T2, class T3>
KOKKOS_INLINE_FUNCTION
    typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
        typename Kokkos::Impl::ViewMapping<
            void /* deduce subview type from source view traits */
            ,
            ViewTraits<D, P...>, T0, T1, T2, T3>::type>::type
    subview_offset(const OffsetView<D, P...>& src, T0 arg0, T1 arg1, T2 arg2,
                   T3 arg3) {
  auto theView = src.view();
  auto begins  = src.begins();

  T0 shiftedArg0 = shift_input(arg0, begins[0]);
  T1 shiftedArg1 = shift_input(arg1, begins[1]);
  T2 shiftedArg2 = shift_input(arg2, begins[2]);
  T3 shiftedArg3 = shift_input(arg3, begins[3]);

  auto theSubview = Kokkos::subview(theView, shiftedArg0, shiftedArg1,
                                    shiftedArg2, shiftedArg3);

  constexpr size_t rank = Kokkos::Impl::ViewMapping<
      void /* deduce subview type from source view traits */
      ,
      ViewTraits<D, P...>, T0, T1, T2, T3>::type::Rank;
  Kokkos::Array<int64_t, rank> subviewBegins;

  size_t counter = 0;
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      0, subviewBegins, shiftedArg0, arg0, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      1, subviewBegins, shiftedArg1, arg1, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      2, subviewBegins, shiftedArg2, arg2, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      3, subviewBegins, shiftedArg3, arg3, begins, counter);

  typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
      typename Kokkos::Impl::ViewMapping<
          void /* deduce subview type from source view traits */
          ,
          ViewTraits<D, P...>, T0, T1, T2, T3>::type>::type
      offsetView(theSubview, subviewBegins);

  return offsetView;
}

// 5-argument subview of an OffsetView.
template <class D, class... P, class T0, class T1, class T2, class T3, class T4>
KOKKOS_INLINE_FUNCTION
    typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
        typename Kokkos::Impl::ViewMapping<
            void /* deduce subview type from source view traits */
            ,
            ViewTraits<D, P...>, T0, T1, T2, T3, T4>::type>::type
    subview_offset(const OffsetView<D, P...>& src, T0 arg0, T1 arg1, T2 arg2,
                   T3 arg3, T4 arg4) {
  auto theView = src.view();
  auto begins  = src.begins();

  T0 shiftedArg0 = shift_input(arg0, begins[0]);
  T1 shiftedArg1 = shift_input(arg1, begins[1]);
  T2 shiftedArg2 = shift_input(arg2, begins[2]);
  T3 shiftedArg3 = shift_input(arg3, begins[3]);
  T4 shiftedArg4 = shift_input(arg4, begins[4]);

  auto theSubview = Kokkos::subview(theView, shiftedArg0, shiftedArg1,
                                    shiftedArg2, shiftedArg3, shiftedArg4);

  constexpr size_t rank = Kokkos::Impl::ViewMapping<
      void /* deduce subview type from source view traits */
      ,
      ViewTraits<D, P...>, T0, T1, T2, T3, T4>::type::Rank;
  Kokkos::Array<int64_t, rank> subviewBegins;

  size_t counter = 0;
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      0, subviewBegins, shiftedArg0, arg0, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      1, subviewBegins, shiftedArg1, arg1, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      2, subviewBegins, shiftedArg2, arg2, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      3, subviewBegins, shiftedArg3, arg3, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      4, subviewBegins, shiftedArg4, arg4, begins, counter);

  typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
      typename Kokkos::Impl::ViewMapping<
          void /* deduce subview type from source view traits */
          ,
          ViewTraits<D, P...>, T0, T1, T2, T3, T4>::type>::type
      offsetView(theSubview, subviewBegins);

  return offsetView;
}

// 6-argument subview of an OffsetView.
template <class D, class... P, class T0, class T1, class T2, class T3, class T4,
          class T5>
KOKKOS_INLINE_FUNCTION
    typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
        typename Kokkos::Impl::ViewMapping<
            void /* deduce subview type from source view traits */
            ,
            ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5>::type>::type
    subview_offset(const OffsetView<D, P...>& src, T0 arg0, T1 arg1, T2 arg2,
                   T3 arg3, T4 arg4, T5 arg5) {
  auto theView = src.view();
  auto begins  = src.begins();

  T0 shiftedArg0 = shift_input(arg0, begins[0]);
  T1 shiftedArg1 = shift_input(arg1, begins[1]);
  T2 shiftedArg2 = shift_input(arg2, begins[2]);
  T3 shiftedArg3 = shift_input(arg3, begins[3]);
  T4 shiftedArg4 = shift_input(arg4, begins[4]);
  T5 shiftedArg5 = shift_input(arg5, begins[5]);

  auto theSubview =
      Kokkos::subview(theView, shiftedArg0, shiftedArg1, shiftedArg2,
                      shiftedArg3, shiftedArg4, shiftedArg5);

  constexpr size_t rank = Kokkos::Impl::ViewMapping<
      void /* deduce subview type from source view traits */
      ,
      ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5>::type::Rank;

  Kokkos::Array<int64_t, rank> subviewBegins;

  size_t counter = 0;
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      0, subviewBegins, shiftedArg0, arg0, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      1, subviewBegins, shiftedArg1, arg1, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      2, subviewBegins, shiftedArg2, arg2, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      3, subviewBegins, shiftedArg3, arg3, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      4, subviewBegins, shiftedArg4, arg4, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      5, subviewBegins, shiftedArg5, arg5, begins, counter);

  typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
      typename Kokkos::Impl::ViewMapping<
          void /* deduce subview type from source view traits */
          ,
          ViewTraits<D,
                     P...>, T0, T1, T2, T3, T4, T5>::type>::type
      offsetView(theSubview, subviewBegins);

  return offsetView;
}
// 7-argument subview of an OffsetView.
template <class D, class... P, class T0, class T1, class T2, class T3, class T4,
          class T5, class T6>
KOKKOS_INLINE_FUNCTION
    typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
        typename Kokkos::Impl::ViewMapping<
            void /* deduce subview type from source view traits */
            ,
            ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6>::type>::type
    subview_offset(const OffsetView<D, P...>& src, T0 arg0, T1 arg1, T2 arg2,
                   T3 arg3, T4 arg4, T5 arg5, T6 arg6) {
  auto theView = src.view();
  auto begins  = src.begins();

  T0 shiftedArg0 = shift_input(arg0, begins[0]);
  T1 shiftedArg1 = shift_input(arg1, begins[1]);
  T2 shiftedArg2 = shift_input(arg2, begins[2]);
  T3 shiftedArg3 = shift_input(arg3, begins[3]);
  T4 shiftedArg4 = shift_input(arg4, begins[4]);
  T5 shiftedArg5 = shift_input(arg5, begins[5]);
  T6 shiftedArg6 = shift_input(arg6, begins[6]);

  auto theSubview =
      Kokkos::subview(theView, shiftedArg0, shiftedArg1, shiftedArg2,
                      shiftedArg3, shiftedArg4, shiftedArg5, shiftedArg6);

  constexpr size_t rank = Kokkos::Impl::ViewMapping<
      void /* deduce subview type from source view traits */
      ,
      ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6>::type::Rank;

  Kokkos::Array<int64_t, rank> subviewBegins;

  size_t counter = 0;
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      0, subviewBegins, shiftedArg0, arg0, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      1, subviewBegins, shiftedArg1, arg1, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      2, subviewBegins, shiftedArg2, arg2, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      3, subviewBegins, shiftedArg3, arg3, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      4, subviewBegins, shiftedArg4, arg4, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      5, subviewBegins, shiftedArg5, arg5, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      6, subviewBegins, shiftedArg6, arg6, begins, counter);

  typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
      typename Kokkos::Impl::ViewMapping<
          void /* deduce subview type from source view traits */
          ,
          ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6>::type>::type
      offsetView(theSubview, subviewBegins);

  return offsetView;
}

// 8-argument subview of an OffsetView.
template <class D, class... P, class T0, class T1, class T2, class T3, class T4,
          class T5, class T6, class T7>
KOKKOS_INLINE_FUNCTION
    typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
        typename Kokkos::Impl::ViewMapping<
            void /* deduce subview type from source view traits */
            ,
            ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6, T7>::type>::type
    subview_offset(const OffsetView<D, P...>& src, T0 arg0, T1 arg1, T2 arg2,
                   T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7) {
  auto theView = src.view();
  auto begins  = src.begins();

  T0 shiftedArg0 = shift_input(arg0, begins[0]);
  T1 shiftedArg1 = shift_input(arg1, begins[1]);
  T2 shiftedArg2 = shift_input(arg2, begins[2]);
  T3 shiftedArg3 = shift_input(arg3, begins[3]);
  T4 shiftedArg4 = shift_input(arg4, begins[4]);
  T5 shiftedArg5 = shift_input(arg5, begins[5]);
  T6 shiftedArg6 = shift_input(arg6, begins[6]);
  T7 shiftedArg7 = shift_input(arg7, begins[7]);

  auto theSubview = Kokkos::subview(theView, shiftedArg0, shiftedArg1,
                                    shiftedArg2, shiftedArg3, shiftedArg4,
                                    shiftedArg5, shiftedArg6, shiftedArg7);

  constexpr size_t rank = Kokkos::Impl::ViewMapping<
      void /* deduce subview type from source view traits */
      ,
      ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6, T7>::type::Rank;

  Kokkos::Array<int64_t, rank> subviewBegins;

  size_t counter = 0;
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      0, subviewBegins, shiftedArg0, arg0, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      1, subviewBegins, shiftedArg1, arg1, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      2, subviewBegins, shiftedArg2, arg2, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      3, subviewBegins, shiftedArg3, arg3, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      4, subviewBegins, shiftedArg4, arg4, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      5, subviewBegins, shiftedArg5, arg5, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      6, subviewBegins, shiftedArg6, arg6, begins, counter);
  Kokkos::Experimental::Impl::map_arg_to_new_begin(
      7, subviewBegins, shiftedArg7, arg7, begins, counter);

  typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
      typename Kokkos::Impl::ViewMapping<
          void /* deduce subview type from source view traits */
          ,
          ViewTraits<D, P...>, T0, T1, T2, T3, T4, T5, T6, T7>::type>::type
      offsetView(theSubview, subviewBegins);

  return offsetView;
}
}  // namespace Impl

// Subview of an OffsetView: requires exactly one argument per source rank;
// dispatches to the Impl::subview_offset overload of matching arity.
template <class D, class... P, class... Args>
KOKKOS_INLINE_FUNCTION
    typename Kokkos::Experimental::Impl::GetOffsetViewTypeFromViewType<
        typename Kokkos::Impl::ViewMapping<
            void /* deduce subview type from source view traits */
            ,
            ViewTraits<D, P...>, Args...>::type>::type
    subview(const OffsetView<D, P...>& src, Args... args) {
  static_assert(
      OffsetView<D, P...>::Rank == sizeof...(Args),
      "subview requires one argument for each source OffsetView rank");

  return Kokkos::Experimental::Impl::subview_offset(src, args...);
}

}  // namespace Experimental
}  // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {
template <class LT, class... LP, class RT, class...
RP> +KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView<LT, LP...>& lhs, + const OffsetView<RT, RP...>& rhs) { + // Same data, layout, dimensions + using lhs_traits = ViewTraits<LT, LP...>; + using rhs_traits = ViewTraits<RT, RP...>; + + return std::is_same<typename lhs_traits::const_value_type, + typename rhs_traits::const_value_type>::value && + std::is_same<typename lhs_traits::array_layout, + typename rhs_traits::array_layout>::value && + std::is_same<typename lhs_traits::memory_space, + typename rhs_traits::memory_space>::value && + unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && + lhs.data() == rhs.data() && lhs.span() == rhs.span() && + lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && + lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && + lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && + lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7) && + lhs.begin(0) == rhs.begin(0) && lhs.begin(1) == rhs.begin(1) && + lhs.begin(2) == rhs.begin(2) && lhs.begin(3) == rhs.begin(3) && + lhs.begin(4) == rhs.begin(4) && lhs.begin(5) == rhs.begin(5) && + lhs.begin(6) == rhs.begin(6) && lhs.begin(7) == rhs.begin(7); +} + +template <class LT, class... LP, class RT, class... RP> +KOKKOS_INLINE_FUNCTION bool operator!=(const OffsetView<LT, LP...>& lhs, + const OffsetView<RT, RP...>& rhs) { + return !(operator==(lhs, rhs)); +} + +template <class LT, class... LP, class RT, class... 
RP> +KOKKOS_INLINE_FUNCTION bool operator==(const View<LT, LP...>& lhs, + const OffsetView<RT, RP...>& rhs) { + // Same data, layout, dimensions + using lhs_traits = ViewTraits<LT, LP...>; + using rhs_traits = ViewTraits<RT, RP...>; + + return std::is_same<typename lhs_traits::const_value_type, + typename rhs_traits::const_value_type>::value && + std::is_same<typename lhs_traits::array_layout, + typename rhs_traits::array_layout>::value && + std::is_same<typename lhs_traits::memory_space, + typename rhs_traits::memory_space>::value && + unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && + lhs.data() == rhs.data() && lhs.span() == rhs.span() && + lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && + lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && + lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && + lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); +} + +template <class LT, class... LP, class RT, class... RP> +KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView<LT, LP...>& lhs, + const View<RT, RP...>& rhs) { + return rhs == lhs; +} + +} // namespace Experimental +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <class DT, class... DP> +inline void deep_copy( + const Experimental::OffsetView<DT, DP...>& dst, + typename ViewTraits<DT, DP...>::const_value_type& value, + typename std::enable_if<std::is_same< + typename ViewTraits<DT, DP...>::specialize, void>::value>::type* = + nullptr) { + static_assert( + std::is_same<typename ViewTraits<DT, DP...>::non_const_value_type, + typename ViewTraits<DT, DP...>::value_type>::value, + "deep_copy requires non-const type"); + + auto dstView = dst.view(); + Kokkos::deep_copy(dstView, value); +} + +template <class DT, class... DP, class ST, class... 
SP> +inline void deep_copy( + const Experimental::OffsetView<DT, DP...>& dst, + const Experimental::OffsetView<ST, SP...>& value, + typename std::enable_if<std::is_same< + typename ViewTraits<DT, DP...>::specialize, void>::value>::type* = + nullptr) { + static_assert( + std::is_same<typename ViewTraits<DT, DP...>::value_type, + typename ViewTraits<ST, SP...>::non_const_value_type>::value, + "deep_copy requires matching non-const destination type"); + + auto dstView = dst.view(); + Kokkos::deep_copy(dstView, value.view()); +} +template <class DT, class... DP, class ST, class... SP> +inline void deep_copy( + const Experimental::OffsetView<DT, DP...>& dst, + const View<ST, SP...>& value, + typename std::enable_if<std::is_same< + typename ViewTraits<DT, DP...>::specialize, void>::value>::type* = + nullptr) { + static_assert( + std::is_same<typename ViewTraits<DT, DP...>::value_type, + typename ViewTraits<ST, SP...>::non_const_value_type>::value, + "deep_copy requires matching non-const destination type"); + + auto dstView = dst.view(); + Kokkos::deep_copy(dstView, value); +} + +template <class DT, class... DP, class ST, class... SP> +inline void deep_copy( + const View<DT, DP...>& dst, + const Experimental::OffsetView<ST, SP...>& value, + typename std::enable_if<std::is_same< + typename ViewTraits<DT, DP...>::specialize, void>::value>::type* = + nullptr) { + static_assert( + std::is_same<typename ViewTraits<DT, DP...>::value_type, + typename ViewTraits<ST, SP...>::non_const_value_type>::value, + "deep_copy requires matching non-const destination type"); + + Kokkos::deep_copy(dst, value.view()); +} + +namespace Impl { + +// Deduce Mirror Types +template <class Space, class T, class... 
P> +struct MirrorOffsetViewType { + // The incoming view_type + using src_view_type = typename Kokkos::Experimental::OffsetView<T, P...>; + // The memory space for the mirror view + using memory_space = typename Space::memory_space; + // Check whether it is the same memory space + enum { + is_same_memspace = + std::is_same<memory_space, typename src_view_type::memory_space>::value + }; + // The array_layout + using array_layout = typename src_view_type::array_layout; + // The data type (we probably want it non-const since otherwise we can't even + // deep_copy to it. + using data_type = typename src_view_type::non_const_data_type; + // The destination view type if it is not the same memory space + using dest_view_type = + Kokkos::Experimental::OffsetView<data_type, array_layout, Space>; + // If it is the same memory_space return the existsing view_type + // This will also keep the unmanaged trait if necessary + using view_type = typename std::conditional<is_same_memspace, src_view_type, + dest_view_type>::type; +}; + +template <class Space, class T, class... P> +struct MirrorOffsetType { + // The incoming view_type + using src_view_type = typename Kokkos::Experimental::OffsetView<T, P...>; + // The memory space for the mirror view + using memory_space = typename Space::memory_space; + // Check whether it is the same memory space + enum { + is_same_memspace = + std::is_same<memory_space, typename src_view_type::memory_space>::value + }; + // The array_layout + using array_layout = typename src_view_type::array_layout; + // The data type (we probably want it non-const since otherwise we can't even + // deep_copy to it. + using data_type = typename src_view_type::non_const_data_type; + // The destination view type if it is not the same memory space + using view_type = + Kokkos::Experimental::OffsetView<data_type, array_layout, Space>; +}; + +} // namespace Impl + +template <class T, class... 
P> +inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror +create_mirror( + const Kokkos::Experimental::OffsetView<T, P...>& src, + typename std::enable_if< + !std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout, + Kokkos::LayoutStride>::value>::type* = nullptr) { + using src_type = Experimental::OffsetView<T, P...>; + using dst_type = typename src_type::HostMirror; + + return dst_type( + Kokkos::Impl::ViewCtorProp<std::string>( + std::string(src.label()).append("_mirror")), + typename Kokkos::ViewTraits<T, P...>::array_layout( + src.extent(0), src.extent(1), src.extent(2), src.extent(3), + src.extent(4), src.extent(5), src.extent(6), src.extent(7)), + {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), + src.begin(5), src.begin(6), src.begin(7)}); +} + +template <class T, class... P> +inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror +create_mirror( + const Kokkos::Experimental::OffsetView<T, P...>& src, + typename std::enable_if< + std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout, + Kokkos::LayoutStride>::value>::type* = nullptr) { + using src_type = Experimental::OffsetView<T, P...>; + using dst_type = typename src_type::HostMirror; + + Kokkos::LayoutStride layout; + + layout.dimension[0] = src.extent(0); + layout.dimension[1] = src.extent(1); + layout.dimension[2] = src.extent(2); + layout.dimension[3] = src.extent(3); + layout.dimension[4] = src.extent(4); + layout.dimension[5] = src.extent(5); + layout.dimension[6] = src.extent(6); + layout.dimension[7] = src.extent(7); + + layout.stride[0] = src.stride_0(); + layout.stride[1] = src.stride_1(); + layout.stride[2] = src.stride_2(); + layout.stride[3] = src.stride_3(); + layout.stride[4] = src.stride_4(); + layout.stride[5] = src.stride_5(); + layout.stride[6] = src.stride_6(); + layout.stride[7] = src.stride_7(); + + return dst_type(std::string(src.label()).append("_mirror"), layout, + {src.begin(0), src.begin(1), src.begin(2), 
src.begin(3), + src.begin(4), src.begin(5), src.begin(6), src.begin(7)}); +} + +// Create a mirror in a new space (specialization for different space) +template <class Space, class T, class... P> +typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type +create_mirror(const Space&, + const Kokkos::Experimental::OffsetView<T, P...>& src) { + return typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type( + src.label(), src.layout(), + {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), + src.begin(5), src.begin(6), src.begin(7)}); +} + +template <class T, class... P> +inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror +create_mirror_view( + const typename Kokkos::Experimental::OffsetView<T, P...>& src, + typename std::enable_if< + (std::is_same< + typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::OffsetView<T, P...>::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>::value)>::type* = nullptr) { + return src; +} + +template <class T, class... P> +inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror +create_mirror_view( + const Kokkos::Experimental::OffsetView<T, P...>& src, + typename std::enable_if< + !(std::is_same< + typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::OffsetView<T, P...>::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>::value)>::type* = nullptr) { + return Kokkos::create_mirror(src); +} + +// Create a mirror view in a new space (specialization for same space) +template <class Space, class T, class... 
P> +typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type +create_mirror_view(const Space&, + const Kokkos::Experimental::OffsetView<T, P...>& src, + typename std::enable_if<Impl::MirrorOffsetViewType< + Space, T, P...>::is_same_memspace>::type* = nullptr) { + return src; +} + +// Create a mirror view in a new space (specialization for different space) +template <class Space, class T, class... P> +typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type +create_mirror_view(const Space&, + const Kokkos::Experimental::OffsetView<T, P...>& src, + typename std::enable_if<!Impl::MirrorOffsetViewType< + Space, T, P...>::is_same_memspace>::type* = nullptr) { + return typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type( + src.label(), src.layout(), + {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), + src.begin(5), src.begin(6), src.begin(7)}); +} +// +// // Create a mirror view and deep_copy in a new space (specialization for +// same space) template<class Space, class T, class ... P> typename +// Kokkos::Experimental::Impl::MirrorViewType<Space,T,P ...>::view_type +// create_mirror_view_and_copy(const Space& , const +// Kokkos::Experimental::OffsetView<T,P...> & src +// , std::string const& name = "" +// , typename +// std::enable_if<Impl::MirrorViewType<Space,T,P +// ...>::is_same_memspace>::type* = nullptr) { +// (void)name; +// return src; +// } +// +// // Create a mirror view and deep_copy in a new space (specialization for +// different space) template<class Space, class T, class ... 
P> typename +// Kokkos::Experimental::Impl::MirrorViewType<Space,T,P ...>::view_type +// create_mirror_view_and_copy(const Space& , const +// Kokkos::Experimental::OffsetView<T,P...> & src +// , std::string const& name = "" +// , typename +// std::enable_if<!Impl::MirrorViewType<Space,T,P +// ...>::is_same_memspace>::type* = nullptr) { +// using Mirror = typename +// Kokkos::Experimental::Impl::MirrorViewType<Space,T,P ...>::view_type; +// std::string label = name.empty() ? src.label() : name; +// auto mirror = Mirror(view_alloc(WithoutInitializing, label), src.layout(), +// { src.begin(0), src.begin(1), src.begin(2), +// src.begin(3), src.begin(4), +// src.begin(5), src.begin(6), src.begin(7) }); +// deep_copy(mirror, src); +// return mirror; +// } + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* KOKKOS_OFFSETVIEW_HPP_ */ diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..dcd4cf73e5d710bc427772a8a8de6384e80c9dae --- /dev/null +++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -0,0 +1,1531 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_ScatterView.hpp +/// \brief Declaration and definition of Kokkos::ScatterView. +/// +/// This header file declares and defines Kokkos::ScatterView and its +/// related nonmember functions. + +#ifndef KOKKOS_SCATTER_VIEW_HPP +#define KOKKOS_SCATTER_VIEW_HPP + +#include <Kokkos_Core.hpp> +#include <utility> + +namespace Kokkos { +namespace Experimental { + +/* + * Reduction Type list + * - These corresponds to subset of the reducers in parallel_reduce + * - See Implementations of ScatterValue for details. 
+ */ +struct ScatterSum {}; +struct ScatterProd {}; +struct ScatterMax {}; +struct ScatterMin {}; + +struct ScatterNonDuplicated {}; +struct ScatterDuplicated {}; + +struct ScatterNonAtomic {}; +struct ScatterAtomic {}; + +} // namespace Experimental +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +template <typename ExecSpace> +struct DefaultDuplication; + +template <typename ExecSpace, typename Duplication> +struct DefaultContribution; + +#ifdef KOKKOS_ENABLE_SERIAL +template <> +struct DefaultDuplication<Kokkos::Serial> { + using type = Kokkos::Experimental::ScatterNonDuplicated; +}; + +template <> +struct DefaultContribution<Kokkos::Serial, + Kokkos::Experimental::ScatterNonDuplicated> { + using type = Kokkos::Experimental::ScatterNonAtomic; +}; +template <> +struct DefaultContribution<Kokkos::Serial, + Kokkos::Experimental::ScatterDuplicated> { + using type = Kokkos::Experimental::ScatterNonAtomic; +}; +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +template <> +struct DefaultDuplication<Kokkos::OpenMP> { + using type = Kokkos::Experimental::ScatterDuplicated; +}; +template <> +struct DefaultContribution<Kokkos::OpenMP, + Kokkos::Experimental::ScatterNonDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +template <> +struct DefaultContribution<Kokkos::OpenMP, + Kokkos::Experimental::ScatterDuplicated> { + using type = Kokkos::Experimental::ScatterNonAtomic; +}; +#endif + +#ifdef KOKKOS_ENABLE_OPENMPTARGET +template <> +struct DefaultDuplication<Kokkos::Experimental::OpenMPTarget> { + using type = Kokkos::Experimental::ScatterNonDuplicated; +}; +template <> +struct DefaultContribution<Kokkos::Experimental::OpenMPTarget, + Kokkos::Experimental::ScatterNonDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +template <> +struct DefaultContribution<Kokkos::Experimental::OpenMPTarget, + Kokkos::Experimental::ScatterDuplicated> { + using type = Kokkos::Experimental::ScatterNonAtomic; +}; +#endif 
+ +#ifdef KOKKOS_ENABLE_HPX +template <> +struct DefaultDuplication<Kokkos::Experimental::HPX> { + using type = Kokkos::Experimental::ScatterDuplicated; +}; +template <> +struct DefaultContribution<Kokkos::Experimental::HPX, + Kokkos::Experimental::ScatterNonDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +template <> +struct DefaultContribution<Kokkos::Experimental::HPX, + Kokkos::Experimental::ScatterDuplicated> { + using type = Kokkos::Experimental::ScatterNonAtomic; +}; +#endif + +#ifdef KOKKOS_ENABLE_THREADS +template <> +struct DefaultDuplication<Kokkos::Threads> { + using type = Kokkos::Experimental::ScatterDuplicated; +}; +template <> +struct DefaultContribution<Kokkos::Threads, + Kokkos::Experimental::ScatterNonDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +template <> +struct DefaultContribution<Kokkos::Threads, + Kokkos::Experimental::ScatterDuplicated> { + using type = Kokkos::Experimental::ScatterNonAtomic; +}; +#endif + +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct DefaultDuplication<Kokkos::Cuda> { + using type = Kokkos::Experimental::ScatterNonDuplicated; +}; +template <> +struct DefaultContribution<Kokkos::Cuda, + Kokkos::Experimental::ScatterNonDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +template <> +struct DefaultContribution<Kokkos::Cuda, + Kokkos::Experimental::ScatterDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +struct DefaultDuplication<Kokkos::Experimental::HIP> { + using type = Kokkos::Experimental::ScatterNonDuplicated; +}; +template <> +struct DefaultContribution<Kokkos::Experimental::HIP, + Kokkos::Experimental::ScatterNonDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +template <> +struct DefaultContribution<Kokkos::Experimental::HIP, + Kokkos::Experimental::ScatterDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +#endif + +#ifdef 
KOKKOS_ENABLE_SYCL +template <> +struct DefaultDuplication<Kokkos::Experimental::SYCL> { + using type = Kokkos::Experimental::ScatterNonDuplicated; +}; +template <> +struct DefaultContribution<Kokkos::Experimental::SYCL, + Kokkos::Experimental::ScatterNonDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +template <> +struct DefaultContribution<Kokkos::Experimental::SYCL, + Kokkos::Experimental::ScatterDuplicated> { + using type = Kokkos::Experimental::ScatterAtomic; +}; +#endif + +// FIXME All these scatter values need overhaul: +// - like should they be copyable at all? +// - what is the internal handle type +// - remove join +// - consistently use the update function in operators +template <typename ValueType, typename Op, typename DeviceType, + typename Contribution> +struct ScatterValue; + +/* ScatterValue <Op=ScatterSum, Contribution=ScatterNonAtomic> is + the object returned by the access operator() of ScatterAccess. This class + inherits from the Sum<> reducer and it wraps join(dest, src) with convenient + operator+=, etc. 
Note the addition of update(ValueType const& rhs) and + reset() so that all reducers can have common functions See ReduceDuplicates + and ResetDuplicates ) */ +template <typename ValueType, typename DeviceType> +struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, DeviceType, + Kokkos::Experimental::ScatterNonAtomic> { + ValueType& value; + + public: + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) + : value(value_in) {} + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) + : value(other.value) {} + KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) { + update(rhs); + } + KOKKOS_FORCEINLINE_FUNCTION void operator++() { update(1); } + KOKKOS_FORCEINLINE_FUNCTION void operator++(int) { update(1); } + KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) { + update(ValueType(-rhs)); + } + KOKKOS_FORCEINLINE_FUNCTION void operator--() { update(ValueType(-1)); } + KOKKOS_FORCEINLINE_FUNCTION void operator--(int) { update(ValueType(-1)); } + KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { + value += rhs; + } + KOKKOS_FORCEINLINE_FUNCTION void reset() { + value = reduction_identity<ValueType>::sum(); + } +}; + +/* ScatterValue <Op=ScatterSum, Contribution=ScatterAtomic> is the + object returned by the access operator() of ScatterAccess. This class inherits + from the Sum<> reducer, and similar to that returned by an Atomic View, it + wraps Kokkos::atomic_add with convenient operator+=, etc. This version also has + the update(rhs) and reset() functions. 
*/ +template <typename ValueType, typename DeviceType> +struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, DeviceType, + Kokkos::Experimental::ScatterAtomic> { + ValueType& value; + + public: + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) + : value(value_in) {} + + KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) { + this->join(value, rhs); + } + KOKKOS_FORCEINLINE_FUNCTION void operator++() { this->join(value, 1); } + KOKKOS_FORCEINLINE_FUNCTION void operator++(int) { this->join(value, 1); } + KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) { + this->join(value, ValueType(-rhs)); + } + KOKKOS_FORCEINLINE_FUNCTION void operator--() { + this->join(value, ValueType(-1)); + } + KOKKOS_FORCEINLINE_FUNCTION void operator--(int) { + this->join(value, ValueType(-1)); + } + + KOKKOS_INLINE_FUNCTION + void join(ValueType& dest, const ValueType& src) const { + Kokkos::atomic_add(&dest, src); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile ValueType& dest, const volatile ValueType& src) const { + Kokkos::atomic_add(&dest, src); + } + + KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { + this->join(value, rhs); + } + + KOKKOS_FORCEINLINE_FUNCTION void reset() { + value = reduction_identity<ValueType>::sum(); + } +}; + +/* ScatterValue <Op=ScatterProd, Contribution=ScatterNonAtomic> is + the object returned by the access operator() of ScatterAccess. This class + inherits from the Prod<> reducer, and it wraps join(dest, src) with + convenient operator*=, etc. 
Note the addition of update(ValueType const& rhs) + and reset() so that all reducers can have common functions See + ReduceDuplicates and ResetDuplicates ) */ +template <typename ValueType, typename DeviceType> +struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, DeviceType, + Kokkos::Experimental::ScatterNonAtomic> { + ValueType& value; + + public: + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) + : value(value_in) {} + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) + : value(other.value) {} + KOKKOS_FORCEINLINE_FUNCTION void operator*=(ValueType const& rhs) { + value *= rhs; + } + KOKKOS_FORCEINLINE_FUNCTION void operator/=(ValueType const& rhs) { + value /= rhs; + } + + KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { + value *= rhs; + } + KOKKOS_FORCEINLINE_FUNCTION void reset() { + value = reduction_identity<ValueType>::prod(); + } +}; + +/* ScatterValue <Op=ScatterProd, Contribution=ScatterAtomic> is the + object returned by the access operator() of ScatterAccess. This class + inherits from the Prod<> reducer, and similar to that returned by an Atomic + View, it wraps and atomic_prod with convenient operator*=, etc. atomic_prod + uses the atomic_compare_exchange. This version also has the update(rhs) + and reset() functions. 
*/ +template <typename ValueType, typename DeviceType> +struct ScatterValue<ValueType, Kokkos::Experimental::ScatterProd, DeviceType, + Kokkos::Experimental::ScatterAtomic> { + ValueType& value; + + public: + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) + : value(value_in) {} + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) + : value(other.value) {} + + KOKKOS_FORCEINLINE_FUNCTION void operator*=(ValueType const& rhs) { + Kokkos::atomic_mul(&value, rhs); + } + KOKKOS_FORCEINLINE_FUNCTION void operator/=(ValueType const& rhs) { + Kokkos::atomic_div(&value, rhs); + } + + KOKKOS_FORCEINLINE_FUNCTION + void atomic_prod(ValueType& dest, const ValueType& src) const { + bool success = false; + while (!success) { + ValueType dest_old = dest; + ValueType dest_new = dest_old * src; + dest_new = + Kokkos::atomic_compare_exchange<ValueType>(&dest, dest_old, dest_new); + success = ((dest_new - dest_old) / dest_old <= 1e-15); + } + } + + KOKKOS_INLINE_FUNCTION + void join(ValueType& dest, const ValueType& src) const { + atomic_prod(&dest, src); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile ValueType& dest, const volatile ValueType& src) const { + atomic_prod(&dest, src); + } + + KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { + atomic_prod(&value, rhs); + } + KOKKOS_FORCEINLINE_FUNCTION void reset() { + value = reduction_identity<ValueType>::prod(); + } +}; + +/* ScatterValue <Op=ScatterMin, Contribution=ScatterNonAtomic> is + the object returned by the access operator() of ScatterAccess. This class + inherits from the Min<> reducer and it wraps join(dest, src) with convenient + update(rhs). 
Note the addition of update(ValueType const& rhs) and reset() + are so that all reducers can have a common update function See + ReduceDuplicates and ResetDuplicates ) */ +template <typename ValueType, typename DeviceType> +struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, DeviceType, + Kokkos::Experimental::ScatterNonAtomic> { + ValueType& value; + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) + : value(value_in) {} + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) + : value(other.value) {} + + public: + KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { + value = rhs < value ? rhs : value; + } + KOKKOS_FORCEINLINE_FUNCTION void reset() { + value = reduction_identity<ValueType>::min(); + } +}; + +/* ScatterValue <Op=ScatterMin, Contribution=ScatterAtomic> is the + object returned by the access operator() of ScatterAccess. This class + inherits from the Min<> reducer, and similar to that returned by an Atomic + View, it wraps atomic_min with join(), etc. atomic_min uses the + atomic_compare_exchange. This version also has the update(rhs) and reset() + functions. */ +template <typename ValueType, typename DeviceType> +struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMin, DeviceType, + Kokkos::Experimental::ScatterAtomic> { + ValueType& value; + + public: + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) + : value(value_in) {} + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) + : value(other.value) {} + + KOKKOS_FORCEINLINE_FUNCTION + void atomic_min(ValueType& dest, const ValueType& src) const { + bool success = false; + while (!success) { + ValueType dest_old = dest; + ValueType dest_new = (dest_old > src) ? 
src : dest_old; + dest_new = + Kokkos::atomic_compare_exchange<ValueType>(&dest, dest_old, dest_new); + success = ((dest_new - dest_old) / dest_old <= 1e-15); + } + } + + KOKKOS_INLINE_FUNCTION + void join(ValueType& dest, const ValueType& src) const { + atomic_min(dest, src); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile ValueType& dest, const volatile ValueType& src) const { + atomic_min(dest, src); + } + + KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { + this->join(value, rhs); + } + KOKKOS_FORCEINLINE_FUNCTION void reset() { + value = reduction_identity<ValueType>::min(); + } +}; + +/* ScatterValue <Op=ScatterMax, Contribution=ScatterNonAtomic> is + the object returned by the access operator() of ScatterAccess. This class + inherits from the Max<> reducer and it wraps join(dest, src) with convenient + update(rhs). Note the addition of update(ValueType const& rhs) and reset() + are so that all reducers can have a common update function See + ReduceDuplicates and ResetDuplicates ) */ +template <typename ValueType, typename DeviceType> +struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, DeviceType, + Kokkos::Experimental::ScatterNonAtomic> { + ValueType& value; + + public: + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) + : value(value_in) {} + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) + : value(other.value) {} + KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { + value = rhs > value ? rhs : value; + } + KOKKOS_FORCEINLINE_FUNCTION void reset() { + value = reduction_identity<ValueType>::max(); + } +}; + +/* ScatterValue <Op=ScatterMax, Contribution=ScatterAtomic> is the + object returned by the access operator() of ScatterAccess. This class + inherits from the Max<> reducer, and similar to that returned by an Atomic + View, it wraps atomic_max with join(), etc. atomic_max uses the + atomic_compare_exchange. This version also has the update(rhs) and reset() + functions. 
*/ +template <typename ValueType, typename DeviceType> +struct ScatterValue<ValueType, Kokkos::Experimental::ScatterMax, DeviceType, + Kokkos::Experimental::ScatterAtomic> { + ValueType& value; + + public: + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) + : value(value_in) {} + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) + : value(other.value) {} + + KOKKOS_FORCEINLINE_FUNCTION + void atomic_max(ValueType& dest, const ValueType& src) const { + bool success = false; + while (!success) { + ValueType dest_old = dest; + ValueType dest_new = (dest_old < src) ? src : dest_old; + dest_new = + Kokkos::atomic_compare_exchange<ValueType>(&dest, dest_old, dest_new); + success = ((dest_new - dest_old) / dest_old <= 1e-15); + } + } + + KOKKOS_INLINE_FUNCTION + void join(ValueType& dest, const ValueType& src) const { + atomic_max(dest, src); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile ValueType& dest, const volatile ValueType& src) const { + atomic_max(dest, src); + } + + KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { + this->join(value, rhs); + } + KOKKOS_FORCEINLINE_FUNCTION void reset() { + value = reduction_identity<ValueType>::max(); + } +}; + +/* DuplicatedDataType, given a View DataType, will create a new DataType + that has a new runtime dimension which becomes the largest-stride dimension. + In the case of LayoutLeft, due to the limitation induced by the design of + DataType itself, it must convert any existing compile-time dimensions into + runtime dimensions. 
*/ +template <typename T, typename Layout> +struct DuplicatedDataType; + +template <typename T> +struct DuplicatedDataType<T, Kokkos::LayoutRight> { + using value_type = T*; // For LayoutRight, add a star all the way on the left +}; + +template <typename T, size_t N> +struct DuplicatedDataType<T[N], Kokkos::LayoutRight> { + using value_type = + typename DuplicatedDataType<T, Kokkos::LayoutRight>::value_type[N]; +}; + +template <typename T> +struct DuplicatedDataType<T[], Kokkos::LayoutRight> { + using value_type = + typename DuplicatedDataType<T, Kokkos::LayoutRight>::value_type[]; +}; + +template <typename T> +struct DuplicatedDataType<T*, Kokkos::LayoutRight> { + using value_type = + typename DuplicatedDataType<T, Kokkos::LayoutRight>::value_type*; +}; + +template <typename T> +struct DuplicatedDataType<T, Kokkos::LayoutLeft> { + using value_type = T*; +}; + +template <typename T, size_t N> +struct DuplicatedDataType<T[N], Kokkos::LayoutLeft> { + using value_type = + typename DuplicatedDataType<T, Kokkos::LayoutLeft>::value_type*; +}; + +template <typename T> +struct DuplicatedDataType<T[], Kokkos::LayoutLeft> { + using value_type = + typename DuplicatedDataType<T, Kokkos::LayoutLeft>::value_type*; +}; + +template <typename T> +struct DuplicatedDataType<T*, Kokkos::LayoutLeft> { + using value_type = + typename DuplicatedDataType<T, Kokkos::LayoutLeft>::value_type*; +}; + +/* Insert integer argument pack into array */ + +template <class T> +void args_to_array(size_t* array, int pos, T dim0) { + array[pos] = dim0; +} +template <class T, class... Dims> +void args_to_array(size_t* array, int pos, T dim0, Dims... dims) { + array[pos] = dim0; + args_to_array(array, pos + 1, dims...); +} + +/* Slice is just responsible for stuffing the correct number of Kokkos::ALL + arguments on the correct side of the index in a call to subview() to get a + subview where the index specified is the largest-stride one. */ +template <typename Layout, int rank, typename V, typename... 
Args> +struct Slice { + using next = Slice<Layout, rank - 1, V, Kokkos::Impl::ALL_t, Args...>; + using value_type = typename next::value_type; + + static value_type get(V const& src, const size_t i, Args... args) { + return next::get(src, i, Kokkos::ALL, args...); + } +}; + +template <typename V, typename... Args> +struct Slice<Kokkos::LayoutRight, 1, V, Args...> { + using value_type = + typename Kokkos::Impl::ViewMapping<void, V, const size_t, Args...>::type; + static value_type get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template <typename V, typename... Args> +struct Slice<Kokkos::LayoutLeft, 1, V, Args...> { + using value_type = + typename Kokkos::Impl::ViewMapping<void, V, Args..., const size_t>::type; + static value_type get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; + +template <typename ExecSpace, typename ValueType, typename Op> +struct ReduceDuplicates; + +template <typename ExecSpace, typename ValueType, typename Op> +struct ReduceDuplicatesBase { + using Derived = ReduceDuplicates<ExecSpace, ValueType, Op>; + ValueType const* src; + ValueType* dst; + size_t stride; + size_t start; + size_t n; + ReduceDuplicatesBase(ExecSpace const& exec_space, ValueType const* src_in, + ValueType* dest_in, size_t stride_in, size_t start_in, + size_t n_in, std::string const& name) + : src(src_in), dst(dest_in), stride(stride_in), start(start_in), n(n_in) { + parallel_for( + std::string("Kokkos::ScatterView::ReduceDuplicates [") + name + "]", + RangePolicy<ExecSpace, size_t>(exec_space, 0, stride), + static_cast<Derived const&>(*this)); + } +}; + +/* ReduceDuplicates -- Perform reduction on destination array using strided + * source Use ScatterValue<> specific to operation to wrap destination array so + * that the reduction operation can be accessed via the update(rhs) function */ +template <typename ExecSpace, typename ValueType, typename Op> +struct 
ReduceDuplicates + : public ReduceDuplicatesBase<ExecSpace, ValueType, Op> { + using Base = ReduceDuplicatesBase<ExecSpace, ValueType, Op>; + ReduceDuplicates(ExecSpace const& exec_space, ValueType const* src_in, + ValueType* dst_in, size_t stride_in, size_t start_in, + size_t n_in, std::string const& name) + : Base(exec_space, src_in, dst_in, stride_in, start_in, n_in, name) {} + KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const { + for (size_t j = Base::start; j < Base::n; ++j) { + ScatterValue<ValueType, Op, ExecSpace, + Kokkos::Experimental::ScatterNonAtomic> + sv(Base::dst[i]); + sv.update(Base::src[i + Base::stride * j]); + } + } +}; + +template <typename ExecSpace, typename ValueType, typename Op> +struct ResetDuplicates; + +template <typename ExecSpace, typename ValueType, typename Op> +struct ResetDuplicatesBase { + using Derived = ResetDuplicates<ExecSpace, ValueType, Op>; + ValueType* data; + ResetDuplicatesBase(ExecSpace const& exec_space, ValueType* data_in, + size_t size_in, std::string const& name) + : data(data_in) { + parallel_for( + std::string("Kokkos::ScatterView::ResetDuplicates [") + name + "]", + RangePolicy<ExecSpace, size_t>(exec_space, 0, size_in), + static_cast<Derived const&>(*this)); + } +}; + +/* ResetDuplicates -- Perform reset on destination array + * Use ScatterValue<> specific to operation to wrap destination array so that + * the reset operation can be accessed via the reset() function */ +template <typename ExecSpace, typename ValueType, typename Op> +struct ResetDuplicates : public ResetDuplicatesBase<ExecSpace, ValueType, Op> { + using Base = ResetDuplicatesBase<ExecSpace, ValueType, Op>; + ResetDuplicates(ExecSpace const& exec_space, ValueType* data_in, + size_t size_in, std::string const& name) + : Base(exec_space, data_in, size_in, name) {} + KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const { + ScatterValue<ValueType, Op, ExecSpace, + Kokkos::Experimental::ScatterNonAtomic> + sv(Base::data[i]); + 
sv.reset(); + } +}; + +template <typename... P> +void check_scatter_view_allocation_properties_argument( + ViewCtorProp<P...> const&) { + static_assert(ViewCtorProp<P...>::has_execution_space && + ViewCtorProp<P...>::has_label && + ViewCtorProp<P...>::initialize, + "Allocation property must have an execution name as well as a " + "label, and must perform the view initialization"); +} + +} // namespace Experimental +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Experimental { + +template <typename DataType, + typename Layout = Kokkos::DefaultExecutionSpace::array_layout, + typename DeviceType = Kokkos::DefaultExecutionSpace, + typename Op = Kokkos::Experimental::ScatterSum, + typename Duplication = typename Kokkos::Impl::Experimental:: + DefaultDuplication<typename DeviceType::execution_space>::type, + typename Contribution = + typename Kokkos::Impl::Experimental::DefaultContribution< + typename DeviceType::execution_space, Duplication>::type> +class ScatterView; + +template <typename DataType, typename Op, typename DeviceType, typename Layout, + typename Duplication, typename Contribution, + typename OverrideContribution> +class ScatterAccess; + +// non-duplicated implementation +template <typename DataType, typename Op, typename DeviceType, typename Layout, + typename Contribution> +class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated, + Contribution> { + public: + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using device_type = Kokkos::Device<execution_space, memory_space>; + using original_view_type = Kokkos::View<DataType, Layout, device_type>; + using original_value_type = typename original_view_type::value_type; + using original_reference_type = typename original_view_type::reference_type; + friend class ScatterAccess<DataType, Op, DeviceType, Layout, + ScatterNonDuplicated, Contribution, + ScatterNonAtomic>; + friend class 
ScatterAccess<DataType, Op, DeviceType, Layout, + ScatterNonDuplicated, Contribution, ScatterAtomic>; + template <class, class, class, class, class, class> + friend class ScatterView; + + ScatterView() = default; + + template <typename RT, typename... RP> + ScatterView(View<RT, RP...> const& original_view) + : internal_view(original_view) {} + + template <typename RT, typename... P, typename... RP> + ScatterView(execution_space const& /* exec_space */, + View<RT, RP...> const& original_view) + : internal_view(original_view) {} + + template <typename... Dims> + ScatterView(std::string const& name, Dims... dims) + : internal_view(name, dims...) {} + + // This overload allows specifying an execution space instance to be + // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as + // first argument. + template <typename... P, typename... Dims> + ScatterView(::Kokkos::Impl::ViewCtorProp<P...> const& arg_prop, Dims... dims) + : internal_view(arg_prop, dims...) { + using ::Kokkos::Impl::Experimental:: + check_scatter_view_allocation_properties_argument; + check_scatter_view_allocation_properties_argument(arg_prop); + } + + template <typename OtherDataType, typename OtherDeviceType> + KOKKOS_FUNCTION ScatterView( + const ScatterView<OtherDataType, Layout, OtherDeviceType, Op, + ScatterNonDuplicated, Contribution>& other_view) + : internal_view(other_view.internal_view) {} + + template <typename OtherDataType, typename OtherDeviceType> + KOKKOS_FUNCTION void operator=( + const ScatterView<OtherDataType, Layout, OtherDeviceType, Op, + ScatterNonDuplicated, Contribution>& other_view) { + internal_view = other_view.internal_view; + } + + template <typename OverrideContribution = Contribution> + KOKKOS_FORCEINLINE_FUNCTION + ScatterAccess<DataType, Op, DeviceType, Layout, ScatterNonDuplicated, + Contribution, OverrideContribution> + access() const { + return ScatterAccess<DataType, Op, DeviceType, Layout, ScatterNonDuplicated, + Contribution, 
OverrideContribution>(*this); + } + + original_view_type subview() const { return internal_view; } + + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return internal_view.is_allocated(); + } + + template <typename DT, typename... RP> + void contribute_into(View<DT, RP...> const& dest) const { + contribute_into(execution_space(), dest); + } + + template <typename DT, typename... RP> + void contribute_into(execution_space const& exec_space, + View<DT, RP...> const& dest) const { + using dest_type = View<DT, RP...>; + static_assert(std::is_same<typename dest_type::array_layout, Layout>::value, + "ScatterView contribute destination has different layout"); + static_assert( + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, + "ScatterView contribute destination memory space not accessible"); + if (dest.data() == internal_view.data()) return; + Kokkos::Impl::Experimental::ReduceDuplicates<execution_space, + original_value_type, Op>( + exec_space, internal_view.data(), dest.data(), 0, 0, 1, + internal_view.label()); + } + + void reset(execution_space const& exec_space = execution_space()) { + Kokkos::Impl::Experimental::ResetDuplicates<execution_space, + original_value_type, Op>( + exec_space, internal_view.data(), internal_view.size(), + internal_view.label()); + } + template <typename DT, typename... RP> + void reset_except(View<DT, RP...> const& view) { + reset_except(execution_space(), view); + } + + template <typename DT, typename... 
RP> + void reset_except(const execution_space& exec_space, + View<DT, RP...> const& view) { + if (view.data() != internal_view.data()) reset(exec_space); + } + + void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, + const size_t n3 = 0, const size_t n4 = 0, const size_t n5 = 0, + const size_t n6 = 0, const size_t n7 = 0) { + ::Kokkos::resize(internal_view, n0, n1, n2, n3, n4, n5, n6, n7); + } + + void realloc(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, + const size_t n3 = 0, const size_t n4 = 0, const size_t n5 = 0, + const size_t n6 = 0, const size_t n7 = 0) { + ::Kokkos::realloc(internal_view, n0, n1, n2, n3, n4, n5, n6, n7); + } + + protected: + template <typename... Args> + KOKKOS_FORCEINLINE_FUNCTION original_reference_type at(Args... args) const { + return internal_view(args...); + } + + private: + using internal_view_type = original_view_type; + internal_view_type internal_view; +}; + +template <typename DataType, typename Op, typename DeviceType, typename Layout, + typename Contribution, typename OverrideContribution> +class ScatterAccess<DataType, Op, DeviceType, Layout, ScatterNonDuplicated, + Contribution, OverrideContribution> { + public: + using view_type = ScatterView<DataType, Layout, DeviceType, Op, + ScatterNonDuplicated, Contribution>; + using original_value_type = typename view_type::original_value_type; + using value_type = Kokkos::Impl::Experimental::ScatterValue< + original_value_type, Op, DeviceType, OverrideContribution>; + + KOKKOS_INLINE_FUNCTION + ScatterAccess() : view(view_type()) {} + + KOKKOS_INLINE_FUNCTION + ScatterAccess(view_type const& view_in) : view(view_in) {} + KOKKOS_DEFAULTED_FUNCTION + ~ScatterAccess() = default; + + template <typename... Args> + KOKKOS_FORCEINLINE_FUNCTION value_type operator()(Args... 
args) const { + return view.at(args...); + } + + template <typename Arg> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<view_type::original_view_type::rank == 1 && + std::is_integral<Arg>::value, + value_type>::type + operator[](Arg arg) const { + return view.at(arg); + } + + private: + view_type const& view; +}; + +// duplicated implementation +// LayoutLeft and LayoutRight are different enough that we'll just specialize +// each + +template <typename DataType, typename Op, typename DeviceType, + typename Contribution> +class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op, + ScatterDuplicated, Contribution> { + public: + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using device_type = Kokkos::Device<execution_space, memory_space>; + using original_view_type = + Kokkos::View<DataType, Kokkos::LayoutRight, device_type>; + using original_value_type = typename original_view_type::value_type; + using original_reference_type = typename original_view_type::reference_type; + friend class ScatterAccess<DataType, Op, DeviceType, Kokkos::LayoutRight, + ScatterDuplicated, Contribution, ScatterNonAtomic>; + friend class ScatterAccess<DataType, Op, DeviceType, Kokkos::LayoutRight, + ScatterDuplicated, Contribution, ScatterAtomic>; + template <class, class, class, class, class, class> + friend class ScatterView; + + using data_type_info = + typename Kokkos::Impl::Experimental::DuplicatedDataType< + DataType, Kokkos::LayoutRight>; + using internal_data_type = typename data_type_info::value_type; + using internal_view_type = + Kokkos::View<internal_data_type, Kokkos::LayoutRight, device_type>; + + ScatterView() = default; + + template <typename OtherDataType, typename OtherDeviceType> + KOKKOS_FUNCTION ScatterView( + const ScatterView<OtherDataType, Kokkos::LayoutRight, OtherDeviceType, Op, + ScatterDuplicated, Contribution>& other_view) + : unique_token(other_view.unique_token), + 
internal_view(other_view.internal_view) {} + + template <typename OtherDataType, typename OtherDeviceType> + KOKKOS_FUNCTION void operator=( + const ScatterView<OtherDataType, Kokkos::LayoutRight, OtherDeviceType, Op, + ScatterDuplicated, Contribution>& other_view) { + unique_token = other_view.unique_token; + internal_view = other_view.internal_view; + } + + template <typename RT, typename... RP> + ScatterView(View<RT, RP...> const& original_view) + : ScatterView(execution_space(), original_view) {} + + template <typename RT, typename... P, typename... RP> + ScatterView(execution_space const& exec_space, + View<RT, RP...> const& original_view) + : unique_token(), + internal_view( + view_alloc(WithoutInitializing, + std::string("duplicated_") + original_view.label(), + exec_space), + unique_token.size(), + original_view.rank_dynamic > 0 ? original_view.extent(0) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank_dynamic > 1 ? original_view.extent(1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank_dynamic > 2 ? original_view.extent(2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank_dynamic > 3 ? original_view.extent(3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank_dynamic > 4 ? original_view.extent(4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank_dynamic > 5 ? original_view.extent(5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank_dynamic > 6 ? original_view.extent(6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG) + + { + reset(exec_space); + } + + template <typename... Dims> + ScatterView(std::string const& name, Dims... dims) + : ScatterView(view_alloc(execution_space(), name), dims...) {} + + // This overload allows specifying an execution space instance to be + // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as + // first argument. + template <typename... P, typename... Dims> + ScatterView(::Kokkos::Impl::ViewCtorProp<P...> const& arg_prop, Dims... 
dims) + : internal_view(view_alloc(WithoutInitializing, + static_cast<::Kokkos::Impl::ViewCtorProp< + void, std::string> const&>(arg_prop) + .value), + unique_token.size(), dims...) { + using ::Kokkos::Impl::Experimental:: + check_scatter_view_allocation_properties_argument; + check_scatter_view_allocation_properties_argument(arg_prop); + + auto const exec_space = + static_cast<::Kokkos::Impl::ViewCtorProp<void, execution_space> const&>( + arg_prop) + .value; + reset(exec_space); + } + + template <typename OverrideContribution = Contribution> + KOKKOS_FORCEINLINE_FUNCTION + ScatterAccess<DataType, Op, DeviceType, Kokkos::LayoutRight, + ScatterDuplicated, Contribution, OverrideContribution> + access() const { + return ScatterAccess<DataType, Op, DeviceType, Kokkos::LayoutRight, + ScatterDuplicated, Contribution, OverrideContribution>( + *this); + } + + typename Kokkos::Impl::Experimental::Slice<Kokkos::LayoutRight, + internal_view_type::rank, + internal_view_type>::value_type + subview() const { + return Kokkos::Impl::Experimental::Slice< + Kokkos::LayoutRight, internal_view_type::Rank, + internal_view_type>::get(internal_view, 0); + } + + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return internal_view.is_allocated(); + } + + template <typename DT, typename... RP> + void contribute_into(View<DT, RP...> const& dest) const { + contribute_into(execution_space(), dest); + } + + template <typename DT, typename... 
RP> + void contribute_into(execution_space const& exec_space, + View<DT, RP...> const& dest) const { + using dest_type = View<DT, RP...>; + static_assert(std::is_same<typename dest_type::array_layout, + Kokkos::LayoutRight>::value, + "ScatterView deep_copy destination has different layout"); + static_assert( + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, + "ScatterView deep_copy destination memory space not accessible"); + bool is_equal = (dest.data() == internal_view.data()); + size_t start = is_equal ? 1 : 0; + Kokkos::Impl::Experimental::ReduceDuplicates<execution_space, + original_value_type, Op>( + exec_space, internal_view.data(), dest.data(), internal_view.stride(0), + start, internal_view.extent(0), internal_view.label()); + } + + void reset(execution_space const& exec_space = execution_space()) { + Kokkos::Impl::Experimental::ResetDuplicates<execution_space, + original_value_type, Op>( + exec_space, internal_view.data(), internal_view.size(), + internal_view.label()); + } + + template <typename DT, typename... RP> + void reset_except(View<DT, RP...> const& view) { + reset_except(execution_space(), view); + } + + template <typename DT, typename... 
RP> + void reset_except(execution_space const& exec_space, + View<DT, RP...> const& view) { + if (view.data() != internal_view.data()) { + reset(exec_space); + return; + } + Kokkos::Impl::Experimental::ResetDuplicates<execution_space, + original_value_type, Op>( + exec_space, internal_view.data() + view.size(), + internal_view.size() - view.size(), internal_view.label()); + } + + void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, + const size_t n3 = 0, const size_t n4 = 0, const size_t n5 = 0, + const size_t n6 = 0) { + ::Kokkos::resize(internal_view, unique_token.size(), n0, n1, n2, n3, n4, n5, + n6); + } + + void realloc(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, + const size_t n3 = 0, const size_t n4 = 0, const size_t n5 = 0, + const size_t n6 = 0) { + ::Kokkos::realloc(internal_view, unique_token.size(), n0, n1, n2, n3, n4, + n5, n6); + } + + protected: + template <typename... Args> + KOKKOS_FORCEINLINE_FUNCTION original_reference_type at(int rank, + Args... 
args) const { + return internal_view(rank, args...); + } + + protected: + using unique_token_type = Kokkos::Experimental::UniqueToken< + execution_space, Kokkos::Experimental::UniqueTokenScope::Global>; + + unique_token_type unique_token; + internal_view_type internal_view; +}; + +template <typename DataType, typename Op, typename DeviceType, + typename Contribution> +class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op, + ScatterDuplicated, Contribution> { + public: + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using device_type = Kokkos::Device<execution_space, memory_space>; + using original_view_type = + Kokkos::View<DataType, Kokkos::LayoutLeft, device_type>; + using original_value_type = typename original_view_type::value_type; + using original_reference_type = typename original_view_type::reference_type; + friend class ScatterAccess<DataType, Op, DeviceType, Kokkos::LayoutLeft, + ScatterDuplicated, Contribution, ScatterNonAtomic>; + friend class ScatterAccess<DataType, Op, DeviceType, Kokkos::LayoutLeft, + ScatterDuplicated, Contribution, ScatterAtomic>; + template <class, class, class, class, class, class> + friend class ScatterView; + + using data_type_info = + typename Kokkos::Impl::Experimental::DuplicatedDataType< + DataType, Kokkos::LayoutLeft>; + using internal_data_type = typename data_type_info::value_type; + using internal_view_type = + Kokkos::View<internal_data_type, Kokkos::LayoutLeft, device_type>; + + ScatterView() = default; + + template <typename RT, typename... RP> + ScatterView(View<RT, RP...> const& original_view) + : ScatterView(execution_space(), original_view) {} + + template <typename RT, typename... P, typename... RP> + ScatterView(execution_space const& exec_space, + View<RT, RP...> const& original_view) + : unique_token() { + size_t arg_N[8] = {original_view.rank > 0 ? 
original_view.extent(0) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 1 ? original_view.extent(1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 2 ? original_view.extent(2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 3 ? original_view.extent(3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 4 ? original_view.extent(4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 5 ? original_view.extent(5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 6 ? original_view.extent(6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + arg_N[internal_view_type::rank - 1] = unique_token.size(); + internal_view = internal_view_type( + view_alloc(WithoutInitializing, + std::string("duplicated_") + original_view.label(), + exec_space), + arg_N[0], arg_N[1], arg_N[2], arg_N[3], arg_N[4], arg_N[5], arg_N[6], + arg_N[7]); + reset(exec_space); + } + + template <typename... Dims> + ScatterView(std::string const& name, Dims... dims) + : ScatterView(view_alloc(execution_space(), name), dims...) {} + + // This overload allows specifying an execution space instance to be + // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as + // first argument. + template <typename... P, typename... Dims> + ScatterView(::Kokkos::Impl::ViewCtorProp<P...> const& arg_prop, + Dims... dims) { + using ::Kokkos::Impl::Experimental:: + check_scatter_view_allocation_properties_argument; + check_scatter_view_allocation_properties_argument(arg_prop); + + original_view_type original_view; + size_t arg_N[8] = {original_view.rank > 0 ? original_view.static_extent(0) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 1 ? original_view.static_extent(1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 2 ? original_view.static_extent(2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 3 ? original_view.static_extent(3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 4 ? 
original_view.static_extent(4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 5 ? original_view.static_extent(5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + original_view.rank > 6 ? original_view.static_extent(6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + Kokkos::Impl::Experimental::args_to_array(arg_N, 0, dims...); + arg_N[internal_view_type::rank - 1] = unique_token.size(); + + auto const name = + static_cast<::Kokkos::Impl::ViewCtorProp<void, std::string> const&>( + arg_prop) + .value; + internal_view = internal_view_type(view_alloc(WithoutInitializing, name), + arg_N[0], arg_N[1], arg_N[2], arg_N[3], + arg_N[4], arg_N[5], arg_N[6], arg_N[7]); + + auto const exec_space = + static_cast<::Kokkos::Impl::ViewCtorProp<void, execution_space> const&>( + arg_prop) + .value; + reset(exec_space); + } + + template <typename OtherDataType, typename OtherDeviceType> + KOKKOS_FUNCTION ScatterView( + const ScatterView<OtherDataType, Kokkos::LayoutLeft, OtherDeviceType, Op, + ScatterDuplicated, Contribution>& other_view) + : unique_token(other_view.unique_token), + internal_view(other_view.internal_view) {} + + template <typename OtherDataType, typename OtherDeviceType> + KOKKOS_FUNCTION void operator=( + const ScatterView<OtherDataType, Kokkos::LayoutLeft, OtherDeviceType, Op, + ScatterDuplicated, Contribution>& other_view) { + unique_token = other_view.unique_token; + internal_view = other_view.internal_view; + } + + template <typename OverrideContribution = Contribution> + KOKKOS_FORCEINLINE_FUNCTION + ScatterAccess<DataType, Op, DeviceType, Kokkos::LayoutLeft, + ScatterDuplicated, Contribution, OverrideContribution> + access() const { + return ScatterAccess<DataType, Op, DeviceType, Kokkos::LayoutLeft, + ScatterDuplicated, Contribution, OverrideContribution>( + *this); + } + + typename Kokkos::Impl::Experimental::Slice<Kokkos::LayoutLeft, + internal_view_type::rank, + internal_view_type>::value_type + subview() const { + return 
Kokkos::Impl::Experimental::Slice< + Kokkos::LayoutLeft, internal_view_type::rank, + internal_view_type>::get(internal_view, 0); + } + + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return internal_view.is_allocated(); + } + + template <typename... RP> + void contribute_into(View<RP...> const& dest) const { + contribute_into(execution_space(), dest); + } + + template <typename... RP> + void contribute_into(execution_space const& exec_space, + View<RP...> const& dest) const { + using dest_type = View<RP...>; + static_assert( + std::is_same<typename dest_type::value_type, + typename original_view_type::non_const_value_type>::value, + "ScatterView deep_copy destination has wrong value_type"); + static_assert(std::is_same<typename dest_type::array_layout, + Kokkos::LayoutLeft>::value, + "ScatterView deep_copy destination has different layout"); + static_assert( + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, + "ScatterView deep_copy destination memory space not accessible"); + auto extent = internal_view.extent(internal_view_type::rank - 1); + bool is_equal = (dest.data() == internal_view.data()); + size_t start = is_equal ? 1 : 0; + Kokkos::Impl::Experimental::ReduceDuplicates<execution_space, + original_value_type, Op>( + exec_space, internal_view.data(), dest.data(), + internal_view.stride(internal_view_type::rank - 1), start, extent, + internal_view.label()); + } + + void reset(execution_space const& exec_space = execution_space()) { + Kokkos::Impl::Experimental::ResetDuplicates<execution_space, + original_value_type, Op>( + exec_space, internal_view.data(), internal_view.size(), + internal_view.label()); + } + + template <typename DT, typename... RP> + void reset_except(View<DT, RP...> const& view) { + reset_except(execution_space(), view); + } + + template <typename DT, typename... 
RP> + void reset_except(execution_space const& exec_space, + View<DT, RP...> const& view) { + if (view.data() != internal_view.data()) { + reset(exec_space); + return; + } + Kokkos::Impl::Experimental::ResetDuplicates<execution_space, + original_value_type, Op>( + exec_space, internal_view.data() + view.size(), + internal_view.size() - view.size(), internal_view.label()); + } + + void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, + const size_t n3 = 0, const size_t n4 = 0, const size_t n5 = 0, + const size_t n6 = 0) { + size_t arg_N[8] = {n0, n1, n2, n3, n4, n5, n6, 0}; + const int i = internal_view.rank - 1; + arg_N[i] = unique_token.size(); + + ::Kokkos::resize(internal_view, arg_N[0], arg_N[1], arg_N[2], arg_N[3], + arg_N[4], arg_N[5], arg_N[6], arg_N[7]); + } + + void realloc(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, + const size_t n3 = 0, const size_t n4 = 0, const size_t n5 = 0, + const size_t n6 = 0) { + size_t arg_N[8] = {n0, n1, n2, n3, n4, n5, n6, 0}; + const int i = internal_view.rank - 1; + arg_N[i] = unique_token.size(); + + ::Kokkos::realloc(internal_view, arg_N[0], arg_N[1], arg_N[2], arg_N[3], + arg_N[4], arg_N[5], arg_N[6], arg_N[7]); + } + + protected: + template <typename... Args> + KOKKOS_FORCEINLINE_FUNCTION original_reference_type at(int thread_id, + Args... args) const { + return internal_view(args..., thread_id); + } + + protected: + using unique_token_type = Kokkos::Experimental::UniqueToken< + execution_space, Kokkos::Experimental::UniqueTokenScope::Global>; + + unique_token_type unique_token; + internal_view_type internal_view; +}; + +/* This object has to be separate in order to store the thread ID, which cannot + be obtained until one is inside a parallel construct, and may be relatively + expensive to obtain at every contribution + (calls a non-inlined function, looks up a thread-local variable). 
+ Due to the expense, it is sensible to query it at most once per parallel + iterate (ideally once per thread, but parallel_for doesn't expose that) and + then store it in a stack variable. + ScatterAccess serves as a non-const object on the stack which can store the + thread ID */ + +template <typename DataType, typename Op, typename DeviceType, typename Layout, + typename Contribution, typename OverrideContribution> +class ScatterAccess<DataType, Op, DeviceType, Layout, ScatterDuplicated, + Contribution, OverrideContribution> { + public: + using view_type = ScatterView<DataType, Layout, DeviceType, Op, + ScatterDuplicated, Contribution>; + using original_value_type = typename view_type::original_value_type; + using value_type = Kokkos::Impl::Experimental::ScatterValue< + original_value_type, Op, DeviceType, OverrideContribution>; + + KOKKOS_FORCEINLINE_FUNCTION + ScatterAccess(view_type const& view_in) + : view(view_in), thread_id(view_in.unique_token.acquire()) {} + + KOKKOS_FORCEINLINE_FUNCTION + ~ScatterAccess() { + if (thread_id != ~thread_id_type(0)) view.unique_token.release(thread_id); + } + + template <typename... Args> + KOKKOS_FORCEINLINE_FUNCTION value_type operator()(Args... 
args) const { + return view.at(thread_id, args...); + } + + template <typename Arg> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<view_type::original_view_type::rank == 1 && + std::is_integral<Arg>::value, + value_type>::type + operator[](Arg arg) const { + return view.at(thread_id, arg); + } + + private: + view_type const& view; + + // simplify RAII by disallowing copies + ScatterAccess(ScatterAccess const& other) = delete; + ScatterAccess& operator=(ScatterAccess const& other) = delete; + ScatterAccess& operator=(ScatterAccess&& other) = delete; + + public: + // do need to allow moves though, for the common + // auto b = a.access(); + // that assignments turns into a move constructor call + KOKKOS_FORCEINLINE_FUNCTION + ScatterAccess(ScatterAccess&& other) + : view(other.view), thread_id(other.thread_id) { + other.thread_id = ~thread_id_type(0); + } + + private: + using unique_token_type = typename view_type::unique_token_type; + using thread_id_type = typename unique_token_type::size_type; + thread_id_type thread_id; +}; + +template <typename Op = Kokkos::Experimental::ScatterSum, + typename Duplication = void, typename Contribution = void, + typename RT, typename... 
RP>
// Create a ScatterView wrapping original_view.  When Duplication /
// Contribution are left as void, defaults are chosen per execution space
// via DefaultDuplication / DefaultContribution (the std::conditional_t
// chains below).
ScatterView<
    RT, typename ViewTraits<RT, RP...>::array_layout,
    typename ViewTraits<RT, RP...>::device_type, Op,
    std::conditional_t<
        std::is_same<Duplication, void>::value,
        typename Kokkos::Impl::Experimental::DefaultDuplication<
            typename ViewTraits<RT, RP...>::execution_space>::type,
        Duplication>,
    std::conditional_t<
        std::is_same<Contribution, void>::value,
        typename Kokkos::Impl::Experimental::DefaultContribution<
            typename ViewTraits<RT, RP...>::execution_space,
            typename std::conditional_t<
                std::is_same<Duplication, void>::value,
                typename Kokkos::Impl::Experimental::DefaultDuplication<
                    typename ViewTraits<RT, RP...>::execution_space>::type,
                Duplication>>::type,
        Contribution>>
create_scatter_view(View<RT, RP...> const& original_view) {
  return original_view;  // implicit ScatterView constructor call
}

// Overload taking the operation by value (deduced); duplication and
// contribution strategies default per execution space.
template <typename Op, typename RT, typename... RP>
ScatterView<
    RT, typename ViewTraits<RT, RP...>::array_layout,
    typename ViewTraits<RT, RP...>::device_type, Op,
    typename Kokkos::Impl::Experimental::DefaultDuplication<
        typename ViewTraits<RT, RP...>::execution_space>::type,
    typename Kokkos::Impl::Experimental::DefaultContribution<
        typename ViewTraits<RT, RP...>::execution_space,
        typename Kokkos::Impl::Experimental::DefaultDuplication<
            typename ViewTraits<RT, RP...>::execution_space>::type>::type>
create_scatter_view(Op, View<RT, RP...> const& original_view) {
  return original_view;  // implicit ScatterView constructor call
}

// Fully explicit overload: op, duplication and contribution all supplied
// as (tag) arguments.
template <typename Op, typename Duplication, typename Contribution, typename RT,
          typename... RP>
ScatterView<RT, typename ViewTraits<RT, RP...>::array_layout,
            typename ViewTraits<RT, RP...>::device_type, Op, Duplication,
            Contribution>
create_scatter_view(Op, Duplication, Contribution,
                    View<RT, RP...> const& original_view) {
  return original_view;  // implicit ScatterView constructor call
}

}  // namespace Experimental
}  // namespace Kokkos

namespace Kokkos {
namespace Experimental {

// Collapse src's contributions into dest on the given execution space
// (delegates to ScatterView::contribute_into).
template <typename DT1, typename DT2, typename LY, typename ES, typename OP,
          typename CT, typename DP, typename... VP>
void contribute(
    typename ES::execution_space const& exec_space, View<DT1, VP...>& dest,
    Kokkos::Experimental::ScatterView<DT2, LY, ES, OP, CT, DP> const& src) {
  src.contribute_into(exec_space, dest);
}

// Convenience overload: default-constructs the execution space instance.
template <typename DT1, typename DT2, typename LY, typename ES, typename OP,
          typename CT, typename DP, typename... VP>
void contribute(
    View<DT1, VP...>& dest,
    Kokkos::Experimental::ScatterView<DT2, LY, ES, OP, CT, DP> const& src) {
  using execution_space = typename ES::execution_space;
  contribute(execution_space{}, dest, src);
}

}  // namespace Experimental
}  // namespace Kokkos

namespace Kokkos {

// Forward to ScatterView::realloc with the new extents.
template <typename DT, typename LY, typename ES, typename OP, typename CT,
          typename DP, typename... IS>
void realloc(
    Kokkos::Experimental::ScatterView<DT, LY, ES, OP, CT, DP>& scatter_view,
    IS... is) {
  scatter_view.realloc(is...);
}

// Forward to ScatterView::resize with the new extents.
template <typename DT, typename LY, typename ES, typename OP, typename CT,
          typename DP, typename... IS>
void resize(
    Kokkos::Experimental::ScatterView<DT, LY, ES, OP, CT, DP>& scatter_view,
    IS...
is) { + scatter_view.resize(is...); +} + +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp new file mode 100644 index 0000000000000000000000000000000000000000..81be3ee2d3e836436a23f8808a07f9386bc3ac05 --- /dev/null +++ b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -0,0 +1,499 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_STATICCRSGRAPH_HPP
#define KOKKOS_STATICCRSGRAPH_HPP

#include <string>
#include <vector>

#include <Kokkos_View.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Parallel_Reduce.hpp>

namespace Kokkos {

namespace Impl {
// Functor that partitions the rows of a CRS graph into num_blocks
// contiguous row blocks, balancing the per-block work estimated as
// (number of entries in the row) + cost_per_row per row.  Writes the
// resulting block boundaries into row_block_offsets.
template <class RowOffsetsType, class RowBlockOffsetsType>
struct StaticCrsGraphBalancerFunctor {
  using int_type = typename RowOffsetsType::non_const_value_type;
  RowOffsetsType row_offsets;
  RowBlockOffsetsType row_block_offsets;

  int_type cost_per_row, num_blocks;

  StaticCrsGraphBalancerFunctor(RowOffsetsType row_offsets_,
                                RowBlockOffsetsType row_block_offsets_,
                                int_type cost_per_row_, int_type num_blocks_)
      : row_offsets(row_offsets_),
        row_block_offsets(row_block_offsets_),
        cost_per_row(cost_per_row_),
        num_blocks(num_blocks_) {}

  // Called once per row; each row decides whether it is the boundary of
  // one (or more) blocks, based on the cumulative cost up to this row.
  KOKKOS_INLINE_FUNCTION
  void operator()(const int_type& iRow) const {
    const int_type num_rows = row_offsets.extent(0) - 1;
    const int_type num_entries = row_offsets(num_rows);
    const int_type total_cost = num_entries + num_rows * cost_per_row;

    const double cost_per_workset = 1.0 * total_cost / num_blocks;

    const int_type row_cost =
        row_offsets(iRow + 1) - row_offsets(iRow) + cost_per_row;

    // Cumulative cost through the end of this row.
    int_type count = row_offsets(iRow + 1) + cost_per_row * iRow;

    // The last row always closes the final block.
    if (iRow == num_rows - 1) row_block_offsets(num_blocks) = num_rows;

    if (true) {
      int_type current_block =
          (count - row_cost - cost_per_row) / cost_per_workset;
      int_type end_block = count / cost_per_workset;

      // Handle some corner cases for the last two blocks.
      if (current_block >= num_blocks - 2) {
        if ((current_block == num_blocks - 2) &&
            (count >= (current_block + 1) * cost_per_workset)) {
          // Walk backwards to find where the block boundary actually fell,
          // then pick whichever side splits the remaining work more evenly.
          int_type row = iRow;
          int_type cc = count - row_cost - cost_per_row;
          int_type block = cc / cost_per_workset;
          while ((block > 0) && (block == current_block)) {
            cc = row_offsets(row) + row * cost_per_row;
            block = cc / cost_per_workset;
            row--;
          }
          if ((count - cc - row_cost - cost_per_row) <
              num_entries - row_offsets(iRow + 1)) {
            row_block_offsets(current_block + 1) = iRow + 1;
          } else {
            row_block_offsets(current_block + 1) = iRow;
          }
        }
      } else {
        // Normal case: this row crosses one or more block boundaries
        // (a single very expensive row may span several blocks).
        if ((count >= (current_block + 1) * cost_per_workset) ||
            (iRow + 2 == int_type(row_offsets.extent(0)))) {
          if (end_block > current_block + 1) {
            // Spread the spanned blocks' boundaries around this row.
            int_type num_block = end_block - current_block;
            row_block_offsets(current_block + 1) = iRow;
            for (int_type block = current_block + 2; block <= end_block;
                 block++)
              if ((block < current_block + 2 + (num_block - 1) / 2))
                row_block_offsets(block) = iRow;
              else
                row_block_offsets(block) = iRow + 1;
          } else {
            row_block_offsets(current_block + 1) = iRow + 1;
          }
        }
      }
    }
  }
};
}  // namespace Impl

/// \class GraphRowViewConst
/// \brief View of a row of a sparse graph.
/// \tparam GraphType Sparse graph type, such as (but not limited to)
///   StaticCrsGraph.
///
/// This class provides a generic view of a row of a sparse graph.
/// We intended this class to view a row of a StaticCrsGraph, but
/// GraphType need not necessarily be CrsMatrix.
///
/// The row view is suited for computational kernels like sparse
/// matrix-vector multiply, as well as for modifying entries in the
/// sparse matrix.
The view is always const as it does not allow graph
/// modification.
///
/// Here is an example loop over the entries in the row:
/// \code
/// using ordinal_type = typename GraphRowViewConst<MatrixType>::ordinal_type;
///
/// GraphRowView<GraphType> G_i = ...;
/// const ordinal_type numEntries = G_i.length;
/// for (ordinal_type k = 0; k < numEntries; ++k) {
///   ordinal_type j = G_i.colidx (k);
///   // ... do something with A_ij and j ...
/// }
/// \endcode
///
/// GraphType must provide the \c data_type
/// aliases. In addition, it must make sense to use GraphRowViewConst to
/// view a row of GraphType. In particular, column
/// indices of a row must be accessible using the <tt>entries</tt>
/// resp. <tt>colidx</tt> arrays given to the constructor of this
/// class, with a constant <tt>stride</tt> between successive entries.
/// The stride is one for the compressed sparse row storage format (as
/// is used by CrsMatrix), but may be greater than one for other
/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal).
template <class GraphType>
struct GraphRowViewConst {
  //! The type of the column indices in the row.
  using ordinal_type = const typename GraphType::data_type;

 private:
  //! Array of (local) column indices in the row.
  ordinal_type* colidx_;
  /// \brief Stride between successive entries in the row.
  ///
  /// For compressed sparse row (CSR) storage, this is always one.
  /// This might be greater than one for storage formats like ELLPACK
  /// or Jagged Diagonal.  Nevertheless, the stride can never be
  /// greater than the number of rows or columns in the matrix.  Thus,
  /// \c ordinal_type is the correct type.
  const ordinal_type stride_;

 public:
  /// \brief Constructor
  ///
  /// \param values [in] Array of the row's values.
  /// \param colidx [in] Array of the row's column indices.
  /// \param stride [in] (Constant) stride between matrix entries in
  ///   each of the above arrays.
  /// \param count [in] Number of entries in the row.
  KOKKOS_INLINE_FUNCTION
  GraphRowViewConst(ordinal_type* const colidx_in, const ordinal_type& stride,
                    const ordinal_type& count)
      : colidx_(colidx_in), stride_(stride), length(count) {}

  /// \brief Constructor with offset into \c colidx array
  ///
  /// \param colidx [in] Array of the row's column indices.
  /// \param stride [in] (Constant) stride between matrix entries in
  ///   each of the above arrays.
  /// \param count [in] Number of entries in the row.
  /// \param idx [in] Start offset into \c colidx array
  ///
  /// \tparam OffsetType The type of \c idx (see above).  Must be a
  ///   built-in integer type.  This may differ from ordinal_type.
  ///   For example, the matrix may have dimensions that fit in int,
  ///   but a number of entries that does not fit in int.
  template <class OffsetType>
  KOKKOS_INLINE_FUNCTION GraphRowViewConst(
      const typename GraphType::entries_type& colidx_in,
      const ordinal_type& stride, const ordinal_type& count,
      const OffsetType& idx,
      const typename std::enable_if<std::is_integral<OffsetType>::value,
                                    int>::type& = 0)
      : colidx_(&colidx_in(idx)), stride_(stride), length(count) {}

  /// \brief Number of entries in the row.
  ///
  /// This is a public const field rather than a public const method,
  /// in order to avoid possible overhead of a method call if the
  /// compiler is unable to inline that method call.
  ///
  /// We assume that rows contain no duplicate entries (i.e., entries
  /// with the same column index).  Thus, a row may have up to
  /// A.numCols() entries.  This means that the correct type of
  /// 'length' is ordinal_type.
  const ordinal_type length;

  /// \brief (Const) reference to the column index of entry i in this
  ///   row of the sparse matrix.
  ///
  /// "Entry i" is not necessarily the entry with column index i, nor
  /// does i necessarily correspond to the (local) row index.
  KOKKOS_INLINE_FUNCTION
  ordinal_type& colidx(const ordinal_type& i) const {
    // stride_ accounts for non-CSR layouts (e.g. ELLPACK); it is 1 for CSR.
    return colidx_[i * stride_];
  }

  /// \brief An alias for colidx
  KOKKOS_INLINE_FUNCTION
  ordinal_type& operator()(const ordinal_type& i) const { return colidx(i); }
};

/// \class StaticCrsGraph
/// \brief Compressed row storage array.
///
/// \tparam DataType The type of stored entries.  If a StaticCrsGraph is
///   used as the graph of a sparse matrix, then this is usually an
///   integer type, the type of the column indices in the sparse
///   matrix.
///
/// \tparam Arg1Type The second template parameter, corresponding
///   either to the Device type (if there are no more template
///   parameters) or to the Layout type (if there is at least one more
///   template parameter).
///
/// \tparam Arg2Type The third template parameter, which if provided
///   corresponds to the Device type.
///
/// \tparam Arg3Type The third template parameter, which if provided
///   corresponds to the MemoryTraits.
///
/// \tparam SizeType The type of row offsets.  Usually the default
///   parameter suffices.  However, setting a nondefault value is
///   necessary in some cases, for example, if you want to have a
///   sparse matrices with dimensions (and therefore column indices)
///   that fit in \c int, but want to store more than <tt>INT_MAX</tt>
///   entries in the sparse matrix.
///
/// A row has a range of entries:
/// <ul>
/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li>
/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ...
); </tt> </li>
/// </ul>
template <class DataType, class Arg1Type, class Arg2Type = void,
          class Arg3Type = void,
          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type,
                                                  Arg3Type>::size_type>
class StaticCrsGraph {
 private:
  using traits = ViewTraits<DataType*, Arg1Type, Arg2Type, Arg3Type>;

 public:
  using data_type = DataType;
  using array_layout = typename traits::array_layout;
  using execution_space = typename traits::execution_space;
  using device_type = typename traits::device_type;
  using memory_traits = typename traits::memory_traits;
  using size_type = SizeType;

  using staticcrsgraph_type =
      StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>;
  using HostMirror = StaticCrsGraph<data_type, array_layout,
                                    typename traits::host_mirror_space,
                                    memory_traits, size_type>;

  using row_map_type =
      View<const size_type*, array_layout, device_type, memory_traits>;
  using entries_type =
      View<data_type*, array_layout, device_type, memory_traits>;
  using row_block_type =
      View<const size_type*, array_layout, device_type, memory_traits>;

  // CRS storage: entries holds all column indices; row_map(i)..row_map(i+1)
  // delimits row i.  row_block_offsets is an optional row partitioning
  // produced by create_block_partitioning().
  entries_type entries;
  row_map_type row_map;
  row_block_type row_block_offsets;

  //! Construct an empty view.
  KOKKOS_INLINE_FUNCTION
  StaticCrsGraph() : entries(), row_map(), row_block_offsets() {}

  //! Copy constructor (shallow copy).
  KOKKOS_INLINE_FUNCTION
  StaticCrsGraph(const StaticCrsGraph& rhs)
      : entries(rhs.entries),
        row_map(rhs.row_map),
        row_block_offsets(rhs.row_block_offsets) {}

  template <class EntriesType, class RowMapType>
  KOKKOS_INLINE_FUNCTION StaticCrsGraph(const EntriesType& entries_,
                                        const RowMapType& row_map_)
      : entries(entries_), row_map(row_map_), row_block_offsets() {}

  /** \brief  Assign to a view of the rhs array.
   *          If the old view is the last view
   *          then allocated memory is deallocated.
   */
  KOKKOS_INLINE_FUNCTION
  StaticCrsGraph& operator=(const StaticCrsGraph& rhs) {
    entries = rhs.entries;
    row_map = rhs.row_map;
    row_block_offsets = rhs.row_block_offsets;
    return *this;
  }

  /**  \brief  Destroy this view of the array.
   *           If the last view then allocated memory is deallocated.
   */
  KOKKOS_DEFAULTED_FUNCTION
  ~StaticCrsGraph() = default;

  /**  \brief  Return number of rows in the graph
   */
  KOKKOS_INLINE_FUNCTION
  size_type numRows() const {
    // row_map has numRows()+1 entries when allocated; 0 rows when empty.
    return (row_map.extent(0) != 0)
               ? row_map.extent(0) - static_cast<size_type>(1)
               : static_cast<size_type>(0);
  }

  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
    return (row_map.is_allocated() && entries.is_allocated());
  }

  /// \brief Return a const view of row i of the graph.
  ///
  /// If row i does not belong to the graph, return an empty view.
  ///
  /// The returned object \c view implements the following interface:
  /// <ul>
  /// <li> \c view.length is the number of entries in the row </li>
  /// <li> \c view.colidx(k) returns a const reference to the
  ///      column index of the k-th entry in the row </li>
  /// </ul>
  /// k is not a column index; it just counts from 0 to
  /// <tt>view.length - 1</tt>.
  ///
  /// Users should not rely on the return type of this method.  They
  /// should instead assign to 'auto'.  That allows compile-time
  /// polymorphism for different kinds of sparse matrix formats (e.g.,
  /// ELLPACK or Jagged Diagonal) that we may wish to support in the
  /// future.
  KOKKOS_INLINE_FUNCTION
  GraphRowViewConst<StaticCrsGraph> rowConst(const data_type i) const {
    const size_type start = row_map(i);
    // count is guaranteed to fit in ordinal_type, as long as no row
    // has duplicate entries.
    const data_type count = static_cast<data_type>(row_map(i + 1) - start);

    if (count == 0) {
      return GraphRowViewConst<StaticCrsGraph>(nullptr, 1, 0);
    } else {
      return GraphRowViewConst<StaticCrsGraph>(entries, 1, count, start);
    }
  }

  /**  \brief  Create a row partitioning into a given number of blocks
   *           balancing non-zeros + a fixed cost per row.
   */
  void create_block_partitioning(size_type num_blocks,
                                 size_type fix_cost_per_row = 4) {
    // NOTE(review): label spells "StatisCrsGraph" (sic) — a runtime string,
    // left unchanged here.
    View<size_type*, array_layout, device_type> block_offsets(
        "StatisCrsGraph::load_balance_offsets", num_blocks + 1);

    Impl::StaticCrsGraphBalancerFunctor<
        row_map_type, View<size_type*, array_layout, device_type> >
        partitioner(row_map, block_offsets, fix_cost_per_row, num_blocks);

    Kokkos::parallel_for("Kokkos::StaticCrsGraph::create_block_partitioning",
                         Kokkos::RangePolicy<execution_space>(0, numRows()),
                         partitioner);
    // Fence so block_offsets is fully written before it is published below.
    typename device_type::execution_space().fence();

    row_block_offsets = block_offsets;
  }
};

//----------------------------------------------------------------------------

// Factory declarations; definitions live in Kokkos_StaticCrsGraph_factory.hpp
// (included further down).
template <class StaticCrsGraphType, class InputSizeType>
typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(
    const std::string& label, const std::vector<InputSizeType>& input);

template <class StaticCrsGraphType, class InputSizeType>
typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(
    const std::string& label,
    const std::vector<std::vector<InputSizeType> >& input);

//----------------------------------------------------------------------------

template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                        SizeType>::HostMirror
create_mirror_view(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                                        SizeType>& input);

template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
typename
 StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                        SizeType>::HostMirror
create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                                   SizeType>& input);

}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#include <impl/Kokkos_StaticCrsGraph_factory.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// Max-reduction functor over the graph's entries array, used by
// maximum_entry() below.
template <class GraphType>
struct StaticCrsGraphMaximumEntry {
  using execution_space = typename GraphType::execution_space;
  using value_type = typename GraphType::data_type;

  const typename GraphType::entries_type entries;

  StaticCrsGraphMaximumEntry(const GraphType& graph) : entries(graph.entries) {}

  KOKKOS_INLINE_FUNCTION
  void operator()(const unsigned i, value_type& update) const {
    if (update < entries(i)) update = entries(i);
  }

  // Reduction identity: 0 (see note in maximum_entry about the result
  // when all entries are negative or the graph is empty).
  KOKKOS_INLINE_FUNCTION
  void init(value_type& update) const { update = 0; }

  KOKKOS_INLINE_FUNCTION
  void join(volatile value_type& update,
            volatile const value_type& input) const {
    if (update < input) update = input;
  }
};

}  // namespace Impl

/// \brief Return the largest value stored in graph.entries.
///
/// NOTE(review): the reduction identity is 0, so the result is 0 (not the
/// true maximum) for an empty graph or one whose entries are all negative.
template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
DataType maximum_entry(const StaticCrsGraph<DataType, Arg1Type, Arg2Type,
                                            Arg3Type, SizeType>& graph) {
  using GraphType =
      StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>;
  using FunctorType = Impl::StaticCrsGraphMaximumEntry<GraphType>;

  DataType result = 0;
  Kokkos::parallel_reduce("Kokkos::maximum_entry", graph.entries.extent(0),
                          FunctorType(graph), result);
  return result;
}

}  // namespace Kokkos

//----------------------------------------------------------------------------
+//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_CRSARRAY_HPP */ diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp new file mode 100644 index 0000000000000000000000000000000000000000..edb0e7261da93bb629cad4e9cc7c7d3118868288 --- /dev/null +++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -0,0 +1,841 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

/// \file Kokkos_UnorderedMap.hpp
/// \brief Declaration and definition of Kokkos::UnorderedMap.
///
/// This header file declares and defines Kokkos::UnorderedMap and its
/// related nonmember functions.

#ifndef KOKKOS_UNORDERED_MAP_HPP
#define KOKKOS_UNORDERED_MAP_HPP

#include <Kokkos_Core.hpp>
#include <Kokkos_Functional.hpp>

#include <Kokkos_Bitset.hpp>

#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_UnorderedMap_impl.hpp>

#include <iostream>

#include <cstdint>
#include <stdexcept>

namespace Kokkos {

enum : unsigned { UnorderedMapInvalidIndex = ~0u };

/// \brief First element of the return value of UnorderedMap::insert().
///
/// Inserting an element into an UnorderedMap is not guaranteed to
/// succeed.  There are three possible conditions:
/// <ol>
/// <li> <tt>INSERT_FAILED</tt>: The insert failed.  This usually
///      means that the UnorderedMap ran out of space. </li>
/// <li> <tt>INSERT_SUCCESS</tt>: The insert succeeded, and the key
///      did <i>not</i> exist in the table before. </li>
/// <li> <tt>INSERT_EXISTING</tt>: The insert succeeded, and the key
///      <i>did</i> exist in the table before.  The new value was
///      ignored and the old value was left in place. </li>
/// </ol>

class UnorderedMapInsertResult {
 private:
  // m_status bit layout: the top three bits are flags; the remaining low
  // bits (LIST_LENGTH_MASK) count iterations through the insert loop.
  enum Status : uint32_t {
    SUCCESS = 1u << 31,
    EXISTING = 1u << 30,
    FREED_EXISTING = 1u << 29,
    LIST_LENGTH_MASK = ~(SUCCESS | EXISTING | FREED_EXISTING)
  };

 public:
  /// Did the map successfully insert the key/value pair
  KOKKOS_FORCEINLINE_FUNCTION
  bool success() const { return (m_status & SUCCESS); }

  /// Was the key already present in the map
  KOKKOS_FORCEINLINE_FUNCTION
  bool existing() const { return (m_status & EXISTING); }

  /// Did the map fail to insert the key due to insufficient capacity
  KOKKOS_FORCEINLINE_FUNCTION
  bool failed() const { return m_index == UnorderedMapInvalidIndex; }

  /// Did the map lose a race condition to insert a duplicate key/value pair
  /// where an index was claimed that needed to be released
  KOKKOS_FORCEINLINE_FUNCTION
  bool freed_existing() const { return (m_status & FREED_EXISTING); }

  /// How many iterations through the insert loop did it take before the
  /// map returned
  KOKKOS_FORCEINLINE_FUNCTION
  uint32_t list_position() const { return (m_status & LIST_LENGTH_MASK); }

  /// Index where the key can be found as long as the insert did not fail
  KOKKOS_FORCEINLINE_FUNCTION
  uint32_t index() const { return m_index; }

  KOKKOS_FORCEINLINE_FUNCTION
  UnorderedMapInsertResult() : m_index(UnorderedMapInvalidIndex), m_status(0) {}

  KOKKOS_FORCEINLINE_FUNCTION
  void increment_list_position() {
    // Saturates at LIST_LENGTH_MASK so the counter never bleeds into the
    // flag bits.
    m_status += (list_position() < LIST_LENGTH_MASK) ? 1u : 0u;
  }

  KOKKOS_FORCEINLINE_FUNCTION
  void set_existing(uint32_t i, bool arg_freed_existing) {
    m_index = i;
    m_status =
        EXISTING | (arg_freed_existing ? FREED_EXISTING : 0u) | list_position();
  }

  KOKKOS_FORCEINLINE_FUNCTION
  void set_success(uint32_t i) {
    m_index = i;
    m_status = SUCCESS | list_position();
  }

 private:
  uint32_t m_index;
  uint32_t m_status;
};

/// \class UnorderedMap
/// \brief Thread-safe, performance-portable lookup table.
+/// +/// This class provides a lookup table. In terms of functionality, +/// this class compares to std::unordered_map (new in C++11). +/// "Unordered" means that keys are not stored in any particular +/// order, unlike (for example) std::map. "Thread-safe" means that +/// lookups, insertion, and deletion are safe to call by multiple +/// threads in parallel. "Performance-portable" means that parallel +/// performance of these operations is reasonable, on multiple +/// hardware platforms. Platforms on which performance has been +/// tested include conventional Intel x86 multicore processors, Intel +/// Xeon Phi ("MIC"), and NVIDIA GPUs. +/// +/// Parallel performance portability entails design decisions that +/// might differ from one's expectation for a sequential interface. +/// This particularly affects insertion of single elements. In an +/// interface intended for sequential use, insertion might reallocate +/// memory if the original allocation did not suffice to hold the new +/// element. In this class, insertion does <i>not</i> reallocate +/// memory. This means that it might fail. insert() returns an enum +/// which indicates whether the insert failed. There are three +/// possible conditions: +/// <ol> +/// <li> <tt>INSERT_FAILED</tt>: The insert failed. This usually +/// means that the UnorderedMap ran out of space. </li> +/// <li> <tt>INSERT_SUCCESS</tt>: The insert succeeded, and the key +/// did <i>not</i> exist in the table before. </li> +/// <li> <tt>INSERT_EXISTING</tt>: The insert succeeded, and the key +/// <i>did</i> exist in the table before. The new value was +/// ignored and the old value was left in place. </li> +/// </ol> +/// +/// \tparam Key Type of keys of the lookup table. If \c const, users +/// are not allowed to add or remove keys, though they are allowed +/// to change values. In that case, the implementation may make +/// optimizations specific to the <tt>Device</tt>. 
For example, if +/// <tt>Device</tt> is \c Cuda, it may use texture fetches to access +/// keys. +/// +/// \tparam Value Type of values stored in the lookup table. You may use +/// \c void here, in which case the table will be a set of keys. If +/// \c const, users are not allowed to change entries. +/// In that case, the implementation may make +/// optimizations specific to the \c Device, such as using texture +/// fetches to access values. +/// +/// \tparam Device The Kokkos Device type. +/// +/// \tparam Hasher Definition of the hash function for instances of +/// <tt>Key</tt>. The default will calculate a bitwise hash. +/// +/// \tparam EqualTo Definition of the equality function for instances of +/// <tt>Key</tt>. The default will do a bitwise equality comparison. +/// +template <typename Key, typename Value, + typename Device = Kokkos::DefaultExecutionSpace, + typename Hasher = pod_hash<typename std::remove_const<Key>::type>, + typename EqualTo = + pod_equal_to<typename std::remove_const<Key>::type> > +class UnorderedMap { + private: + using host_mirror_space = + typename ViewTraits<Key, Device, void, void>::host_mirror_space; + + public: + //! 
\name Public types and constants + //@{ + + // key_types + using declared_key_type = Key; + using key_type = typename std::remove_const<declared_key_type>::type; + using const_key_type = typename std::add_const<key_type>::type; + + // value_types + using declared_value_type = Value; + using value_type = typename std::remove_const<declared_value_type>::type; + using const_value_type = typename std::add_const<value_type>::type; + + using device_type = Device; + using execution_space = typename Device::execution_space; + using hasher_type = Hasher; + using equal_to_type = EqualTo; + using size_type = uint32_t; + + // map_types + using declared_map_type = + UnorderedMap<declared_key_type, declared_value_type, device_type, + hasher_type, equal_to_type>; + using insertable_map_type = UnorderedMap<key_type, value_type, device_type, + hasher_type, equal_to_type>; + using modifiable_map_type = + UnorderedMap<const_key_type, value_type, device_type, hasher_type, + equal_to_type>; + using const_map_type = UnorderedMap<const_key_type, const_value_type, + device_type, hasher_type, equal_to_type>; + + static const bool is_set = std::is_same<void, value_type>::value; + static const bool has_const_key = + std::is_same<const_key_type, declared_key_type>::value; + static const bool has_const_value = + is_set || std::is_same<const_value_type, declared_value_type>::value; + + static const bool is_insertable_map = + !has_const_key && (is_set || !has_const_value); + static const bool is_modifiable_map = has_const_key && !has_const_value; + static const bool is_const_map = has_const_key && has_const_value; + + using insert_result = UnorderedMapInsertResult; + + using HostMirror = + UnorderedMap<Key, Value, host_mirror_space, Hasher, EqualTo>; + + using histogram_type = Impl::UnorderedMapHistogram<const_map_type>; + + //@} + + private: + enum : size_type { invalid_index = ~static_cast<size_type>(0) }; + + using impl_value_type = std::conditional_t<is_set, int, declared_value_type>; + + 
using key_type_view = std::conditional_t< + is_insertable_map, View<key_type *, device_type>, + View<const key_type *, device_type, MemoryTraits<RandomAccess> > >; + + using value_type_view = std::conditional_t< + is_insertable_map || is_modifiable_map, + View<impl_value_type *, device_type>, + View<const impl_value_type *, device_type, MemoryTraits<RandomAccess> > >; + + using size_type_view = std::conditional_t< + is_insertable_map, View<size_type *, device_type>, + View<const size_type *, device_type, MemoryTraits<RandomAccess> > >; + + using bitset_type = + std::conditional_t<is_insertable_map, Bitset<execution_space>, + ConstBitset<execution_space> >; + + enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 }; + enum { num_scalars = 3 }; + using scalars_view = View<int[num_scalars], LayoutLeft, device_type>; + + public: + //! \name Public member functions + //@{ + + /// \brief Constructor + /// + /// \param capacity_hint [in] Initial guess of how many unique keys will be + /// inserted into the map \param hash [in] Hasher function for \c Key + /// instances. The + /// default value usually suffices. + UnorderedMap(size_type capacity_hint = 0, hasher_type hasher = hasher_type(), + equal_to_type equal_to = equal_to_type()) + : m_bounded_insert(true), + m_hasher(hasher), + m_equal_to(equal_to), + m_size(), + m_available_indexes(calculate_capacity(capacity_hint)), + m_hash_lists(view_alloc(WithoutInitializing, "UnorderedMap hash list"), + Impl::find_hash_size(capacity())), + m_next_index(view_alloc(WithoutInitializing, "UnorderedMap next index"), + capacity() + 1) // +1 so that the *_at functions can + // always return a valid reference + , + m_keys("UnorderedMap keys", capacity() + 1), + m_values("UnorderedMap values", (is_set ? 1 : capacity() + 1)), + m_scalars("UnorderedMap scalars") { + if (!is_insertable_map) { + throw std::runtime_error( + "Cannot construct a non-insertable (i.e. 
const key_type) "
+        "unordered_map");
+  }
+
+  // Mark every hash bucket and every next-pointer as empty (invalid_index).
+  Kokkos::deep_copy(m_hash_lists, invalid_index);
+  Kokkos::deep_copy(m_next_index, invalid_index);
+}
+
+//! Clear the sticky "an insert failed" flag so later inserts can be trusted.
+void reset_failed_insert_flag() { reset_flag(failed_insert_idx); }
+
+//! Build a histogram of hash-list lengths (a load-factor diagnostic).
+histogram_type get_histogram() { return histogram_type(*this); }
+
+//! Clear all entries in the table.
+void clear() {
+  m_bounded_insert = true;
+
+  if (capacity() == 0) return;
+
+  // Release every claimed slot, then reset all chains to empty.
+  m_available_indexes.clear();
+
+  Kokkos::deep_copy(m_hash_lists, invalid_index);
+  Kokkos::deep_copy(m_next_index, invalid_index);
+  {
+    const key_type tmp = key_type();
+    Kokkos::deep_copy(m_keys, tmp);
+  }
+  // NOTE(review): values are default-filled only when is_set is true, i.e.
+  // when m_values is the single dummy entry of a set.  Resetting values for
+  // a real map (!is_set) appears to be skipped -- confirm this is intended
+  // (erased slots are guarded by m_available_indexes, so stale values may
+  // be harmless).
+  if (is_set) {
+    const impl_value_type tmp = impl_value_type();
+    Kokkos::deep_copy(m_values, tmp);
+  }
+  { Kokkos::deep_copy(m_scalars, 0); }
+}
+
+//! True once every backing view (keys, values, scalars) has storage.
+KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
+  return (m_keys.is_allocated() && m_values.is_allocated() &&
+          m_scalars.is_allocated());
+}
+
+/// \brief Change the capacity of the map
+///
+/// If there are no failed inserts the current size of the map will
+/// be used as a lower bound for the input capacity.
+/// If the map is not empty and does not have failed inserts
+/// and the capacity changes then the current data is copied
+/// into the resized / rehashed map.
+///
+/// This is <i>not</i> a device function; it may <i>not</i> be
+/// called in a parallel kernel.
+bool rehash(size_type requested_capacity = 0) {
+  // Bounded insert (capped probe count) is only safe for a fresh/empty map.
+  const bool bounded_insert = (capacity() == 0) || (size() == 0u);
+  return rehash(requested_capacity, bounded_insert);
+}
+
+bool rehash(size_type requested_capacity, bool bounded_insert) {
+  if (!is_insertable_map) return false;
+
+  // Never shrink below the number of entries currently stored.
+  const size_type curr_size = size();
+  requested_capacity =
+      (requested_capacity < curr_size) ?
curr_size : requested_capacity; + + insertable_map_type tmp(requested_capacity, m_hasher, m_equal_to); + + if (curr_size) { + tmp.m_bounded_insert = false; + Impl::UnorderedMapRehash<insertable_map_type> f(tmp, *this); + f.apply(); + } + tmp.m_bounded_insert = bounded_insert; + + *this = tmp; + + return true; + } + + /// \brief The number of entries in the table. + /// + /// This method has undefined behavior when erasable() is true. + /// + /// Note that this is not a device function; it cannot be called in + /// a parallel kernel. The value is not stored as a variable; it + /// must be computed. + size_type size() const { + if (capacity() == 0u) return 0u; + if (modified()) { + m_size = m_available_indexes.count(); + reset_flag(modified_idx); + } + return m_size; + } + + /// \brief The current number of failed insert() calls. + /// + /// This is <i>not</i> a device function; it may <i>not</i> be + /// called in a parallel kernel. The value is not stored as a + /// variable; it must be computed. + bool failed_insert() const { return get_flag(failed_insert_idx); } + + bool erasable() const { + return is_insertable_map ? get_flag(erasable_idx) : false; + } + + bool begin_erase() { + bool result = !erasable(); + if (is_insertable_map && result) { + execution_space().fence(); + set_flag(erasable_idx); + execution_space().fence(); + } + return result; + } + + bool end_erase() { + bool result = erasable(); + if (is_insertable_map && result) { + execution_space().fence(); + Impl::UnorderedMapErase<declared_map_type> f(*this); + f.apply(); + execution_space().fence(); + reset_flag(erasable_idx); + } + return result; + } + + /// \brief The maximum number of entries that the table can hold. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_FORCEINLINE_FUNCTION + size_type capacity() const { return m_available_indexes.size(); } + + /// \brief The number of hash table "buckets." 
+ /// + /// This is different than the number of entries that the table can + /// hold. Each key hashes to an index in [0, hash_capacity() - 1]. + /// That index can hold zero or more entries. This class decides + /// what hash_capacity() should be, given the user's upper bound on + /// the number of entries the table must be able to hold. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_INLINE_FUNCTION + size_type hash_capacity() const { return m_hash_lists.extent(0); } + + //--------------------------------------------------------------------------- + //--------------------------------------------------------------------------- + + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. As discussed in the class documentation, it need not + /// succeed. The return value tells you if it did. + /// + /// \param k [in] The key to attempt to insert. + /// \param v [in] The corresponding value to attempt to insert. If + /// using this class as a set (with Value = void), then you need not + /// provide this value. 
+ KOKKOS_INLINE_FUNCTION + insert_result insert(key_type const &k, + impl_value_type const &v = impl_value_type()) const { + insert_result result; + + if (!is_insertable_map || capacity() == 0u || + m_scalars((int)erasable_idx)) { + return result; + } + + if (!m_scalars((int)modified_idx)) { + m_scalars((int)modified_idx) = true; + } + + int volatile &failed_insert_ref = m_scalars((int)failed_insert_idx); + + const size_type hash_value = m_hasher(k); + const size_type hash_list = hash_value % m_hash_lists.extent(0); + + size_type *curr_ptr = &m_hash_lists[hash_list]; + size_type new_index = invalid_index; + + // Force integer multiply to long + size_type index_hint = static_cast<size_type>( + (static_cast<double>(hash_list) * capacity()) / m_hash_lists.extent(0)); + + size_type find_attempts = 0; + + enum : unsigned { bounded_find_attempts = 32u }; + const size_type max_attempts = + (m_bounded_insert && + (bounded_find_attempts < m_available_indexes.max_hint())) + ? bounded_find_attempts + : m_available_indexes.max_hint(); + + bool not_done = true; + +#if defined(__MIC__) +#pragma noprefetch +#endif + while (not_done) { + // Continue searching the unordered list for this key, + // list will only be appended during insert phase. + // Need volatile_load as other threads may be appending. + size_type curr = volatile_load(curr_ptr); + + KOKKOS_NONTEMPORAL_PREFETCH_LOAD( + &m_keys[curr != invalid_index ? curr : 0]); +#if defined(__MIC__) +#pragma noprefetch +#endif + while (curr != invalid_index && + !m_equal_to(volatile_load(&m_keys[curr]), k)) { + result.increment_list_position(); + index_hint = curr; + curr_ptr = &m_next_index[curr]; + curr = volatile_load(curr_ptr); + KOKKOS_NONTEMPORAL_PREFETCH_LOAD( + &m_keys[curr != invalid_index ? curr : 0]); + } + + //------------------------------------------------------------ + // If key already present then return that index. 
+      if (curr != invalid_index) {
+        // Key already present.  If this thread had speculatively claimed a
+        // fresh slot for it, that slot must be given back.
+        const bool free_existing = new_index != invalid_index;
+        if (free_existing) {
+          // Previously claimed an unused entry that was not inserted.
+          // Release this unused entry immediately.
+          if (!m_available_indexes.reset(new_index)) {
+            KOKKOS_IMPL_DO_NOT_USE_PRINTF("Unable to free existing\n");
+          }
+        }
+
+        result.set_existing(curr, free_existing);
+        not_done = false;
+      }
+      //------------------------------------------------------------
+      // Key is not currently in the map.
+      // If the thread has claimed an entry try to insert now.
+      else {
+        //------------------------------------------------------------
+        // If have not already claimed an unused entry then do so now.
+        if (new_index == invalid_index) {
+          bool found = false;
+          // use the hash_list as the flag for the search direction
+          Kokkos::tie(found, index_hint) =
+              m_available_indexes.find_any_unset_near(index_hint, hash_list);
+
+          // found an index and this thread set it
+          if (!found && ++find_attempts >= max_attempts) {
+            // Probe budget exhausted: record the failure (sticky flag read
+            // by failed_insert()) and give up on this insert.
+            failed_insert_ref = true;
+            not_done = false;
+          } else if (m_available_indexes.set(index_hint)) {
+            new_index = index_hint;
+            // Set key and value
+            KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_keys[new_index]);
+            m_keys[new_index] = k;
+
+            if (!is_set) {
+              KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_values[new_index]);
+              m_values[new_index] = v;
+            }
+
+            // Do not proceed until key and value are updated in global memory
+            memory_fence();
+          }
+        } else if (failed_insert_ref) {
+          // Another thread already flagged failure; stop retrying.
+          not_done = false;
+        }
+
+        // Attempt to append claimed entry into the list.
+        // Another thread may also be trying to append the same list so protect
+        // with atomic.
+ if (new_index != invalid_index && + curr == atomic_compare_exchange( + curr_ptr, static_cast<size_type>(invalid_index), + new_index)) { + // Succeeded in appending + result.set_success(new_index); + not_done = false; + } + } + } // while ( not_done ) + + return result; + } + + KOKKOS_INLINE_FUNCTION + bool erase(key_type const &k) const { + bool result = false; + + if (is_insertable_map && 0u < capacity() && m_scalars((int)erasable_idx)) { + if (!m_scalars((int)modified_idx)) { + m_scalars((int)modified_idx) = true; + } + + size_type index = find(k); + if (valid_at(index)) { + m_available_indexes.reset(index); + result = true; + } + } + + return result; + } + + /// \brief Find the given key \c k, if it exists in the table. + /// + /// \return If the key exists in the table, the index of the + /// value corresponding to that key; otherwise, an invalid index. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_INLINE_FUNCTION + size_type find(const key_type &k) const { + size_type curr = 0u < capacity() + ? m_hash_lists(m_hasher(k) % m_hash_lists.extent(0)) + : invalid_index; + + KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]); + while (curr != invalid_index && !m_equal_to(m_keys[curr], k)) { + KOKKOS_NONTEMPORAL_PREFETCH_LOAD( + &m_keys[curr != invalid_index ? curr : 0]); + curr = m_next_index[curr]; + } + + return curr; + } + + /// \brief Does the key exist in the map + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_INLINE_FUNCTION + bool exists(const key_type &k) const { return valid_at(find(k)); } + + /// \brief Get the value with \c i as its direct index. + /// + /// \param i [in] Index directly into the array of entries. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + /// + /// 'const value_type' via Cuda texture fetch must return by value. 
+ KOKKOS_FORCEINLINE_FUNCTION + std::conditional_t<(is_set || has_const_value), impl_value_type, + impl_value_type &> + value_at(size_type i) const { + return m_values[is_set ? 0 : (i < capacity() ? i : capacity())]; + } + + /// \brief Get the key with \c i as its direct index. + /// + /// \param i [in] Index directly into the array of entries. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_FORCEINLINE_FUNCTION + key_type key_at(size_type i) const { + return m_keys[i < capacity() ? i : capacity()]; + } + + KOKKOS_FORCEINLINE_FUNCTION + bool valid_at(size_type i) const { return m_available_indexes.test(i); } + + template <typename SKey, typename SValue> + UnorderedMap( + UnorderedMap<SKey, SValue, Device, Hasher, EqualTo> const &src, + typename std::enable_if< + Impl::UnorderedMapCanAssign<declared_key_type, declared_value_type, + SKey, SValue>::value, + int>::type = 0) + : m_bounded_insert(src.m_bounded_insert), + m_hasher(src.m_hasher), + m_equal_to(src.m_equal_to), + m_size(src.m_size), + m_available_indexes(src.m_available_indexes), + m_hash_lists(src.m_hash_lists), + m_next_index(src.m_next_index), + m_keys(src.m_keys), + m_values(src.m_values), + m_scalars(src.m_scalars) {} + + template <typename SKey, typename SValue> + typename std::enable_if< + Impl::UnorderedMapCanAssign<declared_key_type, declared_value_type, SKey, + SValue>::value, + declared_map_type &>::type + operator=(UnorderedMap<SKey, SValue, Device, Hasher, EqualTo> const &src) { + m_bounded_insert = src.m_bounded_insert; + m_hasher = src.m_hasher; + m_equal_to = src.m_equal_to; + m_size = src.m_size; + m_available_indexes = src.m_available_indexes; + m_hash_lists = src.m_hash_lists; + m_next_index = src.m_next_index; + m_keys = src.m_keys; + m_values = src.m_values; + m_scalars = src.m_scalars; + return *this; + } + + template <typename SKey, typename SValue, typename SDevice> + typename std::enable_if< + std::is_same<typename 
std::remove_const<SKey>::type, key_type>::value && + std::is_same<typename std::remove_const<SValue>::type, + value_type>::value>::type + create_copy_view( + UnorderedMap<SKey, SValue, SDevice, Hasher, EqualTo> const &src) { + if (m_hash_lists.data() != src.m_hash_lists.data()) { + insertable_map_type tmp; + + tmp.m_bounded_insert = src.m_bounded_insert; + tmp.m_hasher = src.m_hasher; + tmp.m_equal_to = src.m_equal_to; + tmp.m_size = src.size(); + tmp.m_available_indexes = bitset_type(src.capacity()); + tmp.m_hash_lists = size_type_view( + view_alloc(WithoutInitializing, "UnorderedMap hash list"), + src.m_hash_lists.extent(0)); + tmp.m_next_index = size_type_view( + view_alloc(WithoutInitializing, "UnorderedMap next index"), + src.m_next_index.extent(0)); + tmp.m_keys = + key_type_view(view_alloc(WithoutInitializing, "UnorderedMap keys"), + src.m_keys.extent(0)); + tmp.m_values = value_type_view( + view_alloc(WithoutInitializing, "UnorderedMap values"), + src.m_values.extent(0)); + tmp.m_scalars = scalars_view("UnorderedMap scalars"); + + Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes); + + using raw_deep_copy = + Kokkos::Impl::DeepCopy<typename device_type::memory_space, + typename SDevice::memory_space>; + + raw_deep_copy(tmp.m_hash_lists.data(), src.m_hash_lists.data(), + sizeof(size_type) * src.m_hash_lists.extent(0)); + raw_deep_copy(tmp.m_next_index.data(), src.m_next_index.data(), + sizeof(size_type) * src.m_next_index.extent(0)); + raw_deep_copy(tmp.m_keys.data(), src.m_keys.data(), + sizeof(key_type) * src.m_keys.extent(0)); + if (!is_set) { + raw_deep_copy(tmp.m_values.data(), src.m_values.data(), + sizeof(impl_value_type) * src.m_values.extent(0)); + } + raw_deep_copy(tmp.m_scalars.data(), src.m_scalars.data(), + sizeof(int) * num_scalars); + + *this = tmp; + } + } + + //@} + private: // private member functions + bool modified() const { return get_flag(modified_idx); } + + void set_flag(int flag) const { + using raw_deep_copy = + 
Kokkos::Impl::DeepCopy<typename device_type::memory_space,
+                             Kokkos::HostSpace>;
+  const int true_ = true;
+  raw_deep_copy(m_scalars.data() + flag, &true_, sizeof(int));
+}
+
+// Host-side write of `false` into one slot of the device-resident scalars.
+void reset_flag(int flag) const {
+  using raw_deep_copy =
+      Kokkos::Impl::DeepCopy<typename device_type::memory_space,
+                             Kokkos::HostSpace>;
+  const int false_ = false;
+  raw_deep_copy(m_scalars.data() + flag, &false_, sizeof(int));
+}
+
+// Host-side read of one flag from the device-resident scalars view.
+bool get_flag(int flag) const {
+  using raw_deep_copy =
+      Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
+                             typename device_type::memory_space>;
+  int result = false;
+  raw_deep_copy(&result, m_scalars.data() + flag, sizeof(int));
+  return result;
+}
+
+// Pad the user's hint so collisions stay rare and allocation is block-sized.
+static uint32_t calculate_capacity(uint32_t capacity_hint) {
+  // increase by ~17% (factor 7/6) and round up to the nearest multiple of 128
+  return capacity_hint
+             ? ((static_cast<uint32_t>(7ull * capacity_hint / 6u) + 127u) /
+                128u) *
+                   128u
+             : 128u;
+}
+
+private:  // private members
+bool m_bounded_insert;         // cap probe attempts during insert
+hasher_type m_hasher;
+equal_to_type m_equal_to;
+mutable size_type m_size;      // cached count; recomputed when modified()
+bitset_type m_available_indexes;  // one bit per slot: claimed / free
+size_type_view m_hash_lists;   // bucket heads (index into m_keys/m_values)
+size_type_view m_next_index;   // per-slot chain links (+1 sentinel slot)
+key_type_view m_keys;
+value_type_view m_values;      // single dummy entry when is_set
+scalars_view m_scalars;        // [modified, erasable, failed_insert] flags
+
+template <typename KKey, typename VValue, typename DDevice, typename HHash,
+          typename EEqualTo>
+friend class UnorderedMap;
+
+template <typename UMap>
+friend struct Impl::UnorderedMapErase;
+
+template <typename UMap>
+friend struct Impl::UnorderedMapHistogram;
+
+template <typename UMap>
+friend struct Impl::UnorderedMapPrint;
+};
+
+// Specialization of deep_copy for two UnorderedMap objects.
+template <typename DKey, typename DT, typename DDevice, typename SKey, + typename ST, typename SDevice, typename Hasher, typename EqualTo> +inline void deep_copy( + UnorderedMap<DKey, DT, DDevice, Hasher, EqualTo> &dst, + const UnorderedMap<SKey, ST, SDevice, Hasher, EqualTo> &src) { + dst.create_copy_view(src); +} + +} // namespace Kokkos + +#endif // KOKKOS_UNORDERED_MAP_HPP diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a1fbba6b21c76b4bb7b2a63a4e3a863241a7cd74 --- /dev/null +++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp @@ -0,0 +1,340 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VECTOR_HPP +#define KOKKOS_VECTOR_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_DualView.hpp> + +/* Drop in replacement for std::vector based on Kokkos::DualView + * Most functions only work on the host (it will not compile if called from + * device kernel) + * + */ +namespace Kokkos { + +template <class Scalar, class Arg1Type = void> +class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> { + public: + using value_type = Scalar; + using pointer = Scalar*; + using const_pointer = const Scalar*; + using reference = Scalar&; + using const_reference = const Scalar&; + using iterator = Scalar*; + using const_iterator = const Scalar*; + using size_type = size_t; + + private: + size_t _size; + float _extra_storage; + using DV = DualView<Scalar*, LayoutLeft, Arg1Type>; + + public: +#ifdef KOKKOS_ENABLE_CUDA_UVM + KOKKOS_INLINE_FUNCTION reference operator()(int i) const { + return DV::h_view(i); + }; + KOKKOS_INLINE_FUNCTION reference operator[](int i) const { + return DV::h_view(i); + }; +#else + inline reference operator()(int i) const { return DV::h_view(i); }; + inline reference operator[](int i) const { return DV::h_view(i); }; +#endif + + /* Member functions which behave like std::vector functions */ + + vector() : DV() { + _size = 0; + _extra_storage 
= 1.1;
+  }
+
+  // Size-and-fill constructor; over-allocates by the 1.1 growth factor and
+  // marks the host side modified before assign() populates the data.
+  vector(int n, Scalar val = Scalar())
+      : DualView<Scalar*, LayoutLeft, Arg1Type>("Vector", size_t(n * (1.1))) {
+    _size          = n;
+    _extra_storage = 1.1;
+    DV::modified_flags(0) = 1;
+
+    assign(n, val);
+  }
+
+  // Grow storage if needed; existing elements are preserved by DV::resize.
+  // NOTE(review): uses `n >= span()` while assign() grows only when
+  // `n > span()` -- confirm the off-by-one difference is intentional.
+  void resize(size_t n) {
+    if (n >= span()) DV::resize(size_t(n * _extra_storage));
+    _size = n;
+  }
+
+  void resize(size_t n, const Scalar& val) { assign(n, val); }
+
+  // Set the vector to n copies of val, running the fill on whichever side
+  // (host or device) currently holds the up-to-date data.
+  void assign(size_t n, const Scalar& val) {
+    /* Resize if necessary (behavior of std::vector) */
+
+    if (n > span()) DV::resize(size_t(n * _extra_storage));
+    _size = n;
+
+    /* Assign value either on host or on device */
+
+    if (DV::template need_sync<typename DV::t_dev::device_type>()) {
+      set_functor_host f(DV::h_view, val);
+      parallel_for("Kokkos::vector::assign", n, f);
+      typename DV::t_host::execution_space().fence();
+      DV::template modify<typename DV::t_host::device_type>();
+    } else {
+      set_functor f(DV::d_view, val);
+      parallel_for("Kokkos::vector::assign", n, f);
+      typename DV::t_dev::execution_space().fence();
+      DV::template modify<typename DV::t_dev::device_type>();
+    }
+  }
+
+  void reserve(size_t n) { DV::resize(size_t(n * _extra_storage)); }
+
+  // Append on the host side; grows by _extra_storage when full (with a +1
+  // fallback so a growth factor that truncates to no-op still makes room).
+  void push_back(Scalar val) {
+    DV::template sync<typename DV::t_host::device_type>();
+    DV::template modify<typename DV::t_host::device_type>();
+    if (_size == span()) {
+      size_t new_size = _size * _extra_storage;
+      if (new_size == _size) new_size++;
+      DV::resize(new_size);
+    }
+
+    DV::h_view(_size) = val;
+    _size++;
+  }
+
+  // Logical removal only -- storage and the last element are not destroyed.
+  void pop_back() { _size--; }
+
+  // Logical clear -- capacity is retained, like std::vector::clear().
+  void clear() { _size = 0; }
+
+  iterator insert(iterator it, const value_type& val) {
+    return insert(it, 1, val);
+  }
+
+  // Insert count copies of val before it; host-side only, like std::vector.
+  iterator insert(iterator it, size_type count, const value_type& val) {
+    if ((size() == 0) && (it == begin())) {
+      resize(count, val);
+      DV::sync_host();
+      return begin();
+    }
+    DV::sync_host();
+    DV::modify_host();
+    if (it < begin() || it > end())
+      Kokkos::abort("Kokkos::vector::insert : invalid insert iterator");
+    if (count == 0) return it;
+    
ptrdiff_t start = std::distance(begin(), it); + auto org_size = size(); + resize(size() + count); + + std::copy_backward(begin() + start, begin() + org_size, + begin() + org_size + count); + std::fill_n(begin() + start, count, val); + + return begin() + start; + } + + private: + template <class T> + struct impl_is_input_iterator + : /* TODO replace this */ std::integral_constant< + bool, !std::is_convertible<T, size_type>::value> {}; + + public: + // TODO: can use detection idiom to generate better error message here later + template <typename InputIterator> + typename std::enable_if<impl_is_input_iterator<InputIterator>::value, + iterator>::type + insert(iterator it, InputIterator b, InputIterator e) { + ptrdiff_t count = std::distance(b, e); + if (count == 0) return it; + + DV::sync_host(); + DV::modify_host(); + if (it < begin() || it > end()) + Kokkos::abort("Kokkos::vector::insert : invalid insert iterator"); + + bool resized = false; + if ((size() == 0) && (it == begin())) { + resize(count); + it = begin(); + resized = true; + } + ptrdiff_t start = std::distance(begin(), it); + auto org_size = size(); + if (!resized) resize(size() + count); + it = begin() + start; + + std::copy_backward(begin() + start, begin() + org_size, + begin() + org_size + count); + std::copy(b, e, it); + + return begin() + start; + } + + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return DV::is_allocated(); + } + + size_type size() const { return _size; } + size_type max_size() const { return 2000000000; } + size_type span() const { return DV::span(); } + bool empty() const { return _size == 0; } + + pointer data() const { return DV::h_view.data(); } + + iterator begin() const { return DV::h_view.data(); } + + iterator end() const { + return _size > 0 ? 
DV::h_view.data() + _size : DV::h_view.data();
+  }
+
+  /// First element of the host view (undefined when empty()).
+  reference front() { return DV::h_view(0); }
+
+  /// Last element of the host view (undefined when empty()).
+  reference back() { return DV::h_view(_size - 1); }
+
+  const_reference front() const { return DV::h_view(0); }
+
+  const_reference back() const { return DV::h_view(_size - 1); }
+
+  /* std::algorithms which work originally with iterators, here they are
+   * implemented as member functions */
+
+  /// \brief Bisection search on the host view over [start, upper].
+  ///
+  /// NOTE(review): the bisection below assumes the data in the searched
+  /// range is sorted ascending -- confirm with callers.
+  size_t lower_bound(const size_t& start, const size_t& theEnd,
+                     const Scalar& comp_val) const {
+    int lower = start;  // FIXME (mfh 24 Apr 2014) narrowing conversion
+    int upper =
+        _size > theEnd
+            ? theEnd
+            : _size - 1;  // FIXME (mfh 24 Apr 2014) narrowing conversion
+    if (upper <= lower) {
+      return theEnd;
+    }
+
+    Scalar lower_val = DV::h_view(lower);
+    Scalar upper_val = DV::h_view(upper);
+    size_t idx       = (upper + lower) / 2;
+    Scalar val       = DV::h_view(idx);
+    if (val > upper_val) return upper;
+    if (val < lower_val) return start;
+
+    while (upper > lower) {
+      if (comp_val > val) {
+        lower = ++idx;
+      } else {
+        upper = idx;
+      }
+      idx = (upper + lower) / 2;
+      val = DV::h_view(idx);
+    }
+    return idx;
+  }
+
+  /// \brief Whether the host view is in non-descending order.
+  ///
+  /// An empty or single-element vector is trivially sorted.  The guard
+  /// `i + 1 < _size` avoids the unsigned wrap-around of `_size - 1` when
+  /// `_size == 0` (size_t underflow), which previously made the loop read
+  /// past the end of the view for an empty vector.
+  bool is_sorted() {
+    for (size_t i = 0; i + 1 < _size; i++) {
+      if (DV::h_view(i) > DV::h_view(i + 1)) return false;
+    }
+    return true;
+  }
+
+  /// \brief Binary search for val on the host view; end() if absent.
+  ///
+  /// NOTE(review): like lower_bound, this bisection assumes sorted data.
+  iterator find(Scalar val) const {
+    if (_size == 0) return end();
+
+    int upper, lower, current;
+    current = _size / 2;
+    upper   = _size - 1;
+    lower   = 0;
+
+    if ((val < DV::h_view(0)) || (val > DV::h_view(_size - 1))) return end();
+
+    while (upper > lower) {
+      if (val > DV::h_view(current))
+        lower = current + 1;
+      else
+        upper = current;
+      current = (upper + lower) / 2;
+    }
+
+    if (val == DV::h_view(current))
+      return &DV::h_view(current);
+    else
+      return end();
+  }
+
+  /* Additional functions for data management */
+
+  void device_to_host() { deep_copy(DV::h_view, DV::d_view); }
+  void host_to_device() const { deep_copy(DV::d_view, DV::h_view); }
+
+  void on_host() { DV::template modify<typename DV::t_host::device_type>(); }
+
+ 
void on_device() { DV::template modify<typename DV::t_dev::device_type>(); } + + void set_overallocation(float extra) { _extra_storage = 1.0 + extra; } + + public: + struct set_functor { + using execution_space = typename DV::t_dev::execution_space; + typename DV::t_dev _data; + Scalar _val; + + set_functor(typename DV::t_dev data, Scalar val) : _data(data), _val(val) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { _data(i) = _val; } + }; + + struct set_functor_host { + using execution_space = typename DV::t_host::execution_space; + typename DV::t_host _data; + Scalar _val; + + set_functor_host(typename DV::t_host data, Scalar val) + : _data(data), _val(val) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { _data(i) = _val; } + }; +}; + +} // namespace Kokkos +#endif diff --git a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6047e60f3dd080b8cfe456627ccc80266e7df66b --- /dev/null +++ b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -0,0 +1,114 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BITSET_IMPL_HPP +#define KOKKOS_BITSET_IMPL_HPP + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_BitOps.hpp> +#include <cstdint> + +#include <cstdio> +#include <climits> +#include <iostream> +#include <iomanip> + +namespace Kokkos { +namespace Impl { + +KOKKOS_FORCEINLINE_FUNCTION +unsigned rotate_left(unsigned i, int r) { + constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT); + return r ? 
((i << r) | (i >> (size - r))) : i; +} + +KOKKOS_FORCEINLINE_FUNCTION +unsigned rotate_right(unsigned i, int r) { + constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT); + // FIXME_SYCL llvm.fshr.i32 missing + // (https://github.com/intel/llvm/issues/3308) +#ifdef __SYCL_DEVICE_ONLY__ + return rotate_left(i, size - r); +#else + return r ? ((i >> r) | (i << (size - r))) : i; +#endif +} + +template <typename Bitset> +struct BitsetCount { + using bitset_type = Bitset; + using execution_space = + typename bitset_type::execution_space::execution_space; + using size_type = typename bitset_type::size_type; + using value_type = size_type; + + bitset_type m_bitset; + + BitsetCount(bitset_type const& bitset) : m_bitset(bitset) {} + + size_type apply() const { + size_type count = 0u; + parallel_reduce("Kokkos::Impl::BitsetCount::apply", + m_bitset.m_blocks.extent(0), *this, count); + return count; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& count) const { count = 0u; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& count, const volatile size_type& incr) const { + count += incr; + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i, value_type& count) const { + count += bit_count(m_bitset.m_blocks[i]); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_BITSET_IMPL_HPP diff --git a/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..367ab338572064f167c3c50f447e4d27efff6999 --- /dev/null +++ b/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp @@ -0,0 +1,189 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_FUNCTIONAL_IMPL_HPP +#define KOKKOS_FUNCTIONAL_IMPL_HPP + +#include <Kokkos_Macros.hpp> +#include <cstdint> + +namespace Kokkos { +namespace Impl { + +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. 
The author hereby disclaims copyright to this source code. +KOKKOS_FORCEINLINE_FUNCTION +uint32_t getblock32(const uint8_t* p, int i) { + // used to avoid aliasing error which could cause errors with + // forced inlining + return ((uint32_t)p[i * 4 + 0]) | ((uint32_t)p[i * 4 + 1] << 8) | + ((uint32_t)p[i * 4 + 2] << 16) | ((uint32_t)p[i * 4 + 3] << 24); +} + +KOKKOS_FORCEINLINE_FUNCTION +uint32_t rotl32(uint32_t x, int8_t r) { return (x << r) | (x >> (32 - r)); } + +KOKKOS_FORCEINLINE_FUNCTION +uint32_t fmix32(uint32_t h) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +KOKKOS_INLINE_FUNCTION +uint32_t MurmurHash3_x86_32(const void* key, int len, uint32_t seed) { + const uint8_t* data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + for (int i = 0; i < nblocks; ++i) { + uint32_t k1 = getblock32(data, i); + + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + + //---------- + // tail + + const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); + + uint32_t k1 = 0; + + switch (len & 3) { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + return h1; +} + +#if defined(__GNUC__) /* GNU C */ || defined(__GNUG__) /* GNU C++ */ || \ + defined(__clang__) + +#define KOKKOS_IMPL_MAY_ALIAS __attribute__((__may_alias__)) + +#else + +#define KOKKOS_IMPL_MAY_ALIAS + +#endif + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION bool bitwise_equal(T const* const a_ptr, + T const* const b_ptr) { + typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64; // NOLINT(modernize-use-using) + typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32; // NOLINT(modernize-use-using) + typedef 
uint16_t KOKKOS_IMPL_MAY_ALIAS T16; // NOLINT(modernize-use-using) + typedef uint8_t KOKKOS_IMPL_MAY_ALIAS T8; // NOLINT(modernize-use-using) + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const* const ptr; + T64 const* const ptr64; + T32 const* const ptr32; + T16 const* const ptr16; + T8 const* const ptr8; + } a = {a_ptr}, b = {b_ptr}; + + bool result = true; + + for (int i = 0; i < NUM_64; ++i) { + result = result && a.ptr64[i] == b.ptr64[i]; + } + + if (NUM_64 * 2 < NUM_32) { + result = result && a.ptr32[NUM_64 * 2] == b.ptr32[NUM_64 * 2]; + } + + if (NUM_32 * 2 < NUM_16) { + result = result && a.ptr16[NUM_32 * 2] == b.ptr16[NUM_32 * 2]; + } + + if (NUM_16 * 2 < NUM_8) { + result = result && a.ptr8[NUM_16 * 2] == b.ptr8[NUM_16 * 2]; + } + + return result; +} + +#undef KOKKOS_IMPL_MAY_ALIAS + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_FUNCTIONAL_IMPL_HPP diff --git a/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f22e5d1eca928bc968d3cf32900f9fa0335751d7 --- /dev/null +++ b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp @@ -0,0 +1,215 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP +#define KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type, + typename SizeType> +inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, + SizeType>::HostMirror +create_mirror_view( + const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>& + view, + typename std::enable_if<ViewTraits<DataType, Arg1Type, Arg2Type, + Arg3Type>::is_hostspace>::type* = 0) { + return view; +} + +template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type, + typename SizeType> +inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, + SizeType>::HostMirror +create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, + SizeType>& view) { + // Force copy: + // using alloc = Impl::ViewAssignment<Impl::ViewDefault>; // unused + using staticcrsgraph_type = + StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>; + + typename staticcrsgraph_type::HostMirror tmp; + typename staticcrsgraph_type::row_map_type::HostMirror tmp_row_map = + create_mirror(view.row_map); + typename staticcrsgraph_type::row_block_type::HostMirror + tmp_row_block_offsets = create_mirror(view.row_block_offsets); + + // Allocation to match: + tmp.row_map = tmp_row_map; // Assignment of 'const' from 'non-const' + tmp.entries = create_mirror(view.entries); + tmp.row_block_offsets = + tmp_row_block_offsets; // Assignment of 'const' from 'non-const' + + // Deep copy: + deep_copy(tmp_row_map, view.row_map); + deep_copy(tmp.entries, view.entries); + deep_copy(tmp_row_block_offsets, view.row_block_offsets); + + return tmp; +} + 
+template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type, + typename SizeType> +inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, + SizeType>::HostMirror +create_mirror_view( + const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>& + view, + typename std::enable_if<!ViewTraits<DataType, Arg1Type, Arg2Type, + Arg3Type>::is_hostspace>::type* = 0) { + return create_mirror(view); +} +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <class StaticCrsGraphType, class InputSizeType> +inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( + const std::string& label, const std::vector<InputSizeType>& input) { + using output_type = StaticCrsGraphType; + // using input_type = std::vector<InputSizeType>; // unused + + using entries_type = typename output_type::entries_type; + + using work_type = View<typename output_type::size_type[], + typename output_type::array_layout, + typename output_type::execution_space, + typename output_type::memory_traits>; + + output_type output; + + // Create the row map: + + const size_t length = input.size(); + + { + work_type row_work("tmp", length + 1); + + typename work_type::HostMirror row_work_host = create_mirror_view(row_work); + + size_t sum = 0; + row_work_host[0] = 0; + for (size_t i = 0; i < length; ++i) { + row_work_host[i + 1] = sum += input[i]; + } + + deep_copy(row_work, row_work_host); + + output.entries = entries_type(label, sum); + output.row_map = row_work; + } + + return output; +} + +//---------------------------------------------------------------------------- + +template <class StaticCrsGraphType, class InputSizeType> +inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( + const std::string& label, + const std::vector<std::vector<InputSizeType> >& input) 
{ + using output_type = StaticCrsGraphType; + using entries_type = typename output_type::entries_type; + + static_assert(entries_type::rank == 1, "Graph entries view must be rank one"); + + using work_type = View<typename output_type::size_type[], + typename output_type::array_layout, + typename output_type::execution_space, + typename output_type::memory_traits>; + + output_type output; + + // Create the row map: + + const size_t length = input.size(); + + { + work_type row_work("tmp", length + 1); + + typename work_type::HostMirror row_work_host = create_mirror_view(row_work); + + size_t sum = 0; + row_work_host[0] = 0; + for (size_t i = 0; i < length; ++i) { + row_work_host[i + 1] = sum += input[i].size(); + } + + deep_copy(row_work, row_work_host); + + output.entries = entries_type(label, sum); + output.row_map = row_work; + } + + // Fill in the entries: + { + typename entries_type::HostMirror host_entries = + create_mirror_view(output.entries); + + size_t sum = 0; + for (size_t i = 0; i < length; ++i) { + for (size_t j = 0; j < input[i].size(); ++j, ++sum) { + host_entries(sum) = input[i][j]; + } + } + + deep_copy(output.entries, host_entries); + } + + return output; +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */ diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e10e256b6a8d1e6e48f8e80b205cd097a3486723 --- /dev/null +++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp @@ -0,0 +1,106 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
// NOTE(review): the BSD license header of Kokkos_UnorderedMap_impl.cpp
// precedes this excerpt.  The original TU included <Kokkos_UnorderedMap.hpp>
// only for the declaration of find_hash_size — confirm before dropping it
// project-wide.

namespace Kokkos {
namespace Impl {

// Returns the smallest tabulated prime >= size, the largest tabulated prime
// (268435399) when size exceeds every entry, and 0 when size == 0.
uint32_t find_hash_size(uint32_t size) {
  if (size == 0u) return 0u;

  // these primes try to preserve randomness of hash
  static const uint32_t primes[] = {
      3,         7,         13,        23,        53,        97,
      193,       389,       769,       1543,      2237,      2423,
      2617,      2797,      2999,      3167,      3359,      3539,
      3727,      3911,      4441,      4787,      5119,      5471,
      5801,      6143,      6521,      6827,      7177,      7517,
      7853,      8887,      9587,      10243,     10937,     11617,
      12289,     12967,     13649,     14341,     15013,     15727,
      17749,     19121,     20479,     21859,     23209,     24593,
      25939,     27329,     28669,     30047,     31469,     35507,
      38231,     40961,     43711,     46439,     49157,     51893,
      54617,     57347,     60077,     62801,     70583,     75619,
      80669,     85703,     90749,     95783,     100823,    105871,
      110909,    115963,    120997,    126031,    141157,    151237,
      161323,    171401,    181499,    191579,    201653,    211741,
      221813,    231893,    241979,    252079,    282311,    302483,
      322649,    342803,    362969,    383143,    403301,    423457,
      443629,    463787,    483953,    504121,    564617,    604949,
      645313,    685609,    725939,    766273,    806609,    846931,
      887261,    927587,    967919,    1008239,   1123477,   1198397,
      1273289,   1348177,   1423067,   1497983,   1572869,   1647761,
      1722667,   1797581,   1872461,   1947359,   2022253,   2246953,
      2396759,   2546543,   2696363,   2846161,   2995973,   3145739,
      3295541,   3445357,   3595117,   3744941,   3894707,   4044503,
      4493921,   4793501,   5093089,   5392679,   5692279,   5991883,
      6291469,   6591059,   6890641,   7190243,   7489829,   7789447,
      8089033,   8987807,   9586981,   10186177,  10785371,  11384539,
      11983729,  12582917,  13182109,  13781291,  14380469,  14979667,
      15578861,  16178053,  17895707,  19014187,  20132683,  21251141,
      22369661,  23488103,  24606583,  25725083,  26843549,  27962027,
      29080529,  30198989,  31317469,  32435981,  35791397,  38028379,
      40265327,  42502283,  44739259,  46976221,  49213237,  51450131,
      53687099,  55924061,  58161041,  60397993,  62634959,  64871921,
      71582857,  76056727,  80530643,  85004567,  89478503,  93952427,
      98426347,  102900263, 107374217, 111848111, 116322053, 120795971,
      125269877, 129743807, 143165587, 152113427, 161061283, 170009141,
      178956983, 187904819, 196852693, 205800547, 214748383, 223696237,
      232644089, 241591943, 250539763, 259487603, 268435399};

  const uint32_t num_primes = sizeof(primes) / sizeof(uint32_t);

  // The table is sorted ascending, so a binary search finds the smallest
  // prime >= size in O(log n) instead of the previous linear scan.
  uint32_t lo = 0;
  uint32_t hi = num_primes;
  while (lo < hi) {
    const uint32_t mid = lo + (hi - lo) / 2;
    if (primes[mid] < size)
      lo = mid + 1;
    else
      hi = mid;
  }

  // Fall back to the largest prime when size exceeds every entry (matches
  // the original behavior).
  return lo < num_primes ? primes[lo] : primes[num_primes - 1];
}

}  // namespace Impl
}  // namespace Kokkos
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_UNORDERED_MAP_IMPL_HPP +#define KOKKOS_UNORDERED_MAP_IMPL_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <cstdint> + +#include <cstdio> +#include <climits> +#include <iostream> +#include <iomanip> + +namespace Kokkos { +namespace Impl { + +uint32_t find_hash_size(uint32_t size); + +template <typename Map> +struct UnorderedMapRehash { + using map_type = Map; + using const_map_type = typename map_type::const_map_type; + using execution_space = typename map_type::execution_space; + using size_type = typename map_type::size_type; + + map_type m_dst; + const_map_type m_src; + + UnorderedMapRehash(map_type const& dst, const_map_type const& src) + : m_dst(dst), m_src(src) {} + + void apply() const { + parallel_for("Kokkos::Impl::UnorderedMapRehash::apply", m_src.capacity(), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const { + if (m_src.valid_at(i)) m_dst.insert(m_src.key_at(i), m_src.value_at(i)); + } +}; + +template <typename UMap> +struct UnorderedMapErase { + using map_type = UMap; + using execution_space = typename 
map_type::execution_space; + using size_type = typename map_type::size_type; + using key_type = typename map_type::key_type; + using value_type = typename map_type::impl_value_type; + + map_type m_map; + + UnorderedMapErase(map_type const& map) : m_map(map) {} + + void apply() const { + parallel_for("Kokkos::Impl::UnorderedMapErase::apply", + m_map.m_hash_lists.extent(0), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const { + const size_type invalid_index = map_type::invalid_index; + + size_type curr = m_map.m_hash_lists(i); + size_type next = invalid_index; + + // remove erased head of the linked-list + while (curr != invalid_index && !m_map.valid_at(curr)) { + next = m_map.m_next_index[curr]; + m_map.m_next_index[curr] = invalid_index; + m_map.m_keys[curr] = key_type(); + if (m_map.is_set) m_map.m_values[curr] = value_type(); + curr = next; + m_map.m_hash_lists(i) = next; + } + + // if the list is non-empty and the head is valid + if (curr != invalid_index && m_map.valid_at(curr)) { + size_type prev = curr; + curr = m_map.m_next_index[prev]; + + while (curr != invalid_index) { + next = m_map.m_next_index[curr]; + if (m_map.valid_at(curr)) { + prev = curr; + } else { + // remove curr from list + m_map.m_next_index[prev] = next; + m_map.m_next_index[curr] = invalid_index; + m_map.m_keys[curr] = key_type(); + if (map_type::is_set) m_map.m_values[curr] = value_type(); + } + curr = next; + } + } + } +}; + +template <typename UMap> +struct UnorderedMapHistogram { + using map_type = UMap; + using execution_space = typename map_type::execution_space; + using size_type = typename map_type::size_type; + + using histogram_view = View<int[100], execution_space>; + using host_histogram_view = typename histogram_view::HostMirror; + + map_type m_map; + histogram_view m_length; + histogram_view m_distance; + histogram_view m_block_distance; + + UnorderedMapHistogram(map_type const& map) + : m_map(map), + m_length("UnorderedMap Histogram"), + 
m_distance("UnorderedMap Histogram"), + m_block_distance("UnorderedMap Histogram") {} + + void calculate() { + parallel_for("Kokkos::Impl::UnorderedMapHistogram::calculate", + m_map.m_hash_lists.extent(0), *this); + } + + void clear() { + Kokkos::deep_copy(m_length, 0); + Kokkos::deep_copy(m_distance, 0); + Kokkos::deep_copy(m_block_distance, 0); + } + + void print_length(std::ostream& out) { + host_histogram_view host_copy = create_mirror_view(m_length); + Kokkos::deep_copy(host_copy, m_length); + + for (int i = 0, size = host_copy.extent(0); i < size; ++i) { + out << host_copy[i] << " , "; + } + out << "\b\b\b " << std::endl; + } + + void print_distance(std::ostream& out) { + host_histogram_view host_copy = create_mirror_view(m_distance); + Kokkos::deep_copy(host_copy, m_distance); + + for (int i = 0, size = host_copy.extent(0); i < size; ++i) { + out << host_copy[i] << " , "; + } + out << "\b\b\b " << std::endl; + } + + void print_block_distance(std::ostream& out) { + host_histogram_view host_copy = create_mirror_view(m_block_distance); + Kokkos::deep_copy(host_copy, m_block_distance); + + for (int i = 0, size = host_copy.extent(0); i < size; ++i) { + out << host_copy[i] << " , "; + } + out << "\b\b\b " << std::endl; + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const { + const size_type invalid_index = map_type::invalid_index; + + uint32_t length = 0; + size_type min_index = ~0u, max_index = 0; + for (size_type curr = m_map.m_hash_lists(i); curr != invalid_index; + curr = m_map.m_next_index[curr]) { + ++length; + min_index = (curr < min_index) ? curr : min_index; + max_index = (max_index < curr) ? curr : max_index; + } + + size_type distance = (0u < length) ? max_index - min_index : 0u; + size_type blocks = (0u < length) ? max_index / 32u - min_index / 32u : 0u; + + // normalize data + length = length < 100u ? length : 99u; + distance = distance < 100u ? distance : 99u; + blocks = blocks < 100u ? 
blocks : 99u; + + if (0u < length) { + atomic_fetch_add(&m_length(length), 1); + atomic_fetch_add(&m_distance(distance), 1); + atomic_fetch_add(&m_block_distance(blocks), 1); + } + } +}; + +template <typename UMap> +struct UnorderedMapPrint { + using map_type = UMap; + using execution_space = typename map_type::execution_space; + using size_type = typename map_type::size_type; + + map_type m_map; + + UnorderedMapPrint(map_type const& map) : m_map(map) {} + + void apply() { + parallel_for("Kokkos::Impl::UnorderedMapPrint::apply", + m_map.m_hash_lists.extent(0), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const { + const size_type invalid_index = map_type::invalid_index; + + uint32_t list = m_map.m_hash_lists(i); + for (size_type curr = list, ii = 0; curr != invalid_index; + curr = m_map.m_next_index[curr], ++ii) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d[%d]: %d->%d\n", list, ii, + m_map.key_at(curr), m_map.value_at(curr)); + } + } +}; + +template <typename DKey, typename DValue, typename SKey, typename SValue> +struct UnorderedMapCanAssign : public std::false_type {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<Key, Value, Key, Value> : public std::true_type {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<const Key, Value, Key, Value> + : public std::true_type {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<const Key, const Value, Key, Value> + : public std::true_type {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<const Key, const Value, const Key, Value> + : public std::true_type {}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_UNORDERED_MAP_IMPL_HPP diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..947d222c273dc4d87823ad3560a1af6c62a1e52b --- /dev/null +++ 
b/packages/kokkos/containers/unit_tests/CMakeLists.txt @@ -0,0 +1,47 @@ + +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) + +foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) + # Because there is always an exception to the rule + if(Tag STREQUAL "Threads") + set(DEVICE "PTHREAD") + else() + string(TOUPPER ${Tag} DEVICE) + endif() + string(TOLOWER ${Tag} dir) + # Add test for that backend if it is enabled + if(Kokkos_ENABLE_${DEVICE}) + set(UnitTestSources UnitTestMain.cpp) + set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) + file(MAKE_DIRECTORY ${dir}) + foreach(Name + Bitset + DualView + DynamicView + DynViewAPI_generic + DynViewAPI_rank12345 + DynViewAPI_rank67 + ErrorReporter + OffsetView + ScatterView + StaticCrsGraph + UnorderedMap + Vector + ViewCtorPropEmbeddedDim + ) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. + set(file ${dir}/Test${Tag}_${Name}.cpp) + file(WRITE ${dir}/dummy.cpp + "#include <Test${Tag}_Category.hpp>\n" + "#include <Test${Name}.hpp>\n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND UnitTestSources ${file}) + endforeach() + KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) + endif() +endforeach() diff --git a/packages/kokkos/containers/unit_tests/Makefile b/packages/kokkos/containers/unit_tests/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..82669fe1ab7532b69556cafbb7131b595f9e5f8e --- /dev/null +++ b/packages/kokkos/containers/unit_tests/Makefile @@ -0,0 +1,182 @@ +KOKKOS_PATH = ../.. 
+ +GTEST_PATH = ../../TPL/gtest + +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/openmp +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/hpx +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/serial +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/threads +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/hip +vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests/cuda +vpath %.cpp ${CURDIR} +default: build_all + echo "End Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) + CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper +else + CXX = g++ +endif + +CXXFLAGS = -O3 +LINK ?= $(CXX) +LDFLAGS ?= +override LDFLAGS += -lpthread + +include $(KOKKOS_PATH)/Makefile.kokkos + +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files + +TEST_TARGETS = +TARGETS = + +TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynViewAPI_rank67 ErrorReporter OffsetView ScatterView StaticCrsGraph UnorderedMap Vector ViewCtorPropEmbeddedDim +tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ + tmp2 := $(foreach test, $(TESTS), \ + $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ + $(shell echo "\#include<Test"$(device)"_Category.hpp>" > Test$(device)_$(test).cpp); \ + $(shell echo "\#include<Test"$(test)".hpp>" >> Test$(device)_$(test).cpp); \ + )\ + ) \ +) + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + OBJ_CUDA = UnitTestMain.o gtest-all.o + OBJ_CUDA += TestCuda_Bitset.o + OBJ_CUDA += TestCuda_DualView.o + OBJ_CUDA += TestCuda_DynamicView.o + OBJ_CUDA += TestCuda_DynViewAPI_generic.o + OBJ_CUDA += TestCuda_DynViewAPI_rank12345.o + OBJ_CUDA += TestCuda_DynViewAPI_rank67.o + OBJ_CUDA += TestCuda_ErrorReporter.o + OBJ_CUDA += TestCuda_OffsetView.o + OBJ_CUDA += TestCuda_ScatterView.o + OBJ_CUDA += TestCuda_StaticCrsGraph.o + OBJ_CUDA += TestCuda_UnorderedMap.o + OBJ_CUDA += TestCuda_Vector.o + OBJ_CUDA += 
TestCuda_ViewCtorPropEmbeddedDim.o + TARGETS += KokkosContainers_UnitTest_Cuda + TEST_TARGETS += test-cuda +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + OBJ_THREADS = UnitTestMain.o gtest-all.o + OBJ_THREADS += TestThreads_Bitset.o + OBJ_THREADS += TestThreads_DualView.o + OBJ_THREADS += TestThreads_DynamicView.o + OBJ_THREADS += TestThreads_DynViewAPI_generic.o + OBJ_THREADS += TestThreads_DynViewAPI_rank12345.o + OBJ_THREADS += TestThreads_DynViewAPI_rank67.o + OBJ_THREADS += TestThreads_ErrorReporter.o + OBJ_THREADS += TestThreads_OffsetView.o + OBJ_THREADS += TestThreads_ScatterView.o + OBJ_THREADS += TestThreads_StaticCrsGraph.o + OBJ_THREADS += TestThreads_UnorderedMap.o + OBJ_THREADS += TestThreads_Vector.o + OBJ_THREADS += TestThreads_ViewCtorPropEmbeddedDim.o + TARGETS += KokkosContainers_UnitTest_Threads + TEST_TARGETS += test-threads +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + OBJ_OPENMP = UnitTestMain.o gtest-all.o + OBJ_OPENMP += TestOpenMP_Bitset.o + OBJ_OPENMP += TestOpenMP_DualView.o + OBJ_OPENMP += TestOpenMP_DynamicView.o + OBJ_OPENMP += TestOpenMP_DynViewAPI_generic.o + OBJ_OPENMP += TestOpenMP_DynViewAPI_rank12345.o + OBJ_OPENMP += TestOpenMP_DynViewAPI_rank67.o + OBJ_OPENMP += TestOpenMP_ErrorReporter.o + OBJ_OPENMP += TestOpenMP_OffsetView.o + OBJ_OPENMP += TestOpenMP_ScatterView.o + OBJ_OPENMP += TestOpenMP_StaticCrsGraph.o + OBJ_OPENMP += TestOpenMP_UnorderedMap.o + OBJ_OPENMP += TestOpenMP_Vector.o + OBJ_OPENMP += TestOpenMP_ViewCtorPropEmbeddedDim.o + TARGETS += KokkosContainers_UnitTest_OpenMP + TEST_TARGETS += test-openmp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + OBJ_HPX = UnitTestMain.o gtest-all.o + OBJ_HPX += TestHPX_Bitset.o + OBJ_HPX += TestHPX_DualView.o + OBJ_HPX += TestHPX_DynamicView.o + OBJ_HPX += TestHPX_DynViewAPI_generic.o + OBJ_HPX += TestHPX_DynViewAPI_rank12345.o + OBJ_HPX += TestHPX_DynViewAPI_rank67.o + OBJ_HPX += TestHPX_ErrorReporter.o + OBJ_HPX += TestHPX_OffsetView.o + OBJ_HPX += 
TestHPX_ScatterView.o + OBJ_HPX += TestHPX_StaticCrsGraph.o + OBJ_HPX += TestHPX_UnorderedMap.o + OBJ_HPX += TestHPX_Vector.o + OBJ_HPX += TestHPX_ViewCtorPropEmbeddedDim.o + TARGETS += KokkosContainers_UnitTest_HPX + TEST_TARGETS += test-hpx +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + OBJ_SERIAL = UnitTestMain.o gtest-all.o + OBJ_SERIAL += TestSerial_Bitset.o + OBJ_SERIAL += TestSerial_DualView.o + OBJ_SERIAL += TestSerial_DynamicView.o + OBJ_SERIAL += TestSerial_DynViewAPI_generic.o + OBJ_SERIAL += TestSerial_DynViewAPI_rank12345.o + OBJ_SERIAL += TestSerial_DynViewAPI_rank67.o + OBJ_SERIAL += TestSerial_ErrorReporter.o + OBJ_SERIAL += TestSerial_OffsetView.o + OBJ_SERIAL += TestSerial_ScatterView.o + OBJ_SERIAL += TestSerial_StaticCrsGraph.o + OBJ_SERIAL += TestSerial_UnorderedMap.o + OBJ_SERIAL += TestSerial_Vector.o + OBJ_SERIAL += TestSerial_ViewCtorPropEmbeddedDim.o + TARGETS += KokkosContainers_UnitTest_Serial + TEST_TARGETS += test-serial +endif + +KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Cuda + +KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Threads + +KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_OpenMP + +KokkosContainers_UnitTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_HPX + +KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Serial + +test-cuda: 
KokkosContainers_UnitTest_Cuda + ./KokkosContainers_UnitTest_Cuda + +test-threads: KokkosContainers_UnitTest_Threads + ./KokkosContainers_UnitTest_Threads + +test-openmp: KokkosContainers_UnitTest_OpenMP + ./KokkosContainers_UnitTest_OpenMP + +test-hpx: KokkosContainers_UnitTest_HPX + ./KokkosContainers_UnitTest_HPX + +test-serial: KokkosContainers_UnitTest_Serial + ./KokkosContainers_UnitTest_Serial + +build_all: $(TARGETS) + +test: $(TEST_TARGETS) + +clean: kokkos-clean + rm -f *.o $(TARGETS) *.cpp + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + +gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc diff --git a/packages/kokkos/containers/unit_tests/TestBitset.hpp b/packages/kokkos/containers/unit_tests/TestBitset.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6810ae101aff68f7137c28689cf4d98f13194a4f --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestBitset.hpp @@ -0,0 +1,274 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_BITSET_HPP +#define KOKKOS_TEST_BITSET_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <Kokkos_Core.hpp> +#include <Kokkos_Bitset.hpp> +#include <array> + +namespace Test { + +namespace Impl { + +template <typename Bitset, bool Set> +struct TestBitset { + using bitset_type = Bitset; + using execution_space = typename bitset_type::execution_space; + using value_type = uint32_t; + + bitset_type m_bitset; + + TestBitset(bitset_type const& bitset) : m_bitset(bitset) {} + + unsigned testit(unsigned collisions) { + execution_space().fence(); + + unsigned count = 0; + Kokkos::parallel_reduce(m_bitset.size() * collisions, *this, count); + return count; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& v) const { v = 0; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + dst += src; + } 
+ + KOKKOS_INLINE_FUNCTION + void operator()(uint32_t i, value_type& v) const { + i = i % m_bitset.size(); + if (Set) { + if (m_bitset.set(i)) { + if (m_bitset.test(i)) ++v; + } + } else { + if (m_bitset.reset(i)) { + if (!m_bitset.test(i)) ++v; + } + } + } +}; + +template <typename Bitset> +struct TestBitsetTest { + using bitset_type = Bitset; + using execution_space = typename bitset_type::execution_space; + using value_type = uint32_t; + + bitset_type m_bitset; + + TestBitsetTest(bitset_type const& bitset) : m_bitset(bitset) {} + + unsigned testit() { + execution_space().fence(); + + unsigned count = 0; + Kokkos::parallel_reduce(m_bitset.size(), *this, count); + return count; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& v) const { v = 0; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + dst += src; + } + + KOKKOS_INLINE_FUNCTION + void operator()(uint32_t i, value_type& v) const { + if (m_bitset.test(i)) ++v; + } +}; + +template <typename Bitset, bool Set> +struct TestBitsetAny { + using bitset_type = Bitset; + using execution_space = typename bitset_type::execution_space; + using value_type = uint32_t; + + bitset_type m_bitset; + + TestBitsetAny(bitset_type const& bitset) : m_bitset(bitset) {} + + unsigned testit() { + execution_space().fence(); + + unsigned count = 0; + Kokkos::parallel_reduce(m_bitset.size(), *this, count); + return count; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& v) const { v = 0; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + dst += src; + } + + KOKKOS_INLINE_FUNCTION + void operator()(uint32_t i, value_type& v) const { + bool result = false; + unsigned attempts = 0; + uint32_t hint = (i >> 4) << 4; + while (attempts < m_bitset.max_hint()) { + if (Set) { + Kokkos::tie(result, hint) = m_bitset.find_any_unset_near(hint, i); + if (result && m_bitset.set(hint)) { + ++v; + break; + } else if (!result) { 
+ ++attempts; + } + } else { + Kokkos::tie(result, hint) = m_bitset.find_any_set_near(hint, i); + if (result && m_bitset.reset(hint)) { + ++v; + break; + } else if (!result) { + ++attempts; + } + } + } + } +}; +} // namespace Impl + +template <typename Device> +void test_bitset() { + using bitset_type = Kokkos::Bitset<Device>; + using const_bitset_type = Kokkos::ConstBitset<Device>; + + { + unsigned ts = 100u; + bitset_type b1; + ASSERT_TRUE(b1.is_allocated()); + + b1 = bitset_type(ts); + bitset_type b2(b1); + bitset_type b3(ts); + + ASSERT_TRUE(b1.is_allocated()); + ASSERT_TRUE(b2.is_allocated()); + ASSERT_TRUE(b3.is_allocated()); + } + + std::array<unsigned, 7> test_sizes = { + {0u, 10u, 100u, 1000u, 1u << 14, 1u << 16, 10000001}}; + + for (const auto test_size : test_sizes) { + // std::cout << "Bitset " << test_sizes[i] << std::endl; + + bitset_type bitset(test_size); + + // std::cout << " Check initial count " << std::endl; + // nothing should be set + { + Impl::TestBitsetTest<bitset_type> f(bitset); + uint32_t count = f.testit(); + EXPECT_EQ(0u, count); + EXPECT_EQ(count, bitset.count()); + } + + // std::cout << " Check set() " << std::endl; + bitset.set(); + // everything should be set + { + Impl::TestBitsetTest<const_bitset_type> f(bitset); + uint32_t count = f.testit(); + EXPECT_EQ(bitset.size(), count); + EXPECT_EQ(count, bitset.count()); + } + + // std::cout << " Check reset() " << std::endl; + bitset.reset(); + EXPECT_EQ(0u, bitset.count()); + + // std::cout << " Check set(i) " << std::endl; + // test setting bits + { + Impl::TestBitset<bitset_type, true> f(bitset); + uint32_t count = f.testit(10u); + EXPECT_EQ(bitset.size(), bitset.count()); + EXPECT_EQ(bitset.size(), count); + } + + // std::cout << " Check reset(i) " << std::endl; + // test resetting bits + { + Impl::TestBitset<bitset_type, false> f(bitset); + uint32_t count = f.testit(10u); + EXPECT_EQ(bitset.size(), count); + EXPECT_EQ(0u, bitset.count()); + } + + // std::cout << " Check 
find_any_set(i) " << std::endl; + // test setting any bits + { + Impl::TestBitsetAny<bitset_type, true> f(bitset); + uint32_t count = f.testit(); + EXPECT_EQ(bitset.size(), bitset.count()); + EXPECT_EQ(bitset.size(), count); + } + + // std::cout << " Check find_any_unset(i) " << std::endl; + // test resetting any bits + { + Impl::TestBitsetAny<bitset_type, false> f(bitset); + uint32_t count = f.testit(); + EXPECT_EQ(bitset.size(), count); + EXPECT_EQ(0u, bitset.count()); + } + } +} + +TEST(TEST_CATEGORY, bitset) { test_bitset<TEST_EXECSPACE>(); } +} // namespace Test + +#endif // KOKKOS_TEST_BITSET_HPP diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3eee85ed10bd81bc8b511afa9f0fbde7ba244b8f --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp @@ -0,0 +1,569 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_DUALVIEW_HPP +#define KOKKOS_TEST_DUALVIEW_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <cstdlib> +#include <cstdio> +#include <impl/Kokkos_Timer.hpp> +#include <Kokkos_DualView.hpp> + +namespace Test { + +namespace Impl { +template <typename Scalar, class Device> +struct test_dualview_alloc { + using scalar_type = Scalar; + using execution_space = Device; + + template <typename ViewType> + bool run_me(unsigned int n, unsigned int m) { + if (n < 10) n = 10; + if (m < 3) m = 3; + + { + ViewType b1; + if (b1.is_allocated() == true) return false; + + b1 = ViewType("B1", n, m); + ViewType b2(b1); + ViewType b3("B3", n, m); + + if (b1.is_allocated() == false) return false; + if (b2.is_allocated() == false) return false; + if (b3.is_allocated() == false) return false; + } + return true; + } + + bool result = false; + + test_dualview_alloc(unsigned int size) { + result = run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >( + size, 3); + } +}; + +template <typename Scalar, class Device> +struct 
test_dualview_combinations { + using self_type = test_dualview_combinations<Scalar, Device>; + + using scalar_type = Scalar; + using execution_space = Device; + + Scalar reference; + Scalar result; + + template <typename ViewType> + Scalar run_me(unsigned int n, unsigned int m, bool with_init) { + if (n < 10) n = 10; + if (m < 3) m = 3; + + ViewType a; + + if (with_init) { + a = ViewType("A", n, m); + } else { + a = ViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), n, m); + } + Kokkos::deep_copy(a.d_view, 1); + + a.template modify<typename ViewType::execution_space>(); + a.template sync<typename ViewType::host_mirror_space>(); + a.template sync<typename ViewType::host_mirror_space>( + Kokkos::DefaultExecutionSpace{}); + + a.h_view(5, 1) = 3; + a.h_view(6, 1) = 4; + a.h_view(7, 2) = 5; + a.template modify<typename ViewType::host_mirror_space>(); + ViewType b = Kokkos::subview(a, std::pair<unsigned int, unsigned int>(6, 9), + std::pair<unsigned int, unsigned int>(0, 1)); + a.template sync<typename ViewType::execution_space>(); + a.template sync<typename ViewType::execution_space>( + Kokkos::DefaultExecutionSpace{}); + b.template modify<typename ViewType::execution_space>(); + + Kokkos::deep_copy(b.d_view, 2); + + a.template sync<typename ViewType::host_mirror_space>(); + a.template sync<typename ViewType::host_mirror_space>( + Kokkos::DefaultExecutionSpace{}); + Scalar count = 0; + for (unsigned int i = 0; i < a.d_view.extent(0); i++) + for (unsigned int j = 0; j < a.d_view.extent(1); j++) + count += a.h_view(i, j); + return count - a.d_view.extent(0) * a.d_view.extent(1) - 2 - 4 - 3 * 2; + } + + test_dualview_combinations(unsigned int size, bool with_init) { + result = run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >( + size, 3, with_init); + } +}; + +template <typename Scalar, class ViewType> +struct SumViewEntriesFunctor { + using value_type = Scalar; + + ViewType fv; + + SumViewEntriesFunctor(const ViewType& fv_) : fv(fv_) {} + + 
KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type& total) const { + for (size_t j = 0; j < fv.extent(1); ++j) { + total += fv(i, j); + } + } +}; + +template <typename Scalar, class Device> +struct test_dual_view_deep_copy { + using scalar_type = Scalar; + using execution_space = Device; + + template <typename ViewType> + void run_me(int n, const int m, const bool use_templ_sync) { + ViewType a, b; + if (n >= 0) { + a = ViewType("A", n, m); + b = ViewType("B", n, m); + } else { + n = 0; + } + const scalar_type sum_total = scalar_type(n * m); + + Kokkos::deep_copy(a.d_view, 1); + + if (use_templ_sync) { + a.template modify<typename ViewType::execution_space>(); + a.template sync<typename ViewType::host_mirror_space>(); + } else { + a.modify_device(); + a.sync_host(); + a.sync_host(Kokkos::DefaultExecutionSpace{}); + } + + // Check device view is initialized as expected + scalar_type a_d_sum = 0; + // Execute on the execution_space associated with t_dev's memory space + using t_dev_exec_space = + typename ViewType::t_dev::memory_space::execution_space; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<t_dev_exec_space>(0, n), + SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view), + a_d_sum); + ASSERT_EQ(a_d_sum, sum_total); + + // Check host view is synced as expected + scalar_type a_h_sum = 0; + for (size_t i = 0; i < a.h_view.extent(0); ++i) + for (size_t j = 0; j < a.h_view.extent(1); ++j) { + a_h_sum += a.h_view(i, j); + } + + ASSERT_EQ(a_h_sum, sum_total); + + // Test deep_copy + Kokkos::deep_copy(b, a); + if (use_templ_sync) { + b.template sync<typename ViewType::host_mirror_space>(); + } else { + b.sync_host(); + b.sync_host(Kokkos::DefaultExecutionSpace{}); + } + + // Perform same checks on b as done on a + // Check device view is initialized as expected + scalar_type b_d_sum = 0; + // Execute on the execution_space associated with t_dev's memory space + Kokkos::parallel_reduce( + Kokkos::RangePolicy<t_dev_exec_space>(0, n), + 
SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(b.d_view), + b_d_sum); + ASSERT_EQ(b_d_sum, sum_total); + + // Check host view is synced as expected + scalar_type b_h_sum = 0; + for (size_t i = 0; i < b.h_view.extent(0); ++i) + for (size_t j = 0; j < b.h_view.extent(1); ++j) { + b_h_sum += b.h_view(i, j); + } + + ASSERT_EQ(b_h_sum, sum_total); + + } // end run_me + + test_dual_view_deep_copy() { + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(10, 5, + true); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(10, 5, + false); + // Test zero length but allocated (a.d_view.data!=nullptr but + // a.d_view.span()==0) + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(0, 5, true); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(0, 5, + false); + + // Test default constructed view + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(-1, 5, + true); + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(-1, 5, + false); + } +}; + +template <typename Scalar, class Device> +struct test_dualview_resize { + using scalar_type = Scalar; + using execution_space = Device; + + template <typename ViewType> + void run_me() { + const unsigned int n = 10; + const unsigned int m = 5; + const unsigned int factor = 2; + + ViewType a("A", n, m); + Kokkos::deep_copy(a.d_view, 1); + + /* Covers case "Resize on Device" */ + a.modify_device(); + Kokkos::resize(a, factor * n, factor * m); + ASSERT_EQ(a.extent(0), n * factor); + ASSERT_EQ(a.extent(1), m * factor); + + Kokkos::deep_copy(a.d_view, 1); + a.sync_host(); + + // Check device view is initialized as expected + scalar_type a_d_sum = 0; + // Execute on the execution_space associated with t_dev's memory space + using t_dev_exec_space = + typename ViewType::t_dev::memory_space::execution_space; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)), + SumViewEntriesFunctor<scalar_type, typename 
ViewType::t_dev>(a.d_view), + a_d_sum); + + // Check host view is synced as expected + scalar_type a_h_sum = 0; + for (size_t i = 0; i < a.h_view.extent(0); ++i) + for (size_t j = 0; j < a.h_view.extent(1); ++j) { + a_h_sum += a.h_view(i, j); + } + + // Check + ASSERT_EQ(a_h_sum, a_d_sum); + ASSERT_EQ(a_h_sum, a.extent(0) * a.extent(1)); + + /* Covers case "Resize on Host" */ + a.modify_host(); + + Kokkos::resize(a, n / factor, m / factor); + ASSERT_EQ(a.extent(0), n / factor); + ASSERT_EQ(a.extent(1), m / factor); + + a.sync_device(); + a.sync_device(Kokkos::DefaultExecutionSpace{}); + + // Check device view is initialized as expected + a_d_sum = 0; + // Execute on the execution_space associated with t_dev's memory space + using t_dev_exec_space = + typename ViewType::t_dev::memory_space::execution_space; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)), + SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view), + a_d_sum); + + // Check host view is synced as expected + a_h_sum = 0; + for (size_t i = 0; i < a.h_view.extent(0); ++i) + for (size_t j = 0; j < a.h_view.extent(1); ++j) { + a_h_sum += a.h_view(i, j); + } + + // Check + ASSERT_EQ(a_h_sum, a.extent(0) * a.extent(1)); + ASSERT_EQ(a_h_sum, a_d_sum); + + } // end run_me + + test_dualview_resize() { + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(); + } +}; + +template <typename Scalar, class Device> +struct test_dualview_realloc { + using scalar_type = Scalar; + using execution_space = Device; + + template <typename ViewType> + void run_me() { + const unsigned int n = 10; + const unsigned int m = 5; + + ViewType a("A", n, m); + Kokkos::realloc(a, n, m); + + Kokkos::deep_copy(a.d_view, 1); + a.modify_device(); + a.sync_host(); + + // Check device view is initialized as expected + scalar_type a_d_sum = 0; + // Execute on the execution_space associated with t_dev's memory space + using t_dev_exec_space = + typename 
ViewType::t_dev::memory_space::execution_space; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)), + SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view), + a_d_sum); + + // Check host view is synced as expected + scalar_type a_h_sum = 0; + for (size_t i = 0; i < a.h_view.extent(0); ++i) + for (size_t j = 0; j < a.h_view.extent(1); ++j) { + a_h_sum += a.h_view(i, j); + } + + // Check + ASSERT_EQ(a_h_sum, a.extent(0) * a.extent(1)); + ASSERT_EQ(a_h_sum, a_d_sum); + } // end run_me + + test_dualview_realloc() { + run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(); + } +}; + +} // namespace Impl + +template <typename Scalar, typename Device> +void test_dualview_combinations(unsigned int size, bool with_init) { + Impl::test_dualview_combinations<Scalar, Device> test(size, with_init); + ASSERT_EQ(test.result, 0); +} + +template <typename Scalar, typename Device> +void test_dualview_alloc(unsigned int size) { + Impl::test_dualview_alloc<Scalar, Device> test(size); + ASSERT_TRUE(test.result); +} + +template <typename Scalar, typename Device> +void test_dualview_deep_copy() { + Impl::test_dual_view_deep_copy<Scalar, Device>(); +} + +template <typename Scalar, typename Device> +void test_dualview_realloc() { + Impl::test_dualview_realloc<Scalar, Device>(); +} + +template <typename Scalar, typename Device> +void test_dualview_resize() { + Impl::test_dualview_resize<Scalar, Device>(); +} + +TEST(TEST_CATEGORY, dualview_combination) { + test_dualview_combinations<int, TEST_EXECSPACE>(10, true); +} + +TEST(TEST_CATEGORY, dualview_alloc) { + test_dualview_alloc<int, TEST_EXECSPACE>(10); +} + +TEST(TEST_CATEGORY, dualview_combinations_without_init) { + test_dualview_combinations<int, TEST_EXECSPACE>(10, false); +} + +TEST(TEST_CATEGORY, dualview_deep_copy) { + test_dualview_deep_copy<int, TEST_EXECSPACE>(); + test_dualview_deep_copy<double, TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, dualview_realloc) { + 
test_dualview_realloc<int, TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, dualview_resize) { + test_dualview_resize<int, TEST_EXECSPACE>(); +} + +namespace { +/** + * + * The following tests are a response to + * https://github.com/kokkos/kokkos/issues/3850 + * and + * https://github.com/kokkos/kokkos/pull/3857 + * + * DualViews were returning incorrect view types and taking + * inappropriate actions based on the templated view methods. + * + * Specifically, template view methods were always returning + * a device view if the memory space was UVM and a Kokkos::Device was passed. + * Sync/modify methods completely broke down So these tests exist to make sure + * that we keep the semantics of UVM DualViews intact. + */ +// modify if we have other UVM enabled backends +#ifdef KOKKOS_ENABLE_CUDA // OR other UVM builds +#define UVM_ENABLED_BUILD +#endif + +#ifdef UVM_ENABLED_BUILD +template <typename ExecSpace> +struct UVMSpaceFor; +#endif + +#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA +template <> +struct UVMSpaceFor<Kokkos::Cuda> { + using type = Kokkos::CudaUVMSpace; +}; +#endif + +#ifdef UVM_ENABLED_BUILD +template <> +struct UVMSpaceFor<Kokkos::DefaultHostExecutionSpace> { + using type = typename UVMSpaceFor<Kokkos::DefaultExecutionSpace>::type; +}; +#else +template <typename ExecSpace> +struct UVMSpaceFor { + using type = typename ExecSpace::memory_space; +}; +#endif + +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemSpace = typename UVMSpaceFor<Kokkos::DefaultExecutionSpace>::type; +using DeviceType = Kokkos::Device<ExecSpace, MemSpace>; + +using DualViewType = Kokkos::DualView<double*, Kokkos::LayoutLeft, DeviceType>; +using d_device = DeviceType; +using h_device = Kokkos::Device< + Kokkos::DefaultHostExecutionSpace, + typename UVMSpaceFor<Kokkos::DefaultHostExecutionSpace>::type>; + +TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + auto v_d = dv.template view<d_device>(); + 
using vdt = decltype(v_d); + using vdt_d = vdt::device_type; + using vdt_d_e = vdt_d::execution_space; + ASSERT_STREQ(vdt_d_e::name(), Kokkos::DefaultExecutionSpace::name()); +} +TEST(TEST_CATEGORY, dualview_host_correct_kokkos_device) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + auto v_h = dv.template view<h_device>(); + using vht = decltype(v_h); + using vht_d = vht::device_type; + using vht_d_e = vht_d::execution_space; + ASSERT_STREQ(vht_d_e::name(), Kokkos::DefaultHostExecutionSpace::name()); +} + +TEST(TEST_CATEGORY, dualview_host_modify_template_device_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_host(); + dv.template sync<d_device>(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, dualview_host_modify_template_device_execspace_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_host(); + dv.template sync<d_device::execution_space>(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, dualview_device_modify_template_host_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_device(); + dv.template sync<h_device>(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} +TEST(TEST_CATEGORY, dualview_device_modify_template_host_execspace_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_device(); + dv.template sync<h_device::execution_space>(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_executionspace_views) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + using hvt = decltype(dv.view<typename Kokkos::DefaultHostExecutionSpace>()); + using dvt = decltype(dv.view<typename 
Kokkos::DefaultExecutionSpace>()); + ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), + dvt::device_type::execution_space::name()); + ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), + hvt::device_type::execution_space::name()); +} + +} // anonymous namespace +} // namespace Test + +#endif // KOKKOS_TEST_DUALVIEW_HPP diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp new file mode 100644 index 0000000000000000000000000000000000000000..dd0199ed81c75dcee42b964ac0bb1c246175ed01 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -0,0 +1,1698 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> +#include <Kokkos_DynRankView.hpp> + +/*--------------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template <class T, class... P> +size_t allocation_count(const Kokkos::DynRankView<T, P...>& view) { + const size_t card = view.size(); + const size_t alloc = view.span(); + + return card <= alloc ? 
alloc : 0; +} + +/*--------------------------------------------------------------------------*/ + +template <typename T, class DeviceType> +struct TestViewOperator { + using execution_space = DeviceType; + + static const unsigned N = 100; + static const unsigned D = 3; + + using view_type = Kokkos::DynRankView<T, execution_space>; + + const view_type v1; + const view_type v2; + + TestViewOperator() : v1("v1", N, D), v2("v2", N, D) {} + + static void testit() { Kokkos::parallel_for(N, TestViewOperator()); } + + KOKKOS_INLINE_FUNCTION + void operator()(const unsigned i) const { + const unsigned X = 0; + const unsigned Y = 1; + const unsigned Z = 2; + + v2(i, X) = v1(i, X); + v2(i, Y) = v1(i, Y); + v2(i, Z) = v1(i, Z); + } +}; + +/*--------------------------------------------------------------------------*/ + +template <class DataType, class DeviceType, unsigned Rank> +struct TestViewOperator_LeftAndRight; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> { + using execution_space = DeviceType; + using memory_space = typename execution_space::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + const volatile value_type& input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } + + using left_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>; + + using right_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>; + + left_view left; + right_view right; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, + unsigned N3, unsigned N4, unsigned N5, + unsigned N6) + : left("left", N0, N1, N2, N3, N4, N5, N6), + right("right", N0, N1, N2, N3, N4, N5, N6), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} 
+ + static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3, + unsigned N4, unsigned N5, unsigned N6) { + TestViewOperator_LeftAndRight driver(N0, N1, N2, N3, N4, N5, N6); + + int error_flag = 0; + + Kokkos::parallel_reduce(1, driver, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type& update) const { + long offset; + + offset = -1; + for (unsigned i6 = 0; i6 < unsigned(left.extent(6)); ++i6) + for (unsigned i5 = 0; i5 < unsigned(left.extent(5)); ++i5) + for (unsigned i4 = 0; i4 < unsigned(left.extent(4)); ++i4) + for (unsigned i3 = 0; i3 < unsigned(left.extent(3)); ++i3) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1, i2, i3, i4, i5, i6) - + &left(0, 0, 0, 0, 0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + } + + offset = -1; + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) + for (unsigned i3 = 0; i3 < unsigned(right.extent(3)); ++i3) + for (unsigned i4 = 0; i4 < unsigned(right.extent(4)); ++i4) + for (unsigned i5 = 0; i5 < unsigned(right.extent(5)); ++i5) + for (unsigned i6 = 0; i6 < unsigned(right.extent(6)); ++i6) { + const long j = &right(i0, i1, i2, i3, i4, i5, i6) - + &right(0, 0, 0, 0, 0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> { + using execution_space = DeviceType; + using memory_space = typename execution_space::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& 
update, + const volatile value_type& input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } + + using left_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>; + + using right_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>; + + left_view left; + right_view right; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, + unsigned N3, unsigned N4, unsigned N5) + : left("left", N0, N1, N2, N3, N4, N5), + right("right", N0, N1, N2, N3, N4, N5), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3, + unsigned N4, unsigned N5) { + TestViewOperator_LeftAndRight driver(N0, N1, N2, N3, N4, N5); + + int error_flag = 0; + + Kokkos::parallel_reduce(1, driver, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type& update) const { + long offset; + + offset = -1; + for (unsigned i5 = 0; i5 < unsigned(left.extent(5)); ++i5) + for (unsigned i4 = 0; i4 < unsigned(left.extent(4)); ++i4) + for (unsigned i3 = 0; i3 < unsigned(left.extent(3)); ++i3) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = + &left(i0, i1, i2, i3, i4, i5) - &left(0, 0, 0, 0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + } + + offset = -1; + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) + for (unsigned i3 = 0; i3 < unsigned(right.extent(3)); ++i3) + for (unsigned i4 = 0; i4 < unsigned(right.extent(4)); ++i4) + for (unsigned i5 = 0; i5 < unsigned(right.extent(5)); 
++i5) { + const long j = + &right(i0, i1, i2, i3, i4, i5) - &right(0, 0, 0, 0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> { + using execution_space = DeviceType; + using memory_space = typename execution_space::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + const volatile value_type& input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } + + using left_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>; + + using right_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>; + + using stride_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>; + + left_view left; + right_view right; + stride_view left_stride; + stride_view right_stride; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, + unsigned N3, unsigned N4) + : left("left", N0, N1, N2, N3, N4), + right("right", N0, N1, N2, N3, N4), + left_stride(left), + right_stride(right), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3, + unsigned N4) { + TestViewOperator_LeftAndRight driver(N0, N1, N2, N3, N4); + + int error_flag = 0; + + Kokkos::parallel_reduce(1, driver, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type& update) const { + long offset; + + offset = -1; + for (unsigned i4 = 0; i4 < unsigned(left.extent(4)); ++i4) + for (unsigned i3 = 0; i3 < unsigned(left.extent(3)); ++i3) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 
< unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1, i2, i3, i4) - &left(0, 0, 0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + + if (&left(i0, i1, i2, i3, i4) != + &left_stride(i0, i1, i2, i3, i4)) { + update |= 4; + } + } + + offset = -1; + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) + for (unsigned i3 = 0; i3 < unsigned(right.extent(3)); ++i3) + for (unsigned i4 = 0; i4 < unsigned(right.extent(4)); ++i4) { + const long j = &right(i0, i1, i2, i3, i4) - &right(0, 0, 0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + + if (&right(i0, i1, i2, i3, i4) != + &right_stride(i0, i1, i2, i3, i4)) { + update |= 8; + } + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> { + using execution_space = DeviceType; + using memory_space = typename execution_space::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + const volatile value_type& input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } + + using left_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>; + + using right_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>; + + left_view left; + right_view right; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, + unsigned N3) + : left("left", N0, N1, N2, N3), + right("right", N0, N1, N2, N3), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + static void testit(unsigned N0, unsigned N1, unsigned N2, 
unsigned N3) { + TestViewOperator_LeftAndRight driver(N0, N1, N2, N3); + + int error_flag = 0; + + Kokkos::parallel_reduce(1, driver, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type& update) const { + long offset; + + offset = -1; + for (unsigned i3 = 0; i3 < unsigned(left.extent(3)); ++i3) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1, i2, i3) - &left(0, 0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + } + + offset = -1; + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) + for (unsigned i3 = 0; i3 < unsigned(right.extent(3)); ++i3) { + const long j = &right(i0, i1, i2, i3) - &right(0, 0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> { + using execution_space = DeviceType; + using memory_space = typename execution_space::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + const volatile value_type& input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } + + using left_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>; + + using right_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>; + + using stride_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>; + + left_view left; + right_view right; + stride_view left_stride; + stride_view right_stride; + 
long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2) + : left(std::string("left"), N0, N1, N2), + right(std::string("right"), N0, N1, N2), + left_stride(left), + right_stride(right), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + static void testit(unsigned N0, unsigned N1, unsigned N2) { + TestViewOperator_LeftAndRight driver(N0, N1, N2); + + int error_flag = 0; + + Kokkos::parallel_reduce(1, driver, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type& update) const { + long offset; + + offset = -1; + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1, i2) - &left(0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + + if (&left(i0, i1, i2) != &left_stride(i0, i1, i2)) { + update |= 4; + } + } + + offset = -1; + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) { + const long j = &right(i0, i1, i2) - &right(0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + + if (&right(i0, i1, i2) != &right_stride(i0, i1, i2)) { + update |= 8; + } + } + + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) { + if (&left(i0, i1, i2) != &left(i0, i1, i2, 0, 0, 0, 0)) { + update |= 3; + } + if (&right(i0, i1, i2) != &right(i0, i1, i2, 0, 0, 0, 0)) { + update |= 3; + } + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> { + using execution_space = DeviceType; + using 
memory_space = typename execution_space::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + const volatile value_type& input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } + + using left_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>; + + using right_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>; + + left_view left; + right_view right; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight(unsigned N0, unsigned N1) + : left("left", N0, N1), + right("right", N0, N1), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + static void testit(unsigned N0, unsigned N1) { + TestViewOperator_LeftAndRight driver(N0, N1); + + int error_flag = 0; + + Kokkos::parallel_reduce(1, driver, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type& update) const { + long offset; + + offset = -1; + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1) - &left(0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + } + + offset = -1; + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) { + const long j = &right(i0, i1) - &right(0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + } + + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) { + if (&left(i0, i1) != &left(i0, i1, 0, 0, 0, 0, 0)) { + update |= 3; + } + if (&right(i0, i1) != &right(i0, i1, 0, 0, 0, 0, 0)) { + update |= 3; + } + } + } +}; + +template <class DataType, class DeviceType> 
+struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> { + using execution_space = DeviceType; + using memory_space = typename execution_space::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + const volatile value_type& input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } + + using left_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>; + + using right_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>; + + using stride_view = + Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>; + + left_view left; + right_view right; + stride_view left_stride; + stride_view right_stride; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight(unsigned N0) + : left("left", N0), + right("right", N0), + left_stride(left), + right_stride(right), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + static void testit(unsigned N0) { + TestViewOperator_LeftAndRight driver(N0); + + int error_flag = 0; + + Kokkos::parallel_reduce(1, driver, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type& update) const { + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + if (&left(i0) != &left(i0, 0, 0, 0, 0, 0, 0)) { + update |= 3; + } + if (&right(i0) != &right(i0, 0, 0, 0, 0, 0, 0)) { + update |= 3; + } + if (&left(i0) != &left_stride(i0)) { + update |= 4; + } + if (&right(i0) != &right_stride(i0)) { + update |= 8; + } + } + } +}; + +/*--------------------------------------------------------------------------*/ + +template <typename T, class DeviceType> +class TestDynViewAPI { + public: + using device = DeviceType; + + enum { N0 = 1000, N1 = 3, N2 = 5, N3 = 7 }; + + using dView0 = Kokkos::DynRankView<T, 
device>; + using const_dView0 = Kokkos::DynRankView<const T, device>; + + using dView0_unmanaged = + Kokkos::DynRankView<T, device, Kokkos::MemoryUnmanaged>; + using host_drv_space = typename dView0::host_mirror_space; + + using View0 = Kokkos::View<T, device>; + using View1 = Kokkos::View<T*, device>; + using View7 = Kokkos::View<T*******, device>; + + using host_view_space = typename View0::host_mirror_space; + + static void run_tests() { + run_test_resize_realloc(); + run_test_mirror(); + run_test_mirror_and_copy(); + run_test_scalar(); + run_test(); + run_test_allocated(); + run_test_const(); + run_test_subview(); + run_test_subview_strided(); + run_test_vector(); + } + + static void run_operator_test_rank12345() { + TestViewOperator<T, device>::testit(); + TestViewOperator_LeftAndRight<int, device, 5>::testit(2, 3, 4, 2, 3); + TestViewOperator_LeftAndRight<int, device, 4>::testit(2, 3, 4, 2); + TestViewOperator_LeftAndRight<int, device, 3>::testit(2, 3, 4); + TestViewOperator_LeftAndRight<int, device, 2>::testit(2, 3); + TestViewOperator_LeftAndRight<int, device, 1>::testit(2); + } + + static void run_operator_test_rank67() { + TestViewOperator_LeftAndRight<int, device, 7>::testit(2, 3, 4, 2, 3, 4, 2); + TestViewOperator_LeftAndRight<int, device, 6>::testit(2, 3, 4, 2, 3, 4); + } + + static void run_test_resize_realloc() { + dView0 drv0("drv0", 10, 20, 30); + ASSERT_EQ(drv0.rank(), 3); + + Kokkos::resize(drv0, 5, 10); + ASSERT_EQ(drv0.rank(), 2); + ASSERT_EQ(drv0.extent(0), 5); + ASSERT_EQ(drv0.extent(1), 10); + ASSERT_EQ(drv0.extent(2), 1); + + Kokkos::realloc(drv0, 10, 20); + ASSERT_EQ(drv0.rank(), 2); + ASSERT_EQ(drv0.extent(0), 10); + ASSERT_EQ(drv0.extent(1), 20); + ASSERT_EQ(drv0.extent(2), 1); + } + + static void run_test_mirror() { + using view_type = Kokkos::DynRankView<int, host_drv_space>; + using mirror_type = typename view_type::HostMirror; + view_type a("a"); + mirror_type am = Kokkos::create_mirror_view(a); + mirror_type ax = 
Kokkos::create_mirror(a); + ASSERT_EQ(&a(), &am()); + ASSERT_EQ(a.rank(), am.rank()); + ASSERT_EQ(ax.rank(), am.rank()); + + { + Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h( + "A", 1000); + auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(), a_h); + auto a_d = Kokkos::create_mirror(typename device::memory_space(), a_h); + + int equal_ptr_h_h2 = (a_h.data() == a_h2.data()) ? 1 : 0; + int equal_ptr_h_d = (a_h.data() == a_d.data()) ? 1 : 0; + int equal_ptr_h2_d = (a_h2.data() == a_d.data()) ? 1 : 0; + + ASSERT_EQ(equal_ptr_h_h2, 0); + ASSERT_EQ(equal_ptr_h_d, 0); + ASSERT_EQ(equal_ptr_h2_d, 0); + + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + + ASSERT_EQ(a_h.rank(), a_h2.rank()); + ASSERT_EQ(a_h.rank(), a_d.rank()); + } + { + Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h( + "A", 1000); + auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(), a_h); + auto a_d = Kokkos::create_mirror(typename device::memory_space(), a_h); + + int equal_ptr_h_h2 = (a_h.data() == a_h2.data()) ? 1 : 0; + int equal_ptr_h_d = (a_h.data() == a_d.data()) ? 1 : 0; + int equal_ptr_h2_d = (a_h2.data() == a_d.data()) ? 1 : 0; + + ASSERT_EQ(equal_ptr_h_h2, 0); + ASSERT_EQ(equal_ptr_h_d, 0); + ASSERT_EQ(equal_ptr_h2_d, 0); + + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + + ASSERT_EQ(a_h.rank(), a_h2.rank()); + ASSERT_EQ(a_h.rank(), a_d.rank()); + } + + { + Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h( + "A", 1000); + auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), a_h); + auto a_d = + Kokkos::create_mirror_view(typename device::memory_space(), a_h); + + int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0; + int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0; + int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; + + int is_same_memspace = + std::is_same<Kokkos::HostSpace, typename device::memory_space>::value + ? 
1 + : 0; + ASSERT_EQ(equal_ptr_h_h2, 1); + ASSERT_EQ(equal_ptr_h_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); + + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + + ASSERT_EQ(a_h.rank(), a_h2.rank()); + ASSERT_EQ(a_h.rank(), a_d.rank()); + } + { + Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h( + "A", 1000); + auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), a_h); + auto a_d = + Kokkos::create_mirror_view(typename device::memory_space(), a_h); + + int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0; + int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0; + int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; + + int is_same_memspace = + std::is_same<Kokkos::HostSpace, typename device::memory_space>::value + ? 1 + : 0; + ASSERT_EQ(equal_ptr_h_h2, 1); + ASSERT_EQ(equal_ptr_h_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); + + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + + ASSERT_EQ(a_h.rank(), a_h2.rank()); + ASSERT_EQ(a_h.rank(), a_d.rank()); + } + { + using view_stride_type = + Kokkos::DynRankView<int, Kokkos::LayoutStride, Kokkos::HostSpace>; + unsigned order[] = {6, 5, 4, 3, 2, 1, 0}, + dimen[] = {N0, N1, N2, 2, 2, 2, 2}; // LayoutRight equivalent + view_stride_type a_h( + "a", Kokkos::LayoutStride::order_dimensions(7, order, dimen)); + auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), a_h); + auto a_d = + Kokkos::create_mirror_view(typename device::memory_space(), a_h); + + int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0; + int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0; + int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; + + int is_same_memspace = + std::is_same<Kokkos::HostSpace, typename device::memory_space>::value + ? 
1 + : 0; + ASSERT_EQ(equal_ptr_h_h2, 1); + ASSERT_EQ(equal_ptr_h_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); + + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + + ASSERT_EQ(a_h.rank(), a_h2.rank()); + ASSERT_EQ(a_h.rank(), a_d.rank()); + } + } + + static void run_test_mirror_and_copy() { + // LayoutLeft + { + Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_org( + "A", 10); + a_org(5) = 42.0; + Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h = + a_org; + auto a_h2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_h); + auto a_d = Kokkos::create_mirror_view_and_copy(DeviceType(), a_h); + auto a_h3 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_d); + + int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0; + int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0; + int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; + int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; + + int is_same_memspace = + std::is_same<Kokkos::HostSpace, + typename DeviceType::memory_space>::value + ? 
1 + : 0; + ASSERT_EQ(equal_ptr_h_h2, 1); + ASSERT_EQ(equal_ptr_h_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h3_d, is_same_memspace); + + ASSERT_EQ(a_h.extent(0), a_h3.extent(0)); + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + ASSERT_EQ(a_h.extent(0), a_h3.extent(0)); + ASSERT_EQ(a_h.rank(), a_org.rank()); + ASSERT_EQ(a_h.rank(), a_h2.rank()); + ASSERT_EQ(a_h.rank(), a_h3.rank()); + ASSERT_EQ(a_h.rank(), a_d.rank()); + ASSERT_EQ(a_org(5), a_h3(5)); + } + // LayoutRight + { + Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_org( + "A", 10); + a_org(5) = 42.0; + Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h = + a_org; + auto a_h2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_h); + auto a_d = Kokkos::create_mirror_view_and_copy(DeviceType(), a_h); + auto a_h3 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_d); + + int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0; + int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0; + int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; + int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; + + int is_same_memspace = + std::is_same<Kokkos::HostSpace, + typename DeviceType::memory_space>::value + ? 
1 + : 0; + ASSERT_EQ(equal_ptr_h_h2, 1); + ASSERT_EQ(equal_ptr_h_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h3_d, is_same_memspace); + + ASSERT_EQ(a_h.extent(0), a_h3.extent(0)); + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + ASSERT_EQ(a_h.rank(), a_org.rank()); + ASSERT_EQ(a_h.rank(), a_h2.rank()); + ASSERT_EQ(a_h.rank(), a_h3.rank()); + ASSERT_EQ(a_h.rank(), a_d.rank()); + ASSERT_EQ(a_org(5), a_h3(5)); + } + } + + static void run_test_scalar() { + using hView0 = typename dView0::HostMirror; // HostMirror of DynRankView is + // a DynRankView + + dView0 dx, dy; + hView0 hx, hy; + + dx = dView0("dx"); + dy = dView0("dy"); + + hx = Kokkos::create_mirror(dx); + hy = Kokkos::create_mirror(dy); + + hx() = 1; + + Kokkos::deep_copy(dx, hx); + Kokkos::deep_copy(dy, dx); + Kokkos::deep_copy(hy, dy); + + ASSERT_EQ(hx(), hy()); + ASSERT_EQ(dx.rank(), hx.rank()); + ASSERT_EQ(dy.rank(), hy.rank()); + + // View - DynRankView Interoperability tests + // deep_copy DynRankView to View + View0 vx("vx"); + Kokkos::deep_copy(vx, dx); + ASSERT_EQ(rank(dx), rank(vx)); + + View0 vy("vy"); + Kokkos::deep_copy(vy, dy); + ASSERT_EQ(rank(dy), rank(vy)); + + // deep_copy View to DynRankView + dView0 dxx("dxx"); + Kokkos::deep_copy(dxx, vx); + ASSERT_EQ(rank(dxx), rank(vx)); + + View7 vcast = dx.ConstDownCast(); + ASSERT_EQ(dx.extent(0), vcast.extent(0)); + ASSERT_EQ(dx.extent(1), vcast.extent(1)); + ASSERT_EQ(dx.extent(2), vcast.extent(2)); + ASSERT_EQ(dx.extent(3), vcast.extent(3)); + ASSERT_EQ(dx.extent(4), vcast.extent(4)); + + View7 vcast1(dy.ConstDownCast()); + ASSERT_EQ(dy.extent(0), vcast1.extent(0)); + ASSERT_EQ(dy.extent(1), vcast1.extent(1)); + ASSERT_EQ(dy.extent(2), vcast1.extent(2)); + ASSERT_EQ(dy.extent(3), vcast1.extent(3)); + ASSERT_EQ(dy.extent(4), vcast1.extent(4)); + + // View - DynRankView Interoperability tests + // copy View to DynRankView + dView0 dfromvx(vx); + auto hmx = 
Kokkos::create_mirror_view(dfromvx); + Kokkos::deep_copy(hmx, dfromvx); + auto hvx = Kokkos::create_mirror_view(vx); + Kokkos::deep_copy(hvx, vx); + ASSERT_EQ(rank(hvx), rank(hmx)); + ASSERT_EQ(hvx.extent(0), hmx.extent(0)); + ASSERT_EQ(hvx.extent(1), hmx.extent(1)); + + // copy-assign View to DynRankView + dView0 dfromvy = vy; + auto hmy = Kokkos::create_mirror_view(dfromvy); + Kokkos::deep_copy(hmy, dfromvy); + auto hvy = Kokkos::create_mirror_view(vy); + Kokkos::deep_copy(hvy, vy); + ASSERT_EQ(rank(hvy), rank(hmy)); + ASSERT_EQ(hvy.extent(0), hmy.extent(0)); + ASSERT_EQ(hvy.extent(1), hmy.extent(1)); + + View7 vtest1("vtest1", 2, 2, 2, 2, 2, 2, 2); + dView0 dfromv1(vtest1); + ASSERT_EQ(dfromv1.rank(), vtest1.Rank); + ASSERT_EQ(dfromv1.extent(0), vtest1.extent(0)); + ASSERT_EQ(dfromv1.extent(1), vtest1.extent(1)); + ASSERT_EQ(dfromv1.use_count(), vtest1.use_count()); + + dView0 dfromv2(vcast); + ASSERT_EQ(dfromv2.rank(), vcast.Rank); + ASSERT_EQ(dfromv2.extent(0), vcast.extent(0)); + ASSERT_EQ(dfromv2.extent(1), vcast.extent(1)); + ASSERT_EQ(dfromv2.use_count(), vcast.use_count()); + + dView0 dfromv3 = vcast1; + ASSERT_EQ(dfromv3.rank(), vcast1.Rank); + ASSERT_EQ(dfromv3.extent(0), vcast1.extent(0)); + ASSERT_EQ(dfromv3.extent(1), vcast1.extent(1)); + ASSERT_EQ(dfromv3.use_count(), vcast1.use_count()); + } + + static void run_test() { + // mfh 14 Feb 2014: This test doesn't actually create instances of + // these types. In order to avoid "unused type alias" + // warnings, we declare empty instances of these types, with the + // usual "(void)" marker to avoid compiler warnings for unused + // variables. 
+ + using hView0 = typename dView0::HostMirror; + + { + hView0 thing; + (void)thing; + } + + dView0 d_uninitialized( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20); + ASSERT_TRUE(d_uninitialized.data() != nullptr); + ASSERT_EQ(d_uninitialized.rank(), 2); + ASSERT_EQ(d_uninitialized.extent(0), 10); + ASSERT_EQ(d_uninitialized.extent(1), 20); + ASSERT_EQ(d_uninitialized.extent(2), 1); + + dView0 dx, dy, dz; + hView0 hx, hy, hz; + + ASSERT_TRUE(Kokkos::is_dyn_rank_view<dView0>::value); + ASSERT_FALSE(Kokkos::is_dyn_rank_view<Kokkos::View<double> >::value); + + ASSERT_TRUE(dx.data() == nullptr); // Okay with UVM + ASSERT_TRUE(dy.data() == nullptr); // Okay with UVM + ASSERT_TRUE(dz.data() == nullptr); // Okay with UVM + ASSERT_TRUE(hx.data() == nullptr); + ASSERT_TRUE(hy.data() == nullptr); + ASSERT_TRUE(hz.data() == nullptr); + ASSERT_EQ(dx.extent(0), 0u); // Okay with UVM + ASSERT_EQ(dy.extent(0), 0u); // Okay with UVM + ASSERT_EQ(dz.extent(0), 0u); // Okay with UVM + ASSERT_EQ(hx.extent(0), 0u); + ASSERT_EQ(hy.extent(0), 0u); + ASSERT_EQ(hz.extent(0), 0u); + ASSERT_EQ(dx.rank(), 0u); // Okay with UVM + ASSERT_EQ(hx.rank(), 0u); + + dx = dView0("dx", N1, N2, N3); + dy = dView0("dy", N1, N2, N3); + + hx = hView0("hx", N1, N2, N3); + hy = hView0("hy", N1, N2, N3); + + ASSERT_EQ(dx.extent(0), unsigned(N1)); // Okay with UVM + ASSERT_EQ(dy.extent(0), unsigned(N1)); // Okay with UVM + ASSERT_EQ(hx.extent(0), unsigned(N1)); + ASSERT_EQ(hy.extent(0), unsigned(N1)); + ASSERT_EQ(dx.rank(), 3); // Okay with UVM + ASSERT_EQ(hx.rank(), 3); + + dx = dView0("dx", N0, N1, N2, N3); + dy = dView0("dy", N0, N1, N2, N3); + hx = hView0("hx", N0, N1, N2, N3); + hy = hView0("hy", N0, N1, N2, N3); + + ASSERT_EQ(dx.extent(0), unsigned(N0)); + ASSERT_EQ(dy.extent(0), unsigned(N0)); + ASSERT_EQ(hx.extent(0), unsigned(N0)); + ASSERT_EQ(hy.extent(0), unsigned(N0)); + ASSERT_EQ(dx.rank(), 4); + ASSERT_EQ(dy.rank(), 4); + ASSERT_EQ(hx.rank(), 4); + ASSERT_EQ(hy.rank(), 4); + 
+ ASSERT_EQ(dx.use_count(), size_t(1)); + + dView0_unmanaged unmanaged_dx = dx; + ASSERT_EQ(dx.use_count(), size_t(1)); + + dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged( + dx.data(), dx.extent(0), dx.extent(1), dx.extent(2), dx.extent(3)); + + { + // Destruction of this view should be harmless + const_dView0 unmanaged_from_ptr_const_dx( + dx.data(), dx.extent(0), dx.extent(1), dx.extent(2), dx.extent(3)); + } + + const_dView0 const_dx = dx; + ASSERT_EQ(dx.use_count(), size_t(2)); + + { + const_dView0 const_dx2; + const_dx2 = const_dx; + ASSERT_EQ(dx.use_count(), size_t(3)); + + const_dx2 = dy; + ASSERT_EQ(dx.use_count(), size_t(2)); + + const_dView0 const_dx3(dx); + ASSERT_EQ(dx.use_count(), size_t(3)); + + dView0_unmanaged dx4_unmanaged(dx); + ASSERT_EQ(dx.use_count(), size_t(3)); + } + + ASSERT_EQ(dx.use_count(), size_t(2)); + + ASSERT_FALSE(dx.data() == nullptr); + ASSERT_FALSE(const_dx.data() == nullptr); + ASSERT_FALSE(unmanaged_dx.data() == nullptr); + ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr); + ASSERT_FALSE(dy.data() == nullptr); + ASSERT_NE(dx, dy); + + ASSERT_EQ(dx.extent(0), unsigned(N0)); + ASSERT_EQ(dx.extent(1), unsigned(N1)); + ASSERT_EQ(dx.extent(2), unsigned(N2)); + ASSERT_EQ(dx.extent(3), unsigned(N3)); + + ASSERT_EQ(dy.extent(0), unsigned(N0)); + ASSERT_EQ(dy.extent(1), unsigned(N1)); + ASSERT_EQ(dy.extent(2), unsigned(N2)); + ASSERT_EQ(dy.extent(3), unsigned(N3)); + + ASSERT_EQ(unmanaged_from_ptr_dx.span(), + unsigned(N0) * unsigned(N1) * unsigned(N2) * unsigned(N3)); + + hx = Kokkos::create_mirror(dx); + hy = Kokkos::create_mirror(dy); + + ASSERT_EQ(hx.rank(), dx.rank()); + ASSERT_EQ(hy.rank(), dy.rank()); + + ASSERT_EQ(hx.extent(0), unsigned(N0)); + ASSERT_EQ(hx.extent(1), unsigned(N1)); + ASSERT_EQ(hx.extent(2), unsigned(N2)); + ASSERT_EQ(hx.extent(3), unsigned(N3)); + + ASSERT_EQ(hy.extent(0), unsigned(N0)); + ASSERT_EQ(hy.extent(1), unsigned(N1)); + ASSERT_EQ(hy.extent(2), unsigned(N2)); + ASSERT_EQ(hy.extent(3), 
unsigned(N3)); + + // T v1 = hx() ; // Generates compile error as intended + // T v2 = hx(0,0) ; // Generates compile error as intended + // hx(0,0) = v2 ; // Generates compile error as intended + +#if 0 /* Asynchronous deep copies not implemented for dynamic rank view */ + // Testing with asynchronous deep copy with respect to device + { + size_t count = 0 ; + for ( size_t ip = 0 ; ip < N0 ; ++ip ) { + for ( size_t i1 = 0 ; i1 < hx.extent(1) ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < hx.extent(2) ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < hx.extent(3) ; ++i3 ) { + hx(ip,i1,i2,i3) = ++count ; + }}}} + + + Kokkos::deep_copy(typename hView0::execution_space(), dx , hx ); + Kokkos::deep_copy(typename hView0::execution_space(), dy , dx ); + Kokkos::deep_copy(typename hView0::execution_space(), hy , dy ); + + for ( size_t ip = 0 ; ip < N0 ; ++ip ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); } + }}}} + + Kokkos::deep_copy(typename hView0::execution_space(), dx , T(0) ); + Kokkos::deep_copy(typename hView0::execution_space(), hx , dx ); + + for ( size_t ip = 0 ; ip < N0 ; ++ip ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); } + }}}} + } + + // Testing with asynchronous deep copy with respect to host + { + size_t count = 0 ; + for ( size_t ip = 0 ; ip < N0 ; ++ip ) { + for ( size_t i1 = 0 ; i1 < hx.extent(1) ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < hx.extent(2) ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < hx.extent(3) ; ++i3 ) { + hx(ip,i1,i2,i3) = ++count ; + }}}} + + Kokkos::deep_copy(typename dView0::execution_space(), dx , hx ); + Kokkos::deep_copy(typename dView0::execution_space(), dy , dx ); + Kokkos::deep_copy(typename dView0::execution_space(), hy , dy ); + + for ( size_t ip = 0 ; ip < N0 ; ++ip ) { + for ( size_t 
i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); } + }}}} + + Kokkos::deep_copy(typename dView0::execution_space(), dx , T(0) ); + Kokkos::deep_copy(typename dView0::execution_space(), hx , dx ); + + for ( size_t ip = 0 ; ip < N0 ; ++ip ) { + for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) { + for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) { + for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) { + { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); } + }}}} + } +#endif + + // Testing with synchronous deep copy + { + size_t count = 0; + for (size_t ip = 0; ip < N0; ++ip) { + for (size_t i1 = 0; i1 < hx.extent(1); ++i1) { + for (size_t i2 = 0; i2 < hx.extent(2); ++i2) { + for (size_t i3 = 0; i3 < hx.extent(3); ++i3) { + hx(ip, i1, i2, i3) = ++count; + } + } + } + } + + Kokkos::deep_copy(dx, hx); + Kokkos::deep_copy(dy, dx); + Kokkos::deep_copy(hy, dy); + Kokkos::fence(); + + for (size_t ip = 0; ip < N0; ++ip) { + for (size_t i1 = 0; i1 < N1; ++i1) { + for (size_t i2 = 0; i2 < N2; ++i2) { + for (size_t i3 = 0; i3 < N3; ++i3) { + { + ASSERT_EQ(hx(ip, i1, i2, i3), hy(ip, i1, i2, i3)); + } + } + } + } + } + + Kokkos::deep_copy(dx, T(0)); + Kokkos::deep_copy(hx, dx); + Kokkos::fence(); + + for (size_t ip = 0; ip < N0; ++ip) { + for (size_t i1 = 0; i1 < N1; ++i1) { + for (size_t i2 = 0; i2 < N2; ++i2) { + for (size_t i3 = 0; i3 < N3; ++i3) { + { + ASSERT_EQ(hx(ip, i1, i2, i3), T(0)); + } + } + } + } + } + // ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves + // properly - if implemented + } + + dz = dx; + ASSERT_EQ(dx, dz); + ASSERT_NE(dy, dz); + dz = dy; + ASSERT_EQ(dy, dz); + ASSERT_NE(dx, dz); + + dx = dView0(); + ASSERT_TRUE(dx.data() == nullptr); + ASSERT_FALSE(dy.data() == nullptr); + ASSERT_FALSE(dz.data() == nullptr); + dy = dView0(); + ASSERT_TRUE(dx.data() == nullptr); + ASSERT_TRUE(dy.data() == nullptr); + ASSERT_FALSE(dz.data() == nullptr); + dz = dView0(); + 
ASSERT_TRUE(dx.data() == nullptr); + ASSERT_TRUE(dy.data() == nullptr); + ASSERT_TRUE(dz.data() == nullptr); + + // View - DynRankView Interoperability tests + // deep_copy from view to dynrankview + const int testdim = 4; + dView0 dxx("dxx", testdim); + View1 vxx("vxx", testdim); + auto hvxx = Kokkos::create_mirror_view(vxx); + for (int i = 0; i < testdim; ++i) { + hvxx(i) = i; + } + Kokkos::deep_copy(vxx, hvxx); + Kokkos::deep_copy(dxx, vxx); + auto hdxx = Kokkos::create_mirror_view(dxx); + Kokkos::deep_copy(hdxx, dxx); + for (int i = 0; i < testdim; ++i) { + ASSERT_EQ(hvxx(i), hdxx(i)); + } + + ASSERT_EQ(rank(hdxx), rank(hvxx)); + ASSERT_EQ(hdxx.extent(0), testdim); + ASSERT_EQ(hdxx.extent(0), hvxx.extent(0)); + + // deep_copy from dynrankview to view + View1 vdxx("vdxx", testdim); + auto hvdxx = Kokkos::create_mirror_view(vdxx); + Kokkos::deep_copy(hvdxx, hdxx); + ASSERT_EQ(rank(hdxx), rank(hvdxx)); + ASSERT_EQ(hvdxx.extent(0), testdim); + ASSERT_EQ(hdxx.extent(0), hvdxx.extent(0)); + for (int i = 0; i < testdim; ++i) { + ASSERT_EQ(hvxx(i), hvdxx(i)); + } + } + + using DataType = T; + + static void check_auto_conversion_to_const( + const Kokkos::DynRankView<const DataType, device>& arg_const, + const Kokkos::DynRankView<DataType, device>& arg) { + ASSERT_TRUE(arg_const == arg); + } + + static void run_test_allocated() { + using device_type = Kokkos::DynRankView<DataType, device>; + + const int N1 = 100; + const int N2 = 10; + + device_type d1; + ASSERT_FALSE(d1.is_allocated()); + + d1 = device_type("d1", N1, N2); + device_type d2(d1); + device_type d3("d3", N1); + ASSERT_TRUE(d1.is_allocated()); + ASSERT_TRUE(d2.is_allocated()); + ASSERT_TRUE(d3.is_allocated()); + } + + static void run_test_const() { + using typeX = Kokkos::DynRankView<DataType, device>; + using const_typeX = Kokkos::DynRankView<const DataType, device>; + using const_typeR = + Kokkos::DynRankView<const DataType, device, Kokkos::MemoryRandomAccess>; + typeX x("X", 2); + const_typeX xc = x; + 
const_typeR xr = x; + + ASSERT_TRUE(xc == x); + ASSERT_TRUE(x == xc); + + // For CUDA the constant random access View does not return + // an lvalue reference due to retrieving through texture cache + // therefore not allowed to query the underlying pointer. +#if defined(KOKKOS_ENABLE_CUDA) + if (!std::is_same<typename device::execution_space, Kokkos::Cuda>::value) +#endif + { + ASSERT_TRUE(x.data() == xr.data()); + } + + // typeX xf = xc ; // setting non-const from const must not compile + + check_auto_conversion_to_const(x, x); + } + + static void run_test_subview() { + using cdView = Kokkos::DynRankView<const T, device>; + using dView = Kokkos::DynRankView<T, device>; + // LayoutStride required for all returned DynRankView subdynrankview's + using sdView = Kokkos::DynRankView<T, Kokkos::LayoutStride, device>; + + dView0 d0("d0"); + cdView s0 = d0; + + // N0 = 1000,N1 = 3,N2 = 5,N3 = 7 + unsigned order[] = {6, 5, 4, 3, 2, 1, 0}, + dimen[] = {N0, N1, N2, 2, 2, 2, 2}; // LayoutRight equivalent + sdView d7("d7", Kokkos::LayoutStride::order_dimensions(7, order, dimen)); + ASSERT_EQ(d7.rank(), 7); + + sdView ds0 = Kokkos::subdynrankview(d7, 1, 1, 1, 1, 1, 1, 1); + ASSERT_EQ(ds0.rank(), 0); + + // Basic test - ALL + sdView dsALL = Kokkos::subdynrankview( + d7, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + ASSERT_EQ(dsALL.rank(), 7); + + // Send a value to final rank returning rank 6 subview + sdView dsm1 = + Kokkos::subdynrankview(d7, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), 1); + ASSERT_EQ(dsm1.rank(), 6); + + // Send a std::pair as argument to a rank + sdView dssp = Kokkos::subdynrankview( + d7, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), std::pair<unsigned, unsigned>(1, 2)); + ASSERT_EQ(dssp.rank(), 7); + + // Send a kokkos::pair as argument to a rank; take default layout as input + dView0 dd0("dd0", N0, 
N1, N2, 2, 2, 2, 2); // default layout + ASSERT_EQ(dd0.rank(), 7); + sdView dtkp = Kokkos::subdynrankview( + dd0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::pair<unsigned, unsigned>(0, 1)); + ASSERT_EQ(dtkp.rank(), 7); + + // Return rank 7 subview, taking a pair as one argument, layout stride input + sdView ds7 = Kokkos::subdynrankview( + d7, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::pair<unsigned, unsigned>(0, 1)); + ASSERT_EQ(ds7.rank(), 7); + + // Default Layout DynRankView + dView dv6("dv6", N0, N1, N2, N3, 2, 2); + ASSERT_EQ(dv6.rank(), 6); + + // DynRankView with LayoutRight + using drView = Kokkos::DynRankView<T, Kokkos::LayoutRight, device>; + drView dr5("dr5", N0, N1, N2, 2, 2); + ASSERT_EQ(dr5.rank(), 5); + + // LayoutStride but arranged as LayoutRight + // NOTE: unused arg_layout dimensions must be set toKOKKOS_INVALID_INDEX so + // that + // rank deduction can properly take place + unsigned order5[] = {4, 3, 2, 1, 0}, dimen5[] = {N0, N1, N2, 2, 2}; + Kokkos::LayoutStride ls = + Kokkos::LayoutStride::order_dimensions(5, order5, dimen5); + ls.dimension[5] = KOKKOS_INVALID_INDEX; + ls.dimension[6] = KOKKOS_INVALID_INDEX; + ls.dimension[7] = KOKKOS_INVALID_INDEX; + sdView d5("d5", ls); + ASSERT_EQ(d5.rank(), 5); + + // LayoutStride arranged as LayoutRight - commented out as example that + // fails unit test + // unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 }; + // sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, + // dimen5) ); + // + // Fails the following unit test: + // ASSERT_EQ( d5.rank() , dr5.rank() ); + // + // Explanation: In construction of the Kokkos::LayoutStride below, since + // the + // remaining dimensions are not specified, they will default to values of + // 0 rather thanKOKKOS_INVALID_INDEX. 
+ // When passed to the DynRankView constructor the default dimensions (of 0) + // will be counted toward the dynamic rank and returning an incorrect + // value (i.e. rank 7 rather than 5). + + // Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they + // should) + ASSERT_EQ(d5.extent(0), dr5.extent(0)); + ASSERT_EQ(d5.extent(1), dr5.extent(1)); + ASSERT_EQ(d5.extent(2), dr5.extent(2)); + ASSERT_EQ(d5.extent(3), dr5.extent(3)); + ASSERT_EQ(d5.extent(4), dr5.extent(4)); + ASSERT_EQ(d5.extent(5), dr5.extent(5)); + ASSERT_EQ(d5.rank(), dr5.rank()); + + // Rank 5 subview of rank 5 dynamic rank view, layout stride input + sdView ds5 = Kokkos::subdynrankview(d5, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), + Kokkos::pair<unsigned, unsigned>(0, 1)); + ASSERT_EQ(ds5.rank(), 5); + + // Pass in extra ALL arguments beyond the rank of the DynRank View. + // This behavior is allowed - ignore the extra ALL arguments when + // the src.rank() < number of arguments, but be careful! + sdView ds5plus = Kokkos::subdynrankview( + d5, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::pair<unsigned, unsigned>(0, 1), Kokkos::ALL()); + + ASSERT_EQ(ds5.rank(), ds5plus.rank()); + ASSERT_EQ(ds5.extent(0), ds5plus.extent(0)); + ASSERT_EQ(ds5.extent(4), ds5plus.extent(4)); + ASSERT_EQ(ds5.extent(5), ds5plus.extent(5)); + +#if (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_UVM)) && \ + !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) + ASSERT_EQ(&ds5(1, 1, 1, 1, 0) - &ds5plus(1, 1, 1, 1, 0), 0); + ASSERT_EQ(&ds5(1, 1, 1, 1, 0, 0) - &ds5plus(1, 1, 1, 1, 0, 0), + 0); // passing argument to rank beyond the view's rank is allowed + // iff it is a 0. 
+#endif + + // Similar test to rank 5 above, but create rank 4 subview + // Check that the rank contracts (ds4 and ds4plus) and that subdynrankview + // can accept extra args (ds4plus) + sdView ds4 = Kokkos::subdynrankview(d5, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), 0); + sdView ds4plus = + Kokkos::subdynrankview(d5, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), 0, Kokkos::ALL()); + + ASSERT_EQ(ds4.rank(), ds4plus.rank()); + ASSERT_EQ(ds4.rank(), 4); + ASSERT_EQ(ds4.extent(0), ds4plus.extent(0)); + ASSERT_EQ(ds4.extent(4), ds4plus.extent(4)); + ASSERT_EQ(ds4.extent(5), ds4plus.extent(5)); + } + + static void run_test_subview_strided() { + using drview_left = + Kokkos::DynRankView<int, Kokkos::LayoutLeft, host_drv_space>; + using drview_right = + Kokkos::DynRankView<int, Kokkos::LayoutRight, host_drv_space>; + using drview_stride = + Kokkos::DynRankView<int, Kokkos::LayoutStride, host_drv_space>; + + drview_left xl2("xl2", 100, 200); + drview_right xr2("xr2", 100, 200); + drview_stride yl1 = Kokkos::subdynrankview(xl2, 0, Kokkos::ALL()); + drview_stride yl2 = Kokkos::subdynrankview(xl2, 1, Kokkos::ALL()); + drview_stride ys1 = Kokkos::subdynrankview(xr2, 0, Kokkos::ALL()); + drview_stride ys2 = Kokkos::subdynrankview(xr2, 1, Kokkos::ALL()); + drview_stride yr1 = Kokkos::subdynrankview(xr2, 0, Kokkos::ALL()); + drview_stride yr2 = Kokkos::subdynrankview(xr2, 1, Kokkos::ALL()); + + ASSERT_EQ(yl1.extent(0), xl2.extent(1)); + ASSERT_EQ(yl2.extent(0), xl2.extent(1)); + + ASSERT_EQ(yr1.extent(0), xr2.extent(1)); + ASSERT_EQ(yr2.extent(0), xr2.extent(1)); + + ASSERT_EQ(&yl1(0) - &xl2(0, 0), 0); + ASSERT_EQ(&yl2(0) - &xl2(1, 0), 0); + ASSERT_EQ(&yr1(0) - &xr2(0, 0), 0); + ASSERT_EQ(&yr2(0) - &xr2(1, 0), 0); + + drview_left xl4("xl4", 10, 20, 30, 40); + drview_right xr4("xr4", 10, 20, 30, 40); + + // Replace subdynrankview with subview - test + drview_stride yl4 = + Kokkos::subview(xl4, 1, Kokkos::ALL(), 2, Kokkos::ALL()); + 
drview_stride yr4 = + Kokkos::subview(xr4, 1, Kokkos::ALL(), 2, Kokkos::ALL()); + + ASSERT_EQ(yl4.extent(0), xl4.extent(1)); + ASSERT_EQ(yl4.extent(1), xl4.extent(3)); + ASSERT_EQ(yr4.extent(0), xr4.extent(1)); + ASSERT_EQ(yr4.extent(1), xr4.extent(3)); + ASSERT_EQ(yl4.rank(), 2); + ASSERT_EQ(yr4.rank(), 2); + + ASSERT_EQ(&yl4(4, 4) - &xl4(1, 4, 2, 4), 0); + ASSERT_EQ(&yr4(4, 4) - &xr4(1, 4, 2, 4), 0); + } + + static void run_test_vector() { + static const unsigned Length = 1000, Count = 8; + + using multivector_type = + typename Kokkos::DynRankView<T, Kokkos::LayoutLeft, host_drv_space>; + + using multivector_right_type = + typename Kokkos::DynRankView<T, Kokkos::LayoutRight, host_drv_space>; + + multivector_type mv = multivector_type("mv", Length, Count); + multivector_right_type mv_right = + multivector_right_type("mv", Length, Count); + + using svector_type = + typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>; + using smultivector_type = + typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>; + using const_svector_right_type = + typename Kokkos::DynRankView<const T, Kokkos::LayoutStride, + host_drv_space>; + using const_svector_type = + typename Kokkos::DynRankView<const T, Kokkos::LayoutStride, + host_drv_space>; + using const_smultivector_type = + typename Kokkos::DynRankView<const T, Kokkos::LayoutStride, + host_drv_space>; + + svector_type v1 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 0); + svector_type v2 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 1); + svector_type v3 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 2); + + svector_type rv1 = Kokkos::subdynrankview(mv_right, 0, Kokkos::ALL()); + svector_type rv2 = Kokkos::subdynrankview(mv_right, 1, Kokkos::ALL()); + svector_type rv3 = Kokkos::subdynrankview(mv_right, 2, Kokkos::ALL()); + + smultivector_type mv1 = Kokkos::subdynrankview(mv, std::make_pair(1, 998), + std::make_pair(2, 5)); + + smultivector_type mvr1 = Kokkos::subdynrankview( + mv_right, std::make_pair(1, 
998), std::make_pair(2, 5)); + + const_svector_type cv1 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 0); + const_svector_type cv2 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 1); + const_svector_type cv3 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 2); + + svector_type vr1 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 0); + svector_type vr2 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 1); + svector_type vr3 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 2); + + const_svector_right_type cvr1 = + Kokkos::subdynrankview(mv, Kokkos::ALL(), 0); + const_svector_right_type cvr2 = + Kokkos::subdynrankview(mv, Kokkos::ALL(), 1); + const_svector_right_type cvr3 = + Kokkos::subdynrankview(mv, Kokkos::ALL(), 2); + + ASSERT_TRUE(&v1[0] == &v1(0)); + ASSERT_TRUE(&v1[0] == &mv(0, 0)); + ASSERT_TRUE(&v2[0] == &mv(0, 1)); + ASSERT_TRUE(&v3[0] == &mv(0, 2)); + + ASSERT_TRUE(&cv1[0] == &mv(0, 0)); + ASSERT_TRUE(&cv2[0] == &mv(0, 1)); + ASSERT_TRUE(&cv3[0] == &mv(0, 2)); + + ASSERT_TRUE(&vr1[0] == &mv(0, 0)); + ASSERT_TRUE(&vr2[0] == &mv(0, 1)); + ASSERT_TRUE(&vr3[0] == &mv(0, 2)); + + ASSERT_TRUE(&cvr1[0] == &mv(0, 0)); + ASSERT_TRUE(&cvr2[0] == &mv(0, 1)); + ASSERT_TRUE(&cvr3[0] == &mv(0, 2)); + + ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2)); + ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3)); + ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4)); + ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2)); + ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3)); + ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4)); + + const_svector_type c_cv1(v1); + typename svector_type::const_type c_cv2(v2); + typename const_svector_type::const_type c_ccv2(v2); + + const_smultivector_type cmv(mv); + typename smultivector_type::const_type cmvX(cmv); + typename const_smultivector_type::const_type ccmvX(cmv); + } +}; + +} // namespace Test + +/*--------------------------------------------------------------------------*/ diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI_generic.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI_generic.hpp new file 
mode 100644 index 0000000000000000000000000000000000000000..90ca5df194d5c33f53446125470ab7f464085095 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI_generic.hpp @@ -0,0 +1,50 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestDynViewAPI.hpp> +namespace Test { +TEST(TEST_CATEGORY, dyn_rank_view_api_generic) { + TestDynViewAPI<double, TEST_EXECSPACE>::run_tests(); +} +} // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank12345.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank12345.hpp new file mode 100644 index 0000000000000000000000000000000000000000..050ebbe35cab5e85b726b7ad5f9a10f3170607bb --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank12345.hpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestDynViewAPI.hpp> + +namespace Test { +TEST(TEST_CATEGORY, dyn_rank_view_api_operator_rank12345) { + TestDynViewAPI<double, TEST_EXECSPACE>::run_operator_test_rank12345(); +} +} // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank67.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank67.hpp new file mode 100644 index 0000000000000000000000000000000000000000..eb8df60a89d204fee4e454956304e48026a7c120 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI_rank67.hpp @@ -0,0 +1,50 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestDynViewAPI.hpp> +namespace Test { +TEST(TEST_CATEGORY, dyn_rank_view_api_operator_rank67) { + TestDynViewAPI<double, TEST_EXECSPACE>::run_operator_test_rank67(); +} +} // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f018793dd6f3b162acbf9db20174c47ac75fc1c0 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -0,0 +1,256 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_DYNAMICVIEW_HPP +#define KOKKOS_TEST_DYNAMICVIEW_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <cstdlib> +#include <cstdio> +#include <Kokkos_Core.hpp> + +#include <Kokkos_DynamicView.hpp> +#include <impl/Kokkos_Timer.hpp> + +namespace Test { + +template <typename Scalar, class Space> +struct TestDynamicView { + using execution_space = typename Space::execution_space; + using memory_space = typename Space::memory_space; + + using view_type = Kokkos::Experimental::DynamicView<Scalar*, Space>; + + using value_type = double; + + static void run(unsigned arg_total_size) { + // Test: Create DynamicView, initialize size (via resize), run through + // parallel_for to set values, check values (via parallel_reduce); resize + // values and repeat + // Case 1: min_chunk_size is a power of 2 + { + { + view_type d1; + ASSERT_FALSE(d1.is_allocated()); + + d1 = view_type("d1", 1024, arg_total_size); + view_type d2(d1); + view_type d3("d3", 1024, arg_total_size); + + ASSERT_FALSE(d1.is_allocated()); + ASSERT_FALSE(d2.is_allocated()); + ASSERT_FALSE(d3.is_allocated()); + + unsigned d_size = arg_total_size / 8; + d1.resize_serial(d_size); + d2.resize_serial(d_size); + d3.resize_serial(d_size); + + ASSERT_TRUE(d1.is_allocated()); + ASSERT_TRUE(d2.is_allocated()); + ASSERT_TRUE(d3.is_allocated()); + } + view_type da("da", 1024, arg_total_size); + ASSERT_EQ(da.size(), 0); + // Init + unsigned da_size = arg_total_size / 8; + da.resize_serial(da_size); + ASSERT_EQ(da.size(), da_size); + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy<execution_space>(0, da_size), + KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); + + value_type result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space>(0, da_size), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { 
+ partial_sum += (value_type)da(i); + }, + result_sum); + + ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); +#endif + + // add 3x more entries i.e. 4x larger than previous size + // the first 1/4 should remain the same + unsigned da_resize = arg_total_size / 2; + da.resize_serial(da_resize); + ASSERT_EQ(da.size(), da_resize); + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy<execution_space>(da_size, da_resize), + KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); + + value_type new_result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space>(da_size, da_resize), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { + partial_sum += (value_type)da(i); + }, + new_result_sum); + + ASSERT_EQ(new_result_sum + result_sum, + (value_type)(da_resize * (da_resize - 1) / 2)); +#endif + } // end scope + + // Test: Create DynamicView, initialize size (via resize), run through + // parallel_for to set values, check values (via parallel_reduce); resize + // values and repeat + // Case 2: min_chunk_size is NOT a power of 2 + { + view_type da("da", 1023, arg_total_size); + ASSERT_EQ(da.size(), 0); + // Init + unsigned da_size = arg_total_size / 8; + da.resize_serial(da_size); + ASSERT_EQ(da.size(), da_size); + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy<execution_space>(0, da_size), + KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); + + value_type result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space>(0, da_size), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { + partial_sum += (value_type)da(i); + }, + result_sum); + + ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); +#endif + + // add 3x more entries i.e. 
4x larger than previous size + // the first 1/4 should remain the same + unsigned da_resize = arg_total_size / 2; + da.resize_serial(da_resize); + ASSERT_EQ(da.size(), da_resize); + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy<execution_space>(da_size, da_resize), + KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); + + value_type new_result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space>(da_size, da_resize), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { + partial_sum += (value_type)da(i); + }, + new_result_sum); + + ASSERT_EQ(new_result_sum + result_sum, + (value_type)(da_resize * (da_resize - 1) / 2)); +#endif + } // end scope + + // Test: Create DynamicView, initialize size (via resize), run through + // parallel_for to set values, check values (via parallel_reduce); resize + // values and repeat + // Case 3: resize reduces the size + { + view_type da("da", 1023, arg_total_size); + ASSERT_EQ(da.size(), 0); + // Init + unsigned da_size = arg_total_size / 2; + da.resize_serial(da_size); + ASSERT_EQ(da.size(), da_size); + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy<execution_space>(0, da_size), + KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); + + value_type result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space>(0, da_size), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { + partial_sum += (value_type)da(i); + }, + result_sum); + + ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); +#endif + + // remove the final 3/4 entries i.e. 
first 1/4 remain + unsigned da_resize = arg_total_size / 8; + da.resize_serial(da_resize); + ASSERT_EQ(da.size(), da_resize); + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy<execution_space>(0, da_resize), + KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); + + value_type new_result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space>(0, da_resize), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { + partial_sum += (value_type)da(i); + }, + new_result_sum); + + ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); +#endif + } // end scope + } +}; + +TEST(TEST_CATEGORY, dynamic_view) { + using TestDynView = TestDynamicView<double, TEST_EXECSPACE>; + + for (int i = 0; i < 10; ++i) { + TestDynView::run(100000 + 100 * i); + } +} + +} // namespace Test + +#endif /* #ifndef KOKKOS_TEST_DYNAMICVIEW_HPP */ diff --git a/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a90885bd33a8731667e20804d3c70fb5b8f35c37 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp @@ -0,0 +1,250 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP +#define KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <Kokkos_Core.hpp> +#include <Kokkos_ErrorReporter.hpp> + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +namespace Test { + +// Just save the data in the report. Informative text goes in the +// operator<<(..). 
+template <typename DataType1, typename DataType2, typename DataType3> +struct ThreeValReport { + DataType1 m_data1; + DataType2 m_data2; + DataType3 m_data3; +}; + +template <typename DataType1, typename DataType2, typename DataType3> +std::ostream &operator<<( + std::ostream &os, + const ThreeValReport<DataType1, DataType2, DataType3> &val) { + return os << "{" << val.m_data1 << " " << val.m_data2 << " " << val.m_data3 + << "}"; +} + +template <typename ReportType> +void checkReportersAndReportsAgree(const std::vector<int> &reporters, + const std::vector<ReportType> &reports) { + for (size_t i = 0; i < reports.size(); ++i) { + EXPECT_EQ(1, reporters[i] % 2); + EXPECT_EQ(reporters[i], reports[i].m_data1); + } +} + +template <typename DeviceType> +struct ErrorReporterDriverBase { + using report_type = ThreeValReport<int, int, double>; + using error_reporter_type = + Kokkos::Experimental::ErrorReporter<report_type, DeviceType>; + error_reporter_type m_errorReporter; + + ErrorReporterDriverBase(int reporter_capacity, int /*test_size*/) + : m_errorReporter(reporter_capacity) {} + + KOKKOS_INLINE_FUNCTION bool error_condition(const int work_idx) const { + return (work_idx % 2 != 0); + } + + void check_expectations(int reporter_capacity, int test_size) { + using namespace std; + int num_reported = m_errorReporter.getNumReports(); + int num_attempts = m_errorReporter.getNumReportAttempts(); + + int expected_num_reports = min(reporter_capacity, test_size / 2); + EXPECT_EQ(expected_num_reports, num_reported); + EXPECT_EQ(test_size / 2, num_attempts); + + bool expect_full = (reporter_capacity <= (test_size / 2)); + bool reported_full = m_errorReporter.full(); + EXPECT_EQ(expect_full, reported_full); + } +}; + +template <typename ErrorReporterDriverType> +void TestErrorReporter() { + using tester_type = ErrorReporterDriverType; + std::vector<int> reporters; + std::vector<typename tester_type::report_type> reports; + + tester_type test1(100, 10); + 
test1.m_errorReporter.getReports(reporters, reports); + checkReportersAndReportsAgree(reporters, reports); + + tester_type test2(10, 100); + test2.m_errorReporter.getReports(reporters, reports); + checkReportersAndReportsAgree(reporters, reports); + + typename Kokkos::View< + int *, typename ErrorReporterDriverType::execution_space>::HostMirror + view_reporters; + typename Kokkos::View<typename tester_type::report_type *, + typename ErrorReporterDriverType::execution_space>:: + HostMirror view_reports; + test2.m_errorReporter.getReports(view_reporters, view_reports); + + int num_reports = view_reporters.extent(0); + reporters.clear(); + reports.clear(); + reporters.reserve(num_reports); + reports.reserve(num_reports); + + for (int i = 0; i < num_reports; ++i) { + reporters.push_back(view_reporters(i)); + reports.push_back(view_reports(i)); + } + checkReportersAndReportsAgree(reporters, reports); +} + +template <typename DeviceType> +struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType> { + using driver_base = ErrorReporterDriverBase<DeviceType>; + using execution_space = + typename driver_base::error_reporter_type::execution_space; + + ErrorReporterDriver(int reporter_capacity, int test_size) + : driver_base(reporter_capacity, test_size) { + execute(reporter_capacity, test_size); + + // Test that clear() and resize() work across memory spaces. 
+ if (reporter_capacity < test_size) { + driver_base::m_errorReporter.clear(); + driver_base::m_errorReporter.resize(test_size); + execute(test_size, test_size); + } + } + + void execute(int reporter_capacity, int test_size) { + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, test_size), + *this); + Kokkos::fence(); + driver_base::check_expectations(reporter_capacity, test_size); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int work_idx) const { + if (driver_base::error_condition(work_idx)) { + double val = M_PI * static_cast<double>(work_idx); + typename driver_base::report_type report = {work_idx, -2 * work_idx, val}; + driver_base::m_errorReporter.add_report(work_idx, report); + } + } +}; + +#if defined(KOKKOS_CLASS_LAMBDA) && \ + (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)) +template <typename DeviceType> +struct ErrorReporterDriverUseLambda + : public ErrorReporterDriverBase<DeviceType> { + using driver_base = ErrorReporterDriverBase<DeviceType>; + using execution_space = + typename driver_base::error_reporter_type::execution_space; + + ErrorReporterDriverUseLambda(int reporter_capacity, int test_size) + : driver_base(reporter_capacity, test_size) { + execute(reporter_capacity, test_size); + } + + void execute(int reporter_capacity, int test_size) { + Kokkos::parallel_for( + Kokkos::RangePolicy<execution_space>(0, test_size), + KOKKOS_CLASS_LAMBDA(const int work_idx) { + if (driver_base::error_condition(work_idx)) { + double val = M_PI * static_cast<double>(work_idx); + typename driver_base::report_type report = {work_idx, -2 * work_idx, + val}; + driver_base::m_errorReporter.add_report(work_idx, report); + } + }); + Kokkos::fence(); + driver_base::check_expectations(reporter_capacity, test_size); + } +}; +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +struct ErrorReporterDriverNativeOpenMP + : public ErrorReporterDriverBase<Kokkos::OpenMP> { + using driver_base = ErrorReporterDriverBase<Kokkos::OpenMP>; + using 
execution_space = + typename driver_base::error_reporter_type::execution_space; + + ErrorReporterDriverNativeOpenMP(int reporter_capacity, int test_size) + : driver_base(reporter_capacity, test_size) { +#pragma omp parallel for + for (int work_idx = 0; work_idx < test_size; ++work_idx) { + if (driver_base::error_condition(work_idx)) { + double val = M_PI * static_cast<double>(work_idx); + typename driver_base::report_type report = {work_idx, -2 * work_idx, + val}; + driver_base::m_errorReporter.add_report(work_idx, report); + } + }; + driver_base::check_expectations(reporter_capacity, test_size); + } +}; +#endif + +#if defined(KOKKOS_CLASS_LAMBDA) && \ + (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)) +TEST(TEST_CATEGORY, ErrorReporterViaLambda) { + TestErrorReporter<ErrorReporterDriverUseLambda<TEST_EXECSPACE>>(); +} +#endif + +TEST(TEST_CATEGORY, ErrorReporter) { + TestErrorReporter<ErrorReporterDriver<TEST_EXECSPACE>>(); +} + +} // namespace Test +#endif // #ifndef KOKKOS_TEST_ERROR_REPORTING_HPP diff --git a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9ddc226e291f6e7dc7d6bc960fad470fafeb9974 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -0,0 +1,717 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +/* + * FIXME the OffsetView class is really not very well tested. 
+ */ +#ifndef CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_ +#define CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_ + +#include <gtest/gtest.h> +#include <iostream> +#include <cstdlib> +#include <cstdio> +#include <impl/Kokkos_Timer.hpp> +#include <Kokkos_OffsetView.hpp> +#include <KokkosExp_MDRangePolicy.hpp> + +using std::cout; +using std::endl; + +namespace Test { + +template <typename Scalar, typename Device> +void test_offsetview_construction() { + using offset_view_type = Kokkos::Experimental::OffsetView<Scalar**, Device>; + using view_type = Kokkos::View<Scalar**, Device>; + + Kokkos::Experimental::index_list_type range0 = {-1, 3}; + Kokkos::Experimental::index_list_type range1 = {-2, 2}; + + { + offset_view_type o1; + ASSERT_FALSE(o1.is_allocated()); + + o1 = offset_view_type("o1", range0, range1); + offset_view_type o2(o1); + offset_view_type o3("o3", range0, range1); + + ASSERT_TRUE(o1.is_allocated()); + ASSERT_TRUE(o2.is_allocated()); + ASSERT_TRUE(o3.is_allocated()); + } + + offset_view_type ov("firstOV", range0, range1); + + ASSERT_EQ("firstOV", ov.label()); + ASSERT_EQ(2, ov.Rank); + + ASSERT_EQ(ov.begin(0), -1); + ASSERT_EQ(ov.end(0), 4); + + ASSERT_EQ(ov.begin(1), -2); + ASSERT_EQ(ov.end(1), 3); + + ASSERT_EQ(ov.extent(0), 5); + ASSERT_EQ(ov.extent(1), 5); + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::Experimental::OffsetView<Scalar*, Device> offsetV1("OneDOffsetView", + range0); + + Kokkos::RangePolicy<Device, int> rangePolicy1(offsetV1.begin(0), + offsetV1.end(0)); + Kokkos::parallel_for( + rangePolicy1, KOKKOS_LAMBDA(const int i) { offsetV1(i) = 1; }); + Kokkos::fence(); + + int OVResult = 0; + Kokkos::parallel_reduce( + rangePolicy1, + KOKKOS_LAMBDA(const int i, int& updateMe) { updateMe += offsetV1(i); }, + OVResult); + + Kokkos::fence(); + ASSERT_EQ(OVResult, offsetV1.end(0) - offsetV1.begin(0)) + << "found wrong number of elements in OffsetView that was summed."; + } + { // test deep copy of scalar const value 
into mirro + const int constVal = 6; + typename offset_view_type::HostMirror hostOffsetView = + Kokkos::create_mirror_view(ov); + + Kokkos::deep_copy(hostOffsetView, constVal); + + for (int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) { + for (int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) { + ASSERT_EQ(hostOffsetView(i, j), constVal) + << "Bad data found in OffsetView"; + } + } + } + + const int ovmin0 = ov.begin(0); + const int ovend0 = ov.end(0); + const int ovmin1 = ov.begin(1); + const int ovend1 = ov.end(1); + + using range_type = + Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> >; + using point_type = typename range_type::point_type; + + range_type rangePolicy2D(point_type{{ovmin0, ovmin1}}, + point_type{{ovend0, ovend1}}); + + const int constValue = 9; + Kokkos::parallel_for( + rangePolicy2D, + KOKKOS_LAMBDA(const int i, const int j) { ov(i, j) = constValue; }); + + // test offsetview to offsetviewmirror deep copy + typename offset_view_type::HostMirror hostOffsetView = + Kokkos::create_mirror_view(ov); + + Kokkos::deep_copy(hostOffsetView, ov); + + for (int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) { + for (int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) { + ASSERT_EQ(hostOffsetView(i, j), constValue) + << "Bad data found in OffsetView"; + } + } + + int OVResult = 0; + Kokkos::parallel_reduce( + rangePolicy2D, + KOKKOS_LAMBDA(const int i, const int j, int& updateMe) { + updateMe += ov(i, j); + }, + OVResult); + + int answer = 0; + for (int i = ov.begin(0); i < ov.end(0); ++i) { + for (int j = ov.begin(1); j < ov.end(1); ++j) { + answer += constValue; + } + } + + ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; +#endif + + { + offset_view_type ovCopy(ov); + ASSERT_EQ(ovCopy == ov, true) + << "Copy constructor or equivalence operator broken"; + } + + { + offset_view_type ovAssigned = ov; + ASSERT_EQ(ovAssigned == ov, true) + << "Assignment operator or 
equivalence operator broken"; + } + + { // construct OffsetView from a View plus begins array + const int extent0 = 100; + const int extent1 = 200; + const int extent2 = 300; + Kokkos::View<Scalar***, Device> view3D("view3D", extent0, extent1, extent2); + + Kokkos::deep_copy(view3D, 1); + + using range3_type = Kokkos::MDRangePolicy<Device, Kokkos::Rank<3>, + Kokkos::IndexType<int64_t> >; + using point3_type = typename range3_type::point_type; + + typename point3_type::value_type begins0 = -10, begins1 = -20, + begins2 = -30; + Kokkos::Array<int64_t, 3> begins = {{begins0, begins1, begins2}}; + Kokkos::Experimental::OffsetView<Scalar***, Device> offsetView3D(view3D, + begins); + + range3_type rangePolicy3DZero(point3_type{{0, 0, 0}}, + point3_type{{extent0, extent1, extent2}}); + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + int view3DSum = 0; + Kokkos::parallel_reduce( + rangePolicy3DZero, + KOKKOS_LAMBDA(const int i, const int j, int k, int& updateMe) { + updateMe += view3D(i, j, k); + }, + view3DSum); + + range3_type rangePolicy3D( + point3_type{{begins0, begins1, begins2}}, + point3_type{{begins0 + extent0, begins1 + extent1, begins2 + extent2}}); + int offsetView3DSum = 0; + + Kokkos::parallel_reduce( + rangePolicy3D, + KOKKOS_LAMBDA(const int i, const int j, int k, int& updateMe) { + updateMe += offsetView3D(i, j, k); + }, + offsetView3DSum); + + ASSERT_EQ(view3DSum, offsetView3DSum) + << "construction of OffsetView from View and begins array broken."; +#endif + } + view_type viewFromOV = ov.view(); + + ASSERT_EQ(viewFromOV == ov, true) + << "OffsetView::view() or equivalence operator View == OffsetView broken"; + + { + offset_view_type ovFromV(viewFromOV, {-1, -2}); + + ASSERT_EQ(ovFromV == viewFromOV, true) + << "Construction of OffsetView from View or equivalence operator " + "OffsetView == View broken"; + } + { + offset_view_type ovFromV = viewFromOV; + ASSERT_EQ(ovFromV == viewFromOV, true) + << "Construction of OffsetView from 
View by assignment (implicit " + "conversion) or equivalence operator OffsetView == View broken"; + } + + { // test offsetview to view deep copy + view_type aView("aView", ov.extent(0), ov.extent(1)); + Kokkos::deep_copy(aView, ov); + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + int sum = 0; + Kokkos::parallel_reduce( + rangePolicy2D, + KOKKOS_LAMBDA(const int i, const int j, int& updateMe) { + updateMe += ov(i, j) - aView(i - ov.begin(0), j - ov.begin(1)); + }, + sum); + + ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; +#endif + } + + { // test view to offsetview deep copy + view_type aView("aView", ov.extent(0), ov.extent(1)); + + Kokkos::deep_copy(aView, 99); + Kokkos::deep_copy(ov, aView); + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + int sum = 0; + Kokkos::parallel_reduce( + rangePolicy2D, + KOKKOS_LAMBDA(const int i, const int j, int& updateMe) { + updateMe += ov(i, j) - aView(i - ov.begin(0), j - ov.begin(1)); + }, + sum); + + ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; +#endif + } +} + +template <typename Scalar, typename Device> +void test_offsetview_unmanaged_construction() { + // Preallocated memory (Only need a valid address for this test) + Scalar s; + + { + // Constructing an OffsetView directly around our preallocated memory + Kokkos::Array<int64_t, 1> begins1{{2}}; + Kokkos::Array<int64_t, 1> ends1{{3}}; + Kokkos::Experimental::OffsetView<Scalar*, Device> ov1(&s, begins1, ends1); + + // Constructing an OffsetView around an unmanaged View of our preallocated + // memory + Kokkos::View<Scalar*, Device> v1(&s, ends1[0] - begins1[0]); + Kokkos::Experimental::OffsetView<Scalar*, Device> ovv1(v1, begins1); + + // They should match + ASSERT_EQ(ovv1, ov1) + << "OffsetView unmanaged construction fails for rank 1"; + } + + { + Kokkos::Array<int64_t, 2> begins2{{-2, -7}}; + Kokkos::Array<int64_t, 2> ends2{{5, -3}}; + Kokkos::Experimental::OffsetView<Scalar**, Device> 
ov2(&s, begins2, ends2); + + Kokkos::View<Scalar**, Device> v2(&s, ends2[0] - begins2[0], + ends2[1] - begins2[1]); + Kokkos::Experimental::OffsetView<Scalar**, Device> ovv2(v2, begins2); + + ASSERT_EQ(ovv2, ov2) + << "OffsetView unmanaged construction fails for rank 2"; + } + + { + Kokkos::Array<int64_t, 3> begins3{{2, 3, 5}}; + Kokkos::Array<int64_t, 3> ends3{{7, 11, 13}}; + Kokkos::Experimental::OffsetView<Scalar***, Device> ovv3(&s, begins3, + ends3); + + Kokkos::View<Scalar***, Device> v3(&s, ends3[0] - begins3[0], + ends3[1] - begins3[1], + ends3[2] - begins3[2]); + Kokkos::Experimental::OffsetView<Scalar***, Device> ov3(v3, begins3); + + ASSERT_EQ(ovv3, ov3) + << "OffsetView unmanaged construction fails for rank 3"; + } + + { + // Test all four public constructor overloads (begins_type x + // index_list_type) + Kokkos::Array<int64_t, 1> begins{{-3}}; + Kokkos::Array<int64_t, 1> ends{{2}}; + + Kokkos::Experimental::OffsetView<Scalar*, Device> bb(&s, begins, ends); + Kokkos::Experimental::OffsetView<Scalar*, Device> bi(&s, begins, {2}); + Kokkos::Experimental::OffsetView<Scalar*, Device> ib(&s, {-3}, ends); + Kokkos::Experimental::OffsetView<Scalar*, Device> ii(&s, {-3}, {2}); + + ASSERT_EQ(bb, bi); + ASSERT_EQ(bb, ib); + ASSERT_EQ(bb, ii); + } + +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + { + using offset_view_type = Kokkos::Experimental::OffsetView<Scalar*, Device>; + + // Range calculations must be positive + ASSERT_NO_THROW(offset_view_type(&s, {0}, {1})); + ASSERT_NO_THROW(offset_view_type(&s, {0}, {0})); + ASSERT_THROW(offset_view_type(&s, {0}, {-1}), std::runtime_error); + } + + { + using offset_view_type = Kokkos::Experimental::OffsetView<Scalar*, Device>; + + // Range calculations must not overflow + ASSERT_NO_THROW(offset_view_type(&s, {0}, {0x7fffffffffffffffl})); + ASSERT_THROW(offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), + std::runtime_error); + ASSERT_THROW( + offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0x7fffffffffffffffl}), 
+ std::runtime_error); + ASSERT_THROW(offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), + std::runtime_error); + } + + { + using offset_view_type = Kokkos::Experimental::OffsetView<Scalar**, Device>; + + // Should throw when the rank of begins and/or ends doesn't match that of + // OffsetView + ASSERT_THROW(offset_view_type(&s, {0}, {1}), std::runtime_error); + ASSERT_THROW(offset_view_type(&s, {0}, {1, 1}), std::runtime_error); + ASSERT_THROW(offset_view_type(&s, {0}, {1, 1, 1}), std::runtime_error); + ASSERT_THROW(offset_view_type(&s, {0, 0}, {1}), std::runtime_error); + ASSERT_NO_THROW(offset_view_type(&s, {0, 0}, {1, 1})); + ASSERT_THROW(offset_view_type(&s, {0, 0}, {1, 1, 1}), std::runtime_error); + ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1}), std::runtime_error); + ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1}), std::runtime_error); + ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), + std::runtime_error); + } +#endif // KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +} + +template <typename Scalar, typename Device> +void test_offsetview_subview() { + { // test subview 1 + Kokkos::Experimental::OffsetView<Scalar*, Device> sliceMe("offsetToSlice", + {-10, 20}); + { + auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0); + ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken."; + } + } + { // test subview 2 + Kokkos::Experimental::OffsetView<Scalar**, Device> sliceMe( + "offsetToSlice", {-10, 20}, {-20, 30}); + { + auto offsetSubview = + Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), -2); + ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + } + + { + auto offsetSubview = + Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL()); + ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + } + } + + { // test subview rank 3 + + Kokkos::Experimental::OffsetView<Scalar***, Device> sliceMe( + "offsetToSlice", {-10, 20}, {-20, 30}, {-30, 40}); + + // slice 1 + { + auto offsetSubview = 
Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), + Kokkos::ALL(), 0); + ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + } + { + auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), + 0, Kokkos::ALL()); + ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + } + + { + auto offsetSubview = Kokkos::Experimental::subview( + sliceMe, 0, Kokkos::ALL(), Kokkos::ALL()); + ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + } + { + auto offsetSubview = Kokkos::Experimental::subview( + sliceMe, 0, Kokkos::ALL(), Kokkos::make_pair(-30, -21)); + ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + + ASSERT_EQ(offsetSubview.begin(0), -20); + ASSERT_EQ(offsetSubview.end(0), 31); + ASSERT_EQ(offsetSubview.begin(1), 0); + ASSERT_EQ(offsetSubview.end(1), 9); + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + using range_type = Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, + Kokkos::IndexType<int> >; + using point_type = typename range_type::point_type; + + const int b0 = offsetSubview.begin(0); + const int b1 = offsetSubview.begin(1); + + const int e0 = offsetSubview.end(0); + const int e1 = offsetSubview.end(1); + + range_type rangeP2D(point_type{{b0, b1}}, point_type{{e0, e1}}); + + Kokkos::parallel_for( + rangeP2D, + KOKKOS_LAMBDA(const int i, const int j) { offsetSubview(i, j) = 6; }); + + int sum = 0; + Kokkos::parallel_reduce( + rangeP2D, + KOKKOS_LAMBDA(const int i, const int j, int& updateMe) { + updateMe += offsetSubview(i, j); + }, + sum); + + ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); +#endif + } + + // slice 2 + { + auto offsetSubview = + Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0); + ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + } + { + auto offsetSubview = + Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL()); + ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + } + + { + auto 
offsetSubview = + Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0); + ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + } + } + + { // test subview rank 4 + + Kokkos::Experimental::OffsetView<Scalar****, Device> sliceMe( + "offsetToSlice", {-10, 20}, {-20, 30}, {-30, 40}, {-40, 50}); + + // slice 1 + { + auto offsetSubview = Kokkos::Experimental::subview( + sliceMe, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), 0); + ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + } + { + auto offsetSubview = Kokkos::Experimental::subview( + sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL()); + ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + } + { + auto offsetSubview = Kokkos::Experimental::subview( + sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL()); + ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + } + { + auto offsetSubview = Kokkos::Experimental::subview( + sliceMe, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + } + + // slice 2 + auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), + Kokkos::ALL(), 0, 0); + ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken."; + { + auto offsetSubview2b = Kokkos::Experimental::subview( + sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0); + ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + } + { + auto offsetSubview2b = Kokkos::Experimental::subview( + sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL()); + ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + } + { + auto offsetSubview2b = Kokkos::Experimental::subview( + sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL()); + ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + } + { + auto offsetSubview2b = Kokkos::Experimental::subview( + sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL()); + ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview 
of offset is broken."; + } + // slice 3 + { + auto offsetSubview = + Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0); + ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + } + { + auto offsetSubview = + Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0); + ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + } + { + auto offsetSubview = + Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0); + ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + } + { + auto offsetSubview = + Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL()); + ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + } + } +} + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) +template <class InputIt, class T, class BinaryOperation> +KOKKOS_INLINE_FUNCTION T std_accumulate(InputIt first, InputIt last, T init, + BinaryOperation op) { + for (; first != last; ++first) { + init = op(std::move(init), *first); + } + return init; +} + +KOKKOS_INLINE_FUNCTION int element(std::initializer_list<int> il) { + return std_accumulate(il.begin(), il.end(), 0, + [](int l, int r) { return l * 10 + r; }); +} + +template <typename DEVICE> +void test_offsetview_offsets_rank1() { + using data_type = int*; + using view_type = Kokkos::View<data_type, DEVICE>; + using index_type = Kokkos::IndexType<int>; + using execution_policy = Kokkos::RangePolicy<DEVICE, index_type>; + using offset_view_type = Kokkos::Experimental::OffsetView<data_type, DEVICE>; + + view_type v("View1", 10); + Kokkos::parallel_for( + "For1", execution_policy(0, v.extent_int(0)), + KOKKOS_LAMBDA(const int i) { v(i) = element({i}); }); + + int errors; + Kokkos::parallel_reduce( + "Reduce1", execution_policy(-3, 4), + KOKKOS_LAMBDA(const int ii, int& lerrors) { + offset_view_type ov(v, {ii}); + lerrors += (ov(3) != element({3 - ii})); + }, + errors); + + ASSERT_EQ(0, errors); +} + +template <typename DEVICE> +void 
test_offsetview_offsets_rank2() { + using data_type = int**; + using view_type = Kokkos::View<data_type, DEVICE>; + using index_type = Kokkos::IndexType<int>; + using execution_policy = Kokkos::RangePolicy<DEVICE, index_type>; + using offset_view_type = Kokkos::Experimental::OffsetView<data_type, DEVICE>; + + view_type v("View2", 10, 10); + Kokkos::parallel_for( + "For2", execution_policy(0, v.extent_int(0)), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j != v.extent_int(1); ++j) { + v(i, j) = element({i, j}); + } + }); + + int errors; + Kokkos::parallel_reduce( + "Reduce2", execution_policy(-3, 4), + KOKKOS_LAMBDA(const int ii, int& lerrors) { + for (int jj = -3; jj <= 3; ++jj) { + offset_view_type ov(v, {ii, jj}); + lerrors += (ov(3, 3) != element({3 - ii, 3 - jj})); + } + }, + errors); + + ASSERT_EQ(0, errors); +} + +template <typename DEVICE> +void test_offsetview_offsets_rank3() { + using data_type = int***; + using view_type = Kokkos::View<data_type, DEVICE>; + using index_type = Kokkos::IndexType<int>; + using execution_policy = Kokkos::RangePolicy<DEVICE, index_type>; + using offset_view_type = Kokkos::Experimental::OffsetView<data_type, DEVICE>; + + view_type v("View3", 10, 10, 10); + Kokkos::parallel_for( + "For3", execution_policy(0, v.extent_int(0)), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j != v.extent_int(1); ++j) { + for (int k = 0; k != v.extent_int(2); ++k) { + v(i, j, k) = element({i, j, k}); + } + } + }); + + int errors; + Kokkos::parallel_reduce( + "Reduce3", execution_policy(-3, 4), + KOKKOS_LAMBDA(const int ii, int& lerrors) { + for (int jj = -3; jj <= 3; ++jj) { + for (int kk = -3; kk <= 3; ++kk) { + offset_view_type ov(v, {ii, jj, kk}); + lerrors += (ov(3, 3, 3) != element({3 - ii, 3 - jj, 3 - kk})); + } + } + }, + errors); + + ASSERT_EQ(0, errors); +} +#endif + +TEST(TEST_CATEGORY, offsetview_construction) { + test_offsetview_construction<int, TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { + 
test_offsetview_unmanaged_construction<int, TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, offsetview_subview) { + test_offsetview_subview<int, TEST_EXECSPACE>(); +} + +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) +TEST(TEST_CATEGORY, offsetview_offsets_rank1) { + test_offsetview_offsets_rank1<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, offsetview_offsets_rank2) { + test_offsetview_offsets_rank2<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, offsetview_offsets_rank3) { + test_offsetview_offsets_rank3<TEST_EXECSPACE>(); +} +#endif + +} // namespace Test + +#endif /* CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_ */ diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp b/packages/kokkos/containers/unit_tests/TestScatterView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fdbce2d492009cf38d5491398d77423108edc6a5 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp @@ -0,0 +1,717 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SCATTER_VIEW_HPP +#define KOKKOS_TEST_SCATTER_VIEW_HPP + +#include <Kokkos_ScatterView.hpp> +#include <gtest/gtest.h> + +namespace Test { + +template <typename DeviceType, typename Layout, typename Duplication, + typename Contribution, typename Op, typename NumberType> +struct test_scatter_view_impl_cls; + +template <typename DeviceType, typename Layout, typename Duplication, + typename Contribution, typename NumberType> +struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, + Kokkos::Experimental::ScatterSum, + NumberType> { + public: + using scatter_view_type = + Kokkos::Experimental::ScatterView<NumberType * [12], Layout, DeviceType, + Kokkos::Experimental::ScatterSum, + Duplication, Contribution>; + + using orig_view_type = Kokkos::View<NumberType * [12], Layout, DeviceType>; + + scatter_view_type scatter_view; + int 
scatterSize; + + test_scatter_view_impl_cls(const scatter_view_type& view) { + scatter_view = view; + scatterSize = 0; + } + + void initialize(orig_view_type orig) { + auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig); + Kokkos::fence(); + for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); + ++i) { + host_view(i, 0) = 0.0; + host_view(i, 1) = 0.0; + host_view(i, 2) = 0.0; + host_view(i, 3) = 0.0; + host_view(i, 4) = 0.0; + host_view(i, 5) = 0.0; + host_view(i, 6) = 0.0; + host_view(i, 7) = 0.0; + host_view(i, 8) = 0.0; + host_view(i, 9) = 0.0; + host_view(i, 10) = 0.0; + host_view(i, 11) = 0.0; + } + Kokkos::fence(); + Kokkos::deep_copy(orig, host_view); + } + + void run_parallel(int n) { + scatterSize = n; + auto policy = + Kokkos::RangePolicy<typename DeviceType::execution_space, int>(0, n); + Kokkos::parallel_for(policy, *this, "scatter_view_test: Sum"); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + auto scatter_access = scatter_view.access(); + auto scatter_access_atomic = + scatter_view.template access<Kokkos::Experimental::ScatterAtomic>(); + for (int j = 0; j < 10; ++j) { + auto k = (i + j) % scatterSize; + scatter_access(k, 0) += 4; + ++scatter_access(k, 1); + --scatter_access(k, 2); + scatter_access(k, 3)++; + scatter_access(k, 4)--; + scatter_access(k, 5) -= 5; + scatter_access_atomic(k, 6) += 2; + scatter_access_atomic(k, 7)++; + scatter_access_atomic(k, 8)--; + --scatter_access_atomic(k, 9); + ++scatter_access_atomic(k, 10); + scatter_access(k, 11) -= 3; + } + } + + void validateResults(orig_view_type orig) { + auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig); + Kokkos::fence(); + for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); + ++i) { + auto val0 = host_view(i, 0); + auto val1 = host_view(i, 1); + auto val2 = host_view(i, 2); + auto val3 = host_view(i, 3); + auto val4 = host_view(i, 4); + auto val5 = host_view(i, 
5); + auto val6 = host_view(i, 6); + auto val7 = host_view(i, 7); + auto val8 = host_view(i, 8); + auto val9 = host_view(i, 9); + auto val10 = host_view(i, 10); + auto val11 = host_view(i, 11); + EXPECT_NEAR(val0, NumberType(80), 1e-14); + EXPECT_NEAR(val1, NumberType(20), 1e-14); + EXPECT_NEAR(val2, NumberType(-20), 1e-14); + EXPECT_NEAR(val3, NumberType(20), 1e-14); + EXPECT_NEAR(val4, NumberType(-20), 1e-14); + EXPECT_NEAR(val5, NumberType(-100), 1e-14); + EXPECT_NEAR(val6, NumberType(40), 1e-14); + EXPECT_NEAR(val7, NumberType(20), 1e-14); + EXPECT_NEAR(val8, NumberType(-20), 1e-14); + EXPECT_NEAR(val9, NumberType(-20), 1e-14); + EXPECT_NEAR(val10, NumberType(20), 1e-14); + EXPECT_NEAR(val11, NumberType(-60), 1e-14); + } + } +}; + +template <typename DeviceType, typename Layout, typename Duplication, + typename Contribution, typename NumberType> +struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, + Kokkos::Experimental::ScatterProd, + NumberType> { + public: + using scatter_view_type = + Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType, + Kokkos::Experimental::ScatterProd, + Duplication, Contribution>; + + using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>; + + scatter_view_type scatter_view; + int scatterSize; + + test_scatter_view_impl_cls(const scatter_view_type& view) { + scatter_view = view; + scatterSize = 0; + } + + void initialize(orig_view_type orig) { + auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig); + Kokkos::fence(); + for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); + ++i) { + host_view(i, 0) = 1.0; + host_view(i, 1) = 1.0; + host_view(i, 2) = 1.0; + } + Kokkos::fence(); + Kokkos::deep_copy(orig, host_view); + } + + void run_parallel(int n) { + scatterSize = n; + auto policy = + Kokkos::RangePolicy<typename DeviceType::execution_space, int>(0, n); + Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod"); 
+ } + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + auto scatter_access = scatter_view.access(); + auto scatter_access_atomic = + scatter_view.template access<Kokkos::Experimental::ScatterAtomic>(); + for (int j = 0; j < 4; ++j) { + auto k = (i + j) % scatterSize; + scatter_access(k, 0) *= 4.0; + scatter_access_atomic(k, 1) *= 2.0; + scatter_access(k, 2) *= 1.0; + } + } + + void validateResults(orig_view_type orig) { + auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig); + Kokkos::fence(); + for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); + ++i) { + auto val0 = host_view(i, 0); + auto val1 = host_view(i, 1); + auto val2 = host_view(i, 2); + EXPECT_TRUE(std::fabs((val0 - 65536.0) / 65536.0) < 1e-14); + EXPECT_TRUE(std::fabs((val1 - 256.0) / 256.0) < 1e-14); + EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14); + } + } +}; + +template <typename DeviceType, typename Layout, typename Duplication, + typename Contribution, typename NumberType> +struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, + Kokkos::Experimental::ScatterMin, + NumberType> { + public: + using scatter_view_type = + Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType, + Kokkos::Experimental::ScatterMin, + Duplication, Contribution>; + + using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>; + + scatter_view_type scatter_view; + int scatterSize; + + test_scatter_view_impl_cls(const scatter_view_type& view) { + scatter_view = view; + scatterSize = 0; + } + + void initialize(orig_view_type orig) { + auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig); + Kokkos::fence(); + for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); + ++i) { + host_view(i, 0) = 999999.0; + host_view(i, 1) = 999999.0; + host_view(i, 2) = 999999.0; + } + Kokkos::fence(); + Kokkos::deep_copy(orig, host_view); + } + + void run_parallel(int n) 
{ + scatterSize = n; + auto policy = + Kokkos::RangePolicy<typename DeviceType::execution_space, int>(0, n); + Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod"); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + auto scatter_access = scatter_view.access(); + auto scatter_access_atomic = + scatter_view.template access<Kokkos::Experimental::ScatterAtomic>(); + for (int j = 0; j < 4; ++j) { + auto k = (i + j) % scatterSize; + scatter_access(k, 0).update((NumberType)(j + 1) * 4); + scatter_access_atomic(k, 1).update((NumberType)(j + 1) * 2.0); + scatter_access(k, 2).update((NumberType)(j + 1) * 1.0); + } + } + + void validateResults(orig_view_type orig) { + auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig); + Kokkos::fence(); + for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); + ++i) { + auto val0 = host_view(i, 0); + auto val1 = host_view(i, 1); + auto val2 = host_view(i, 2); + EXPECT_TRUE(std::fabs((val0 - 4.0) / 4.0) < 1e-14); + EXPECT_TRUE(std::fabs((val1 - 2.0) / 2.0) < 1e-14); + EXPECT_TRUE(std::fabs((val2 - 1.0) / 1.0) < 1e-14); + } + } +}; + +template <typename DeviceType, typename Layout, typename Duplication, + typename Contribution, typename NumberType> +struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, + Kokkos::Experimental::ScatterMax, + NumberType> { + public: + using scatter_view_type = + Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType, + Kokkos::Experimental::ScatterMax, + Duplication, Contribution>; + + using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>; + + scatter_view_type scatter_view; + int scatterSize; + + test_scatter_view_impl_cls(const scatter_view_type& view) { + scatter_view = view; + scatterSize = 0; + } + + void initialize(orig_view_type orig) { + auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig); + Kokkos::fence(); + for (typename 
decltype(host_view)::size_type i = 0; i < host_view.extent(0); + ++i) { + host_view(i, 0) = 0.0; + host_view(i, 1) = 0.0; + host_view(i, 2) = 0.0; + } + Kokkos::fence(); + Kokkos::deep_copy(orig, host_view); + } + + void run_parallel(int n) { + scatterSize = n; + Kokkos::RangePolicy<typename DeviceType::execution_space, int> policy(0, n); + Kokkos::parallel_for(policy, *this, "scatter_view_test: Prod"); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + auto scatter_access = scatter_view.access(); + auto scatter_access_atomic = + scatter_view.template access<Kokkos::Experimental::ScatterAtomic>(); + for (int j = 0; j < 4; ++j) { + auto k = (i + j) % scatterSize; + scatter_access(k, 0).update((NumberType)(j + 1) * 4); + scatter_access_atomic(k, 1).update((NumberType)(j + 1) * 2.0); + scatter_access(k, 2).update((NumberType)(j + 1) * 1.0); + } + } + + void validateResults(orig_view_type orig) { + auto host_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), orig); + Kokkos::fence(); + for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); + ++i) { + auto val0 = host_view(i, 0); + auto val1 = host_view(i, 1); + auto val2 = host_view(i, 2); + EXPECT_TRUE(std::fabs((val0 - 16.0) / 16.0) < 1e-14); + EXPECT_TRUE(std::fabs((val1 - 8.0) / 8.0) < 1e-14); + EXPECT_TRUE(std::fabs((val2 - 4.0) / 4.0) < 1e-14); + } + } +}; + +template <typename DeviceType, typename Layout, typename Op, + typename NumberType> +struct test_default_scatter_view { + public: + using default_duplication = Kokkos::Impl::Experimental::DefaultDuplication< + typename DeviceType::execution_space>; + using Duplication = typename default_duplication::type; + using Contribution = typename Kokkos::Impl::Experimental::DefaultContribution< + typename DeviceType::execution_space, Duplication>::type; + using scatter_view_def = + typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication, + Contribution, Op, + NumberType>::scatter_view_type; + using 
orig_view_def = + typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication, + Contribution, Op, + NumberType>::orig_view_type; + + void run_test(int n) { + // Test creation via create_scatter_view overload 1 + { + orig_view_def original_view("original_view", n); + scatter_view_def scatter_view = + Kokkos::Experimental::create_scatter_view(Op{}, original_view); + + test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, + Op, NumberType> + scatter_view_test_impl(scatter_view); + scatter_view_test_impl.initialize(original_view); + scatter_view_test_impl.run_parallel(n); + + Kokkos::Experimental::contribute(original_view, scatter_view); + scatter_view.reset_except(original_view); + + scatter_view_test_impl.run_parallel(n); + + Kokkos::Experimental::contribute(original_view, scatter_view); + Kokkos::fence(); + + scatter_view_test_impl.validateResults(original_view); + + { + scatter_view_def persistent_view("persistent", n); + auto result_view = persistent_view.subview(); + contribute(result_view, persistent_view); + Kokkos::fence(); + } + } + } +}; + +template <typename DeviceType, typename Layout, typename Duplication, + typename Contribution, typename Op, typename NumberType> +struct test_scatter_view_config { + public: + using scatter_view_def = + typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication, + Contribution, Op, + NumberType>::scatter_view_type; + using orig_view_def = + typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication, + Contribution, Op, + NumberType>::orig_view_type; + + void compile_constructor() { + auto sv = scatter_view_def(Kokkos::view_alloc(DeviceType{}, "label"), 10); + } + + void run_test(int n) { + // test allocation + { + orig_view_def ov1("ov1", n); + scatter_view_def sv1; + + ASSERT_FALSE(sv1.is_allocated()); + + sv1 = Kokkos::Experimental::create_scatter_view<Op, Duplication, + Contribution>(ov1); + + scatter_view_def sv2(sv1); + scatter_view_def sv3("sv3", n); + + 
ASSERT_TRUE(sv1.is_allocated()); + ASSERT_TRUE(sv2.is_allocated()); + ASSERT_TRUE(sv3.is_allocated()); + } + + // Test creation via create_scatter_view + { + orig_view_def original_view("original_view", n); + scatter_view_def scatter_view = Kokkos::Experimental::create_scatter_view< + Op, Duplication, Contribution>(original_view); + + test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, + Op, NumberType> + scatter_view_test_impl(scatter_view); + scatter_view_test_impl.initialize(original_view); + scatter_view_test_impl.run_parallel(n); + + Kokkos::Experimental::contribute(original_view, scatter_view); + scatter_view.reset_except(original_view); + + scatter_view_test_impl.run_parallel(n); + + Kokkos::Experimental::contribute(original_view, scatter_view); + Kokkos::fence(); + + scatter_view_test_impl.validateResults(original_view); + + { + scatter_view_def persistent_view("persistent", n); + auto result_view = persistent_view.subview(); + contribute(result_view, persistent_view); + Kokkos::fence(); + } + } + // Test creation via create_scatter_view overload 2 + { + orig_view_def original_view("original_view", n); + scatter_view_def scatter_view = Kokkos::Experimental::create_scatter_view( + Op{}, Duplication{}, Contribution{}, original_view); + + test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, + Op, NumberType> + scatter_view_test_impl(scatter_view); + scatter_view_test_impl.initialize(original_view); + scatter_view_test_impl.run_parallel(n); + + Kokkos::Experimental::contribute(original_view, scatter_view); + scatter_view.reset_except(original_view); + + scatter_view_test_impl.run_parallel(n); + + Kokkos::Experimental::contribute(original_view, scatter_view); + Kokkos::fence(); + + scatter_view_test_impl.validateResults(original_view); + + { + scatter_view_def persistent_view("persistent", n); + auto result_view = persistent_view.subview(); + contribute(result_view, persistent_view); + Kokkos::fence(); + } + } + // Test 
creation via constructor + { + orig_view_def original_view("original_view", n); + scatter_view_def scatter_view(original_view); + + test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution, + Op, NumberType> + scatter_view_test_impl(scatter_view); + scatter_view_test_impl.initialize(original_view); + scatter_view_test_impl.run_parallel(n); + + Kokkos::Experimental::contribute(original_view, scatter_view); + scatter_view.reset_except(original_view); + + scatter_view_test_impl.run_parallel(n); + + Kokkos::Experimental::contribute(original_view, scatter_view); + Kokkos::fence(); + + scatter_view_test_impl.validateResults(original_view); + + { + scatter_view_def persistent_view("persistent", n); + auto result_view = persistent_view.subview(); + contribute(result_view, persistent_view); + Kokkos::fence(); + } + } + } +}; + +template <typename DeviceType, typename ScatterType, typename NumberType> +struct TestDuplicatedScatterView { + TestDuplicatedScatterView(int n) { + // ScatterSum test + test_scatter_view_config<DeviceType, Kokkos::LayoutRight, + Kokkos::Experimental::ScatterDuplicated, + Kokkos::Experimental::ScatterNonAtomic, + ScatterType, NumberType> + test_sv_right_config; + test_sv_right_config.run_test(n); + test_scatter_view_config< + DeviceType, Kokkos::LayoutLeft, Kokkos::Experimental::ScatterDuplicated, + Kokkos::Experimental::ScatterNonAtomic, ScatterType, NumberType> + test_sv_left_config; + test_sv_left_config.run_test(n); + } +}; + +#ifdef KOKKOS_ENABLE_CUDA +// disable duplicated instantiation with CUDA until +// UniqueToken can support it +template <typename ScatterType, typename NumberType> +struct TestDuplicatedScatterView<Kokkos::Cuda, ScatterType, NumberType> { + TestDuplicatedScatterView(int) {} +}; +template <typename ScatterType, typename NumberType> +struct TestDuplicatedScatterView< + Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>, ScatterType, NumberType> { + TestDuplicatedScatterView(int) {} +}; +template <typename 
ScatterType, typename NumberType> +struct TestDuplicatedScatterView< + Kokkos::Device<Kokkos::Cuda, Kokkos::CudaUVMSpace>, ScatterType, + NumberType> { + TestDuplicatedScatterView(int) {} +}; +#endif + +template <typename DeviceType, typename ScatterType, + typename NumberType = double> +void test_scatter_view(int64_t n) { + using execution_space = typename DeviceType::execution_space; + + // no atomics or duplication is only sensible if the execution space + // is running essentially in serial (doesn't have to be Serial though, + // we also test OpenMP with one thread: LAMMPS cares about that) + if (execution_space().concurrency() == 1) { + test_scatter_view_config<DeviceType, Kokkos::LayoutRight, + Kokkos::Experimental::ScatterNonDuplicated, + Kokkos::Experimental::ScatterNonAtomic, + ScatterType, NumberType> + test_sv_config; + test_sv_config.run_test(n); + } +#ifdef KOKKOS_ENABLE_SERIAL + if (!std::is_same<DeviceType, Kokkos::Serial>::value) { +#endif + test_scatter_view_config<DeviceType, Kokkos::LayoutRight, + Kokkos::Experimental::ScatterNonDuplicated, + Kokkos::Experimental::ScatterAtomic, ScatterType, + NumberType> + test_sv_config; + test_sv_config.run_test(n); +#ifdef KOKKOS_ENABLE_SERIAL + } +#endif + // with hundreds of threads we were running out of memory. 
+ // limit (n) so that duplication doesn't exceed 4GB + constexpr std::size_t maximum_allowed_total_bytes = + 4ull * 1024ull * 1024ull * 1024ull; + std::size_t const maximum_allowed_copy_bytes = + maximum_allowed_total_bytes / + std::size_t(execution_space().concurrency()); + constexpr std::size_t bytes_per_value = sizeof(NumberType) * 12; + std::size_t const maximum_allowed_copy_values = + maximum_allowed_copy_bytes / bytes_per_value; + n = std::min(n, int64_t(maximum_allowed_copy_values)); + + // if the default is duplicated, this needs to follow the limit + { + test_default_scatter_view<DeviceType, Kokkos::LayoutRight, ScatterType, + NumberType> + test_default_sv; + test_default_sv.run_test(n); + } + TestDuplicatedScatterView<DeviceType, ScatterType, NumberType> duptest(n); +} + +TEST(TEST_CATEGORY, scatterview) { + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum, double>( + 10); + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum, + unsigned int>(10); + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterProd>(10); + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMin>(10); + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMax>(10); + // tests were timing out in DEBUG mode, reduce the amount of work +#ifdef KOKKOS_ENABLE_DEBUG + int big_n = 100 * 1000; +#else + +#ifdef KOKKOS_ENABLE_SERIAL + bool is_serial = std::is_same<TEST_EXECSPACE, Kokkos::Serial>::value; + int big_n = is_serial ? 
100 * 1000 : 10000 * 1000; +#else + int big_n = 10000 * 1000; +#endif + +#endif + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum, double>( + big_n); + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum, + unsigned int>(big_n); + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterProd>(big_n); + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMin>(big_n); + test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMax>(big_n); +} + +TEST(TEST_CATEGORY, scatterview_devicetype) { + using device_type = + Kokkos::Device<TEST_EXECSPACE, typename TEST_EXECSPACE::memory_space>; + + test_scatter_view<device_type, Kokkos::Experimental::ScatterSum, double>(10); + test_scatter_view<device_type, Kokkos::Experimental::ScatterSum, + unsigned int>(10); + test_scatter_view<device_type, Kokkos::Experimental::ScatterProd>(10); + test_scatter_view<device_type, Kokkos::Experimental::ScatterMin>(10); + test_scatter_view<device_type, Kokkos::Experimental::ScatterMax>(10); + +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) +#ifdef KOKKOS_ENABLE_CUDA + using device_execution_space = Kokkos::Cuda; + using device_memory_space = Kokkos::CudaSpace; + using host_accessible_space = Kokkos::CudaUVMSpace; +#else + using device_execution_space = Kokkos::Experimental::HIP; + using device_memory_space = Kokkos::Experimental::HIPSpace; + using host_accessible_space = Kokkos::Experimental::HIPHostPinnedSpace; +#endif + if (std::is_same<TEST_EXECSPACE, device_execution_space>::value) { + using device_device_type = + Kokkos::Device<device_execution_space, device_memory_space>; + test_scatter_view<device_device_type, Kokkos::Experimental::ScatterSum, + double>(10); + test_scatter_view<device_device_type, Kokkos::Experimental::ScatterSum, + unsigned int>(10); + test_scatter_view<device_device_type, Kokkos::Experimental::ScatterProd>( + 10); + test_scatter_view<device_device_type, Kokkos::Experimental::ScatterMin>(10); + 
test_scatter_view<device_device_type, Kokkos::Experimental::ScatterMax>(10); + using host_device_type = + Kokkos::Device<device_execution_space, host_accessible_space>; + test_scatter_view<host_device_type, Kokkos::Experimental::ScatterSum, + double>(10); + test_scatter_view<host_device_type, Kokkos::Experimental::ScatterSum, + unsigned int>(10); + test_scatter_view<host_device_type, Kokkos::Experimental::ScatterProd>(10); + test_scatter_view<host_device_type, Kokkos::Experimental::ScatterMin>(10); + test_scatter_view<host_device_type, Kokkos::Experimental::ScatterMax>(10); + } +#endif +} + +} // namespace Test + +#endif // KOKKOS_TEST_SCATTER_VIEW_HPP diff --git a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a9a178f95e7b7fedabcb7b00b292d88603ff3f77 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp @@ -0,0 +1,303 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <vector> + +#include <Kokkos_StaticCrsGraph.hpp> +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ +namespace Test { +namespace TestStaticCrsGraph { + +template <class Space> +void run_test_graph() { + using dView = Kokkos::StaticCrsGraph<unsigned, Space>; + using hView = typename dView::HostMirror; + + const unsigned LENGTH = 1000; + + std::vector<std::vector<int> > graph(LENGTH); + + for (size_t i = 0; i < LENGTH; ++i) { + graph[i].reserve(8); + for (size_t j = 0; j < 8; ++j) { + graph[i].push_back(i + j * 3); + } + } + + { + dView d1; + ASSERT_FALSE(d1.is_allocated()); + + d1 = Kokkos::create_staticcrsgraph<dView>("d1", graph); + + dView d2(d1); + dView d3(d1.entries, d1.row_map); + + ASSERT_TRUE(d1.is_allocated()); + ASSERT_TRUE(d2.is_allocated()); + 
ASSERT_TRUE(d3.is_allocated()); + } + + dView dx; + hView hx; + + dx = Kokkos::create_staticcrsgraph<dView>("dx", graph); + hx = Kokkos::create_mirror(dx); + + ASSERT_EQ(hx.row_map.extent(0) - 1, LENGTH); + + for (size_t i = 0; i < LENGTH; ++i) { + const size_t begin = hx.row_map[i]; + const size_t n = hx.row_map[i + 1] - begin; + ASSERT_EQ(n, graph[i].size()); + for (size_t j = 0; j < n; ++j) { + ASSERT_EQ((int)hx.entries(j + begin), graph[i][j]); + } + } + + // Test row view access + for (size_t i = 0; i < LENGTH; ++i) { + auto rowView = hx.rowConst(i); + ASSERT_EQ(rowView.length, graph[i].size()); + for (size_t j = 0; j < rowView.length; ++j) { + ASSERT_EQ(rowView.colidx(j), graph[i][j]); + ASSERT_EQ(rowView(j), graph[i][j]); + } + } +} + +template <class Space> +void run_test_graph2() { + using dView = Kokkos::StaticCrsGraph<unsigned[3], Space>; + using hView = typename dView::HostMirror; + + const unsigned LENGTH = 10; + + std::vector<size_t> sizes(LENGTH); + + size_t total_length = 0; + + for (size_t i = 0; i < LENGTH; ++i) { + total_length += (sizes[i] = 6 + i % 4); + } + + dView dx = Kokkos::create_staticcrsgraph<dView>("test", sizes); + hView hx = Kokkos::create_mirror(dx); + hView mx = Kokkos::create_mirror(dx); + + ASSERT_EQ((size_t)dx.row_map.extent(0), (size_t)LENGTH + 1); + ASSERT_EQ((size_t)hx.row_map.extent(0), (size_t)LENGTH + 1); + ASSERT_EQ((size_t)mx.row_map.extent(0), (size_t)LENGTH + 1); + + ASSERT_EQ((size_t)dx.entries.extent(0), (size_t)total_length); + ASSERT_EQ((size_t)hx.entries.extent(0), (size_t)total_length); + ASSERT_EQ((size_t)mx.entries.extent(0), (size_t)total_length); + + ASSERT_EQ((size_t)dx.entries.extent(1), (size_t)3); + ASSERT_EQ((size_t)hx.entries.extent(1), (size_t)3); + ASSERT_EQ((size_t)mx.entries.extent(1), (size_t)3); + + for (size_t i = 0; i < LENGTH; ++i) { + const size_t entry_begin = hx.row_map[i]; + const size_t entry_end = hx.row_map[i + 1]; + for (size_t j = entry_begin; j < entry_end; ++j) { + hx.entries(j, 0) = 
j + 1; + hx.entries(j, 1) = j + 2; + hx.entries(j, 2) = j + 3; + } + } + + Kokkos::deep_copy(dx.entries, hx.entries); + Kokkos::deep_copy(mx.entries, dx.entries); + + ASSERT_EQ(mx.row_map.extent(0), (size_t)LENGTH + 1); + + for (size_t i = 0; i < LENGTH; ++i) { + const size_t entry_begin = mx.row_map[i]; + const size_t entry_end = mx.row_map[i + 1]; + ASSERT_EQ((entry_end - entry_begin), sizes[i]); + for (size_t j = entry_begin; j < entry_end; ++j) { + ASSERT_EQ((size_t)mx.entries(j, 0), (j + 1)); + ASSERT_EQ((size_t)mx.entries(j, 1), (j + 2)); + ASSERT_EQ((size_t)mx.entries(j, 2), (j + 3)); + } + } +} + +template <class Space> +void run_test_graph3(size_t B, size_t N) { + srand(10310); + + using dView = Kokkos::StaticCrsGraph<int, Space>; + using hView = typename dView::HostMirror; + + const unsigned LENGTH = 2000; + + std::vector<size_t> sizes(LENGTH); + + size_t total_length = 0; + + for (size_t i = 0; i < LENGTH; ++i) { + sizes[i] = rand() % 1000; + } + + sizes[1] = N; + sizes[1998] = N; + + for (size_t i = 0; i < LENGTH; ++i) { + total_length += sizes[i]; + } + + int C = 0; + dView dx = Kokkos::create_staticcrsgraph<dView>("test", sizes); + dx.create_block_partitioning(B, C); + hView hx = Kokkos::create_mirror(dx); + + for (size_t i = 0; i < B; i++) { + size_t ne = 0; + for (auto j = hx.row_block_offsets(i); j < hx.row_block_offsets(i + 1); j++) + ne += hx.row_map(j + 1) - hx.row_map(j) + C; + + ASSERT_FALSE( + (ne > 2 * ((hx.row_map(hx.numRows()) + C * hx.numRows()) / B)) && + (hx.row_block_offsets(i + 1) > hx.row_block_offsets(i) + 1)); + } +} + +template <class Space> +void run_test_graph4() { + using ordinal_type = unsigned int; + using layout_type = Kokkos::LayoutRight; + using space_type = Space; + using memory_traits_type = Kokkos::MemoryUnmanaged; + using dView = Kokkos::StaticCrsGraph<ordinal_type, layout_type, space_type, + memory_traits_type>; + using hView = typename dView::HostMirror; + + dView dx; + + // StaticCrsGraph with Unmanaged trait will 
contain row_map and entries + // members with the Unmanaged memory trait. Use of such a StaticCrsGraph + // requires an allocaton of memory for the unmanaged views to wrap. + // + // In this test, a graph (via raw arrays) resides on the host. + // The pointers are wrapped by unmanaged Views. + // To make use of this on the device, managed device Views are created + // (allocation required), and data from the unmanaged host views is deep + // copied to the device Views Unmanaged views of the appropriate type wrap the + // device data and are assigned to their corresponding unmanaged view members + // of the unmanaged StaticCrsGraph + + // Data types for raw pointers storing StaticCrsGraph info + using ptr_row_map_type = typename dView::size_type; + using ptr_entries_type = typename dView::data_type; + + const ordinal_type numRows = 8; + const ordinal_type nnz = 24; + ptr_row_map_type ptrRaw[] = {0, 4, 8, 10, 12, 14, 16, 20, 24}; + ptr_entries_type indRaw[] = {0, 1, 4, 5, 0, 1, 4, 5, 2, 3, 2, 3, + 4, 5, 4, 5, 2, 3, 6, 7, 2, 3, 6, 7}; + + // Wrap pointers in unmanaged host views + using local_row_map_type = typename hView::row_map_type; + using local_entries_type = typename hView::entries_type; + local_row_map_type unman_row_map(&(ptrRaw[0]), numRows + 1); + local_entries_type unman_entries(&(indRaw[0]), nnz); + + hView hx; + hx = hView(unman_entries, unman_row_map); + + // Create the device Views for copying the host arrays into + // An allocation is needed on the device for the unmanaged StaticCrsGraph to + // wrap the pointer + using d_row_map_view_type = + typename Kokkos::View<ptr_row_map_type*, layout_type, space_type>; + using d_entries_view_type = + typename Kokkos::View<ptr_entries_type*, layout_type, space_type>; + + d_row_map_view_type tmp_row_map("tmp_row_map", numRows + 1); + d_entries_view_type tmp_entries("tmp_entries", nnz); + + Kokkos::deep_copy(tmp_row_map, unman_row_map); + Kokkos::deep_copy(tmp_entries, unman_entries); + + // Wrap the pointer in 
unmanaged View and assign to the corresponding + // StaticCrsGraph member + dx.row_map = typename dView::row_map_type(tmp_row_map.data(), numRows + 1); + dx.entries = typename dView::entries_type(tmp_entries.data(), nnz); + + ASSERT_TRUE((std::is_same<typename dView::row_map_type::memory_traits, + Kokkos::MemoryUnmanaged>::value)); + ASSERT_TRUE((std::is_same<typename dView::entries_type::memory_traits, + Kokkos::MemoryUnmanaged>::value)); + ASSERT_TRUE((std::is_same<typename hView::row_map_type::memory_traits, + Kokkos::MemoryUnmanaged>::value)); + ASSERT_TRUE((std::is_same<typename hView::entries_type::memory_traits, + Kokkos::MemoryUnmanaged>::value)); +} + +} /* namespace TestStaticCrsGraph */ + +TEST(TEST_CATEGORY, staticcrsgraph) { + TestStaticCrsGraph::run_test_graph<TEST_EXECSPACE>(); + TestStaticCrsGraph::run_test_graph2<TEST_EXECSPACE>(); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(1, 0); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(1, 1000); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(1, 10000); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(1, 100000); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(3, 0); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(3, 1000); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(3, 10000); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(3, 100000); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(75, 0); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(75, 1000); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(75, 10000); + TestStaticCrsGraph::run_test_graph3<TEST_EXECSPACE>(75, 100000); + TestStaticCrsGraph::run_test_graph4<TEST_EXECSPACE>(); +} +} // namespace Test diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4413cfbc80e31271d1e2b830976796ade24aaa9a --- /dev/null +++ 
b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -0,0 +1,334 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_UNORDERED_MAP_HPP +#define KOKKOS_TEST_UNORDERED_MAP_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <Kokkos_UnorderedMap.hpp> + +namespace Test { + +namespace Impl { + +template <typename MapType, bool Near = false> +struct TestInsert { + using map_type = MapType; + using execution_space = typename map_type::execution_space; + using value_type = uint32_t; + + map_type map; + uint32_t inserts; + uint32_t collisions; + + TestInsert(map_type arg_map, uint32_t arg_inserts, uint32_t arg_collisions) + : map(arg_map), inserts(arg_inserts), collisions(arg_collisions) {} + + void testit(bool rehash_on_fail = true) { + execution_space().fence(); + + uint32_t failed_count = 0; + do { + failed_count = 0; + Kokkos::parallel_reduce(inserts, *this, failed_count); + + if (rehash_on_fail && failed_count > 0u) { + const uint32_t new_capacity = map.capacity() + + ((map.capacity() * 3ull) / 20u) + + failed_count / collisions; + map.rehash(new_capacity); + } + } while (rehash_on_fail && failed_count > 0u); + + execution_space().fence(); + } + + KOKKOS_INLINE_FUNCTION + void init(value_type &failed_count) const { failed_count = 0; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type &failed_count, + const volatile value_type &count) const { + failed_count += count; + } + + KOKKOS_INLINE_FUNCTION + void operator()(uint32_t i, value_type &failed_count) const { + const uint32_t key = Near ? 
i / collisions : i % (inserts / collisions); + if (map.insert(key, i).failed()) ++failed_count; + } +}; + +template <typename MapType, bool Near> +struct TestErase { + using self_type = TestErase<MapType, Near>; + + using map_type = MapType; + using execution_space = typename MapType::execution_space; + + map_type m_map; + uint32_t m_num_erase; + uint32_t m_num_duplicates; + + TestErase(map_type map, uint32_t num_erases, uint32_t num_duplicates) + : m_map(map), m_num_erase(num_erases), m_num_duplicates(num_duplicates) {} + + void testit() { + execution_space().fence(); + Kokkos::parallel_for(m_num_erase, *this); + execution_space().fence(); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename execution_space::size_type i) const { + if (Near) { + m_map.erase(i / m_num_duplicates); + } else { + m_map.erase(i % (m_num_erase / m_num_duplicates)); + } + } +}; + +template <typename MapType> +struct TestFind { + using map_type = MapType; + using execution_space = typename MapType::execution_space::execution_space; + using value_type = uint32_t; + + map_type m_map; + uint32_t m_num_insert; + uint32_t m_num_duplicates; + uint32_t m_max_key; + + TestFind(map_type map, uint32_t num_inserts, uint32_t num_duplicates) + : m_map(map), + m_num_insert(num_inserts), + m_num_duplicates(num_duplicates), + m_max_key(((num_inserts + num_duplicates) - 1) / num_duplicates) {} + + void testit(value_type &errors) { + execution_space().fence(); + Kokkos::parallel_reduce(m_map.capacity(), *this, errors); + execution_space().fence(); + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type &dst) { dst = 0; } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &dst, const volatile value_type &src) { + dst += src; + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename execution_space::size_type i, + value_type &errors) const { + const bool expect_to_find_i = + (i < typename execution_space::size_type(m_max_key)); + + const bool exists = m_map.exists(i); + + if 
(expect_to_find_i && !exists) ++errors; + if (!expect_to_find_i && exists) ++errors; + } +}; + +} // namespace Impl + +// MSVC reports a syntax error for this test. +// WORKAROUND MSVC +#ifndef _WIN32 +template <typename Device> +void test_insert(uint32_t num_nodes, uint32_t num_inserts, + uint32_t num_duplicates, bool near) { + using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, Device>; + using const_map_type = + Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>; + + const uint32_t expected_inserts = + (num_inserts + num_duplicates - 1u) / num_duplicates; + + map_type map; + map.rehash(num_nodes, false); + + if (near) { + Impl::TestInsert<map_type, true> test_insert(map, num_inserts, + num_duplicates); + test_insert.testit(); + } else { + Impl::TestInsert<map_type, false> test_insert(map, num_inserts, + num_duplicates); + test_insert.testit(); + } + + const bool print_list = false; + if (print_list) { + Kokkos::Impl::UnorderedMapPrint<map_type> f(map); + f.apply(); + } + + const uint32_t map_size = map.size(); + + ASSERT_FALSE(map.failed_insert()); + { + EXPECT_EQ(expected_inserts, map_size); + + { + uint32_t find_errors = 0; + Impl::TestFind<const_map_type> test_find(map, num_inserts, + num_duplicates); + test_find.testit(find_errors); + EXPECT_EQ(0u, find_errors); + } + + map.begin_erase(); + Impl::TestErase<map_type, false> test_erase(map, num_inserts, + num_duplicates); + test_erase.testit(); + map.end_erase(); + EXPECT_EQ(0u, map.size()); + } +} +#endif + +template <typename Device> +void test_failed_insert(uint32_t num_nodes) { + using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, Device>; + + map_type map(num_nodes); + Impl::TestInsert<map_type> test_insert(map, 2u * num_nodes, 1u); + test_insert.testit(false /*don't rehash on fail*/); + typename Device::execution_space().fence(); + + EXPECT_TRUE(map.failed_insert()); +} + +template <typename Device> +void test_deep_copy(uint32_t num_nodes) { + using map_type = 
Kokkos::UnorderedMap<uint32_t, uint32_t, Device>; + using const_map_type = + Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>; + + using host_map_type = typename map_type::HostMirror; + + map_type map; + map.rehash(num_nodes, false); + + { + Impl::TestInsert<map_type> test_insert(map, num_nodes, 1); + test_insert.testit(); + ASSERT_EQ(map.size(), num_nodes); + ASSERT_FALSE(map.failed_insert()); + { + uint32_t find_errors = 0; + Impl::TestFind<map_type> test_find(map, num_nodes, 1); + test_find.testit(find_errors); + EXPECT_EQ(find_errors, 0u); + } + } + + host_map_type hmap; + Kokkos::deep_copy(hmap, map); + + ASSERT_EQ(map.size(), hmap.size()); + ASSERT_EQ(map.capacity(), hmap.capacity()); + { + uint32_t find_errors = 0; + Impl::TestFind<host_map_type> test_find(hmap, num_nodes, 1); + test_find.testit(find_errors); + EXPECT_EQ(find_errors, 0u); + } + + map_type mmap; + Kokkos::deep_copy(mmap, hmap); + + const_map_type cmap = mmap; + + EXPECT_EQ(cmap.size(), num_nodes); + + { + uint32_t find_errors = 0; + Impl::TestFind<const_map_type> test_find(cmap, num_nodes, 1); + test_find.testit(find_errors); + EXPECT_EQ(find_errors, 0u); + } +} + +// FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs +// FIXME_HIP +// WORKAROUND MSVC +#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \ + !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL) +TEST(TEST_CATEGORY, UnorderedMap_insert) { + for (int i = 0; i < 500; ++i) { + test_insert<TEST_EXECSPACE>(100000, 90000, 100, true); + test_insert<TEST_EXECSPACE>(100000, 90000, 100, false); + } +} +#endif + +TEST(TEST_CATEGORY, UnorderedMap_failed_insert) { + for (int i = 0; i < 1000; ++i) test_failed_insert<TEST_EXECSPACE>(10000); +} + +TEST(TEST_CATEGORY, UnorderedMap_deep_copy) { + for (int i = 0; i < 2; ++i) test_deep_copy<TEST_EXECSPACE>(10000); +} + +TEST(TEST_CATEGORY, UnorderedMap_valid_empty) { + using Key = int; + using Value = int; + using Map = Kokkos::UnorderedMap<Key, Value, 
TEST_EXECSPACE>; + + Map m{}; + Map n{}; + n = Map{m.capacity()}; + n.rehash(m.capacity()); + Kokkos::deep_copy(n, m); + ASSERT_TRUE(m.is_allocated()); + ASSERT_TRUE(n.is_allocated()); +} + +} // namespace Test + +#endif // KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/packages/kokkos/containers/unit_tests/TestVector.hpp b/packages/kokkos/containers/unit_tests/TestVector.hpp new file mode 100644 index 0000000000000000000000000000000000000000..33b265e0774aa4eae38fd62e47d8e9b59864572a --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestVector.hpp @@ -0,0 +1,286 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_VECTOR_HPP +#define KOKKOS_TEST_VECTOR_HPP + +#include <gtest/gtest.h> +#include <iostream> +#include <cstdlib> +#include <cstdio> +#include <Kokkos_Vector.hpp> + +namespace Test { + +namespace Impl { + +template <typename Scalar, class Device> +struct test_vector_insert { + using scalar_type = Scalar; + using execution_space = Device; + + template <typename Vector> + void run_test(Vector& a) { + int n = a.size(); + + auto it = a.begin(); + if (n > 0) { + ASSERT_EQ(a.data(), &a[0]); + } + it += 15; + ASSERT_EQ(*it, scalar_type(1)); + + auto it_return = a.insert(it, scalar_type(3)); + ASSERT_EQ(a.size(), n + 1); + ASSERT_EQ(std::distance(it_return, a.begin() + 15), 0); + + it = a.begin(); + it += 17; +// Looks like some std::vector implementations do not have the restriction +// right on the overload taking three iterators, and thus the following call +// will hit that overload and then fail to compile. +#if defined(KOKKOS_COMPILER_INTEL) +// And at least GCC 4.8.4 doesn't implement vector insert correct for C++11 +// Return type is void ... 
+#if (__GNUC__ < 5) + a.insert(it, typename Vector::size_type(n + 5), scalar_type(5)); + it_return = a.begin() + 17; +#else + it_return = a.insert(it, typename Vector::size_type(n + 5), scalar_type(5)); +#endif +#else +#if (__GNUC__ < 5) + a.insert(it, n + 5, scalar_type(5)); + it_return = a.begin() + 17; +#else + it_return = a.insert(it, n + 5, scalar_type(5)); +#endif +#endif + + ASSERT_EQ(a.size(), n + 1 + n + 5); + ASSERT_EQ(std::distance(it_return, a.begin() + 17), 0); + + Vector b; + +// Looks like some std::vector implementations do not have the restriction +// right on the overload taking three iterators, and thus the following call +// will hit that overload and then fail to compile. +#if defined(KOKKOS_COMPILER_INTEL) + b.insert(b.begin(), typename Vector::size_type(7), 9); +#else + b.insert(b.begin(), 7, 9); +#endif + ASSERT_EQ(b.size(), 7); + ASSERT_EQ(b[0], scalar_type(9)); + + it = a.begin(); + it += 27 + n; +#if (__GNUC__ < 5) + a.insert(it, b.begin(), b.end()); + it_return = a.begin() + (27 + n); +#else + it_return = a.insert(it, b.begin(), b.end()); +#endif + ASSERT_EQ(a.size(), n + 1 + n + 5 + 7); + ASSERT_EQ(std::distance(it_return, a.begin() + 27 + n), 0); + + // Testing insert at end via all three function interfaces + a.insert(a.end(), 11); +#if defined(KOKKOS_COMPILER_INTEL) + a.insert(a.end(), typename Vector::size_type(2), 12); +#else + a.insert(a.end(), 2, 12); +#endif + a.insert(a.end(), b.begin(), b.end()); + } + + template <typename Vector> + void check_test(Vector& a, int n) { + for (int i = 0; i < (int)a.size(); i++) { + if (i == 15) + ASSERT_EQ(a[i], scalar_type(3)); + else if (i > 16 && i < 16 + 6 + n) + ASSERT_EQ(a[i], scalar_type(5)); + else if (i > 26 + n && i < 34 + n) + ASSERT_EQ(a[i], scalar_type(9)); + else if (i == (int)a.size() - 10) + ASSERT_EQ(a[i], scalar_type(11)); + else if ((i == (int)a.size() - 9) || (i == (int)a.size() - 8)) + ASSERT_EQ(a[i], scalar_type(12)); + else if (i > (int)a.size() - 8) + ASSERT_EQ(a[i], 
scalar_type(9)); + else + ASSERT_EQ(a[i], scalar_type(1)); + } + } + + test_vector_insert(unsigned int size) { + { + std::vector<Scalar> a(size, scalar_type(1)); + run_test(a); + check_test(a, size); + } + { + Kokkos::vector<Scalar, Device> a(size, scalar_type(1)); + a.sync_device(); + run_test(a); + a.sync_host(); + check_test(a, size); + } + { + Kokkos::vector<Scalar, Device> a(size, scalar_type(1)); + a.sync_host(); + run_test(a); + check_test(a, size); + } + } +}; + +template <typename Scalar, class Device> +struct test_vector_allocate { + using self_type = test_vector_allocate<Scalar, Device>; + + using scalar_type = Scalar; + using execution_space = Device; + + bool result = false; + + template <typename Vector> + Scalar run_me(unsigned int n) { + { + Vector v1; + if (v1.is_allocated() == true) return false; + + v1 = Vector(n, 1); + Vector v2(v1); + Vector v3(n, 1); + + if (v1.is_allocated() == false) return false; + if (v2.is_allocated() == false) return false; + if (v3.is_allocated() == false) return false; + } + return true; + } + + test_vector_allocate(unsigned int size) { + result = run_me<Kokkos::vector<Scalar, Device> >(size); + } +}; + +template <typename Scalar, class Device> +struct test_vector_combinations { + using self_type = test_vector_combinations<Scalar, Device>; + + using scalar_type = Scalar; + using execution_space = Device; + + Scalar reference; + Scalar result; + + template <typename Vector> + Scalar run_me(unsigned int n) { + Vector a(n, 1); + + a.push_back(2); + a.resize(n + 4); + a[n + 1] = 3; + a[n + 2] = 4; + a[n + 3] = 5; + + Scalar temp1 = a[2]; + Scalar temp2 = a[n]; + Scalar temp3 = a[n + 1]; + + a.assign(n + 2, -1); + + a[2] = temp1; + a[n] = temp2; + a[n + 1] = temp3; + + Scalar test1 = 0; + for (unsigned int i = 0; i < a.size(); i++) test1 += a[i]; + + a.assign(n + 1, -2); + Scalar test2 = 0; + for (unsigned int i = 0; i < a.size(); i++) test2 += a[i]; + + a.reserve(n + 10); + + Scalar test3 = 0; + for (unsigned int i = 0; i 
< a.size(); i++) test3 += a[i]; + + return (test1 * test2 + test3) * test2 + test1 * test3; + } + + test_vector_combinations(unsigned int size) { + reference = run_me<std::vector<Scalar> >(size); + result = run_me<Kokkos::vector<Scalar, Device> >(size); + } +}; + +} // namespace Impl + +template <typename Scalar, typename Device> +void test_vector_combinations(unsigned int size) { + Impl::test_vector_combinations<Scalar, Device> test(size); + ASSERT_EQ(test.reference, test.result); +} + +template <typename Scalar, typename Device> +void test_vector_allocate(unsigned int size) { + Impl::test_vector_allocate<Scalar, Device> test(size); + ASSERT_TRUE(test.result); +} + +TEST(TEST_CATEGORY, vector_combination) { + test_vector_allocate<int, TEST_EXECSPACE>(10); + test_vector_combinations<int, TEST_EXECSPACE>(10); + test_vector_combinations<int, TEST_EXECSPACE>(3057); +} + +TEST(TEST_CATEGORY, vector_insert) { + Impl::test_vector_insert<int, TEST_EXECSPACE>(3057); +} + +} // namespace Test + +#endif // KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d402160ef4bafb176647bf309b507129265de514 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp @@ -0,0 +1,213 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <Kokkos_DynRankView.hpp> + +#include <type_traits> +#include <typeinfo> + +namespace Test { + +namespace { + +template <typename ExecSpace> +struct TestViewCtorProp_EmbeddedDim { + using ViewIntType = typename Kokkos::View<int**, ExecSpace>; + using ViewDoubleType = typename Kokkos::View<double*, ExecSpace>; + + using DynRankViewIntType = typename Kokkos::DynRankView<int, ExecSpace>; + using DynRankViewDoubleType = typename Kokkos::DynRankView<double, ExecSpace>; + + // Cuda 7.0 has issues with using a lambda in parallel_for to initialize the + // view - replace with this functor + template <class ViewType> + struct Functor { + ViewType v; + + Functor(const ViewType& v_) : v(v_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { v(i) = i; } + }; + + static void test_vcpt(const int N0, const int N1) { + // Create two views to test + { + using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType; + using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType; + + VIT vi1("vi1", N0, N1); + VDT vd1("vd1", N0); + + // TEST: Test for common type between two views, one with type double, + // other with type int Deduce common value_type and construct a view with + // that type + { + // Two views + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1); + using CommonViewValueType = + typename decltype(view_alloc_arg)::value_type; + using CVT = typename Kokkos::View<CommonViewValueType*, ExecSpace>; + using HostCVT = typename CVT::HostMirror; + + // Construct View using the common type; for case of specialization, an + // 'embedded_dim' would be stored by view_alloc_arg + CVT cv1(Kokkos::view_alloc("cv1", view_alloc_arg), N0 * N1); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N0 * N1), + Functor<CVT>(cv1)); + + 
HostCVT hcv1 = Kokkos::create_mirror_view(cv1); + Kokkos::deep_copy(hcv1, cv1); + + ASSERT_EQ((std::is_same<CommonViewValueType, double>::value), true); +#if 0 + // debug output + for ( int i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + } + + printf( " Common value type view: %s \n", typeid( CVT() ).name() ); + printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); + if ( std::is_same< CommonViewValueType, double >::value == true ) { + printf("Proper common value_type\n"); + } + else { + printf("WRONG common value_type\n"); + } + // end debug output +#endif + } + + { + // Single view + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1); + using CommonViewValueType = + typename decltype(view_alloc_arg)::value_type; + using CVT = typename Kokkos::View<CommonViewValueType*, ExecSpace>; + using HostCVT = typename CVT::HostMirror; + + // Construct View using the common type; for case of specialization, an + // 'embedded_dim' would be stored by view_alloc_arg + CVT cv1(Kokkos::view_alloc("cv1", view_alloc_arg), N0 * N1); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N0 * N1), + Functor<CVT>(cv1)); + + HostCVT hcv1 = Kokkos::create_mirror_view(cv1); + Kokkos::deep_copy(hcv1, cv1); + + ASSERT_EQ((std::is_same<CommonViewValueType, int>::value), true); + } + } + + // Create two dynamic rank views to test + { + using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType; + using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType; + + VIT vi1("vi1", N0, N1); + VDT vd1("vd1", N0); + + // TEST: Test for common type between two views, one with type double, + // other with type int Deduce common value_type and construct a view with + // that type + { + // Two views + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1); + using CommonViewValueType = + typename decltype(view_alloc_arg)::value_type; + using CVT = typename Kokkos::View<CommonViewValueType*, 
ExecSpace>; + using HostCVT = typename CVT::HostMirror; + + // Construct View using the common type; for case of specialization, an + // 'embedded_dim' would be stored by view_alloc_arg + CVT cv1(Kokkos::view_alloc("cv1", view_alloc_arg), N0 * N1); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N0 * N1), + Functor<CVT>(cv1)); + + HostCVT hcv1 = Kokkos::create_mirror_view(cv1); + Kokkos::deep_copy(hcv1, cv1); + + ASSERT_EQ((std::is_same<CommonViewValueType, double>::value), true); + } + + { + // Single views + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1); + using CommonViewValueType = + typename decltype(view_alloc_arg)::value_type; + using CVT = typename Kokkos::View<CommonViewValueType*, ExecSpace>; + using HostCVT = typename CVT::HostMirror; + + // Construct View using the common type; for case of specialization, an + // 'embedded_dim' would be stored by view_alloc_arg + CVT cv1(Kokkos::view_alloc("cv1", view_alloc_arg), N0 * N1); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N0 * N1), + Functor<CVT>(cv1)); + + HostCVT hcv1 = Kokkos::create_mirror_view(cv1); + Kokkos::deep_copy(hcv1, cv1); + + ASSERT_EQ((std::is_same<CommonViewValueType, int>::value), true); + } + } + + } // end test_vcpt + +}; // end struct + +} // namespace + +TEST(TEST_CATEGORY, viewctorprop_embedded_dim) { + TestViewCtorProp_EmbeddedDim<TEST_EXECSPACE>::test_vcpt(2, 3); +} +} // namespace Test diff --git a/packages/kokkos/containers/unit_tests/UnitTestMain.cpp b/packages/kokkos/containers/unit_tests/UnitTestMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e245aad35fc33a595a16f711dbd4a63a0c7f8948 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/UnitTestMain.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + ::testing::InitGoogleTest(&argc, argv); + int result = RUN_ALL_TESTS(); + Kokkos::finalize(); + return result; +} diff --git a/packages/kokkos/core/CMakeLists.txt b/packages/kokkos/core/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..68d3f83319995037aaa9528a93ee30c024c3ac39 --- /dev/null +++ b/packages/kokkos/core/CMakeLists.txt @@ -0,0 +1,14 @@ + + +KOKKOS_SUBPACKAGE(Core) + +IF (NOT Kokkos_INSTALL_TESTING) + ADD_SUBDIRECTORY(src) +ENDIF() + +KOKKOS_ADD_TEST_DIRECTORIES(unit_test) +KOKKOS_ADD_TEST_DIRECTORIES(perf_test) + +KOKKOS_SUBPACKAGE_POSTPROCESS() + + diff --git a/packages/kokkos/core/cmake/Dependencies.cmake b/packages/kokkos/core/cmake/Dependencies.cmake new file mode 100644 index 0000000000000000000000000000000000000000..cc901a4ede0c6b17fbb89bfa9edfaf6544d7b269 --- /dev/null +++ b/packages/kokkos/core/cmake/Dependencies.cmake @@ -0,0 +1,6 @@ +TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX + TEST_OPTIONAL_TPLS CUSPARSE + ) + +TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) diff --git a/packages/kokkos/core/cmake/KokkosCore_config.h.in b/packages/kokkos/core/cmake/KokkosCore_config.h.in new file mode 100644 index 0000000000000000000000000000000000000000..f0835772b864faf0126796c75f5f1e9d02f95e28 --- /dev/null +++ b/packages/kokkos/core/cmake/KokkosCore_config.h.in @@ -0,0 +1,104 @@ +/* The trivial 'src/build_common.sh' creates a config + * that must stay in sync with this file. + */ +#cmakedefine KOKKOS_FOR_SIERRA + +#if !defined(KOKKOS_FOR_SIERRA) + +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error \ + "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif + +#cmakedefine KOKKOS_ENABLE_CUDA +#cmakedefine KOKKOS_ENABLE_HIP +#cmakedefine KOKKOS_ENABLE_OPENMP +#cmakedefine KOKKOS_ENABLE_THREADS +#cmakedefine KOKKOS_ENABLE_SERIAL +#cmakedefine KOKKOS_ENABLE_Winthread + +#cmakedefine KOKKOS_ENABLE_HWLOC +#cmakedefine KOKKOS_ENABLE_HBWSPACE +#cmakedefine KOKKOS_ENABLE_LIBRT + +#cmakedefine KOKKOS_ENABLE_DEBUG +#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK +#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK +#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT +#cmakedefine KOKKOS_ENABLE_TUNING + +#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION + +#ifdef KOKKOS_ENABLE_CUDA + +#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC + +// mfh 16 Sep 2014: If passed in on the command line, that overrides +// any value of KOKKOS_USE_CUDA_UVM here. Doing this should prevent build +// warnings like this one: +// +// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning: +// "KOKKOS_USE_CUDA_UVM" redefined +// +// At some point, we should edit the test-build scripts in +// Trilinos/cmake/ctest/drivers/perseus/, and take +// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there. I +// hesitate to do that now, because I'm not sure if all the files are +// including KokkosCore_config.h (or a header file that includes it) like +// they should. 
+#ifndef KOKKOS_USE_CUDA_UVM +#cmakedefine KOKKOS_USE_CUDA_UVM +#endif + +#cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE + +#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA + +#endif + +#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND + +#ifndef __CUDA_ARCH__ +#cmakedefine KOKKOS_ENABLE_ISA_X86_64 +#cmakedefine KOKKOS_ENABLE_ISA_KNC +#cmakedefine KOKKOS_ENABLE_ISA_POWERPCLE +#endif + +#ifdef KOKKOS_ENABLE_HIP +#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE +#endif + +#cmakedefine KOKKOS_ARCH_ARMV80 1 +#cmakedefine KOKKOS_ARCH_ARMV81 1 +#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX 1 +#cmakedefine KOKKOS_ARCH_AVX 1 +#cmakedefine KOKKOS_ARCH_AVX2 1 +#cmakedefine KOKKOS_ARCH_AVX512MIC 1 +#cmakedefine KOKKOS_ARCH_AVX512XEON 1 +#cmakedefine KOKKOS_ARCH_KNC 1 +#cmakedefine KOKKOS_ARCH_POWER8 1 +#cmakedefine KOKKOS_ARCH_POWER9 1 +#cmakedefine KOKKOS_ARCH_KEPLER 1 +#cmakedefine KOKKOS_ARCH_KEPLER30 1 +#cmakedefine KOKKOS_ARCH_KEPLER32 1 +#cmakedefine KOKKOS_ARCH_KEPLER35 1 +#cmakedefine KOKKOS_ARCH_KEPLER37 1 +#cmakedefine KOKKOS_ARCH_MAXWELL 1 +#cmakedefine KOKKOS_ARCH_MAXWELL50 1 +#cmakedefine KOKKOS_ARCH_MAXWELL52 1 +#cmakedefine KOKKOS_ARCH_MAXWELL53 1 +#cmakedefine KOKKOS_ARCH_PASCAL 1 +#cmakedefine KOKKOS_ARCH_PASCAL60 1 +#cmakedefine KOKKOS_ARCH_PASCAL61 1 +#cmakedefine KOKKOS_ARCH_VOLTA70 1 + +// TODO: These are currently not used in Kokkos. Should they be removed? +#cmakedefine KOKKOS_ENABLE_MPI +#cmakedefine KOKKOS_ENABLE_CUSPARSE + +// TODO: No longer options in Kokkos. Need to be removed. 
+#cmakedefine KOKKOS_USING_DEPRECATED_VIEW + +#endif // !defined(KOKKOS_FOR_SIERRA) diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ff4b6006da8cb0358f2a9e53810b79ce59e8b02 --- /dev/null +++ b/packages/kokkos/core/perf_test/CMakeLists.txt @@ -0,0 +1,114 @@ + +#INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +#INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + + +# warning: PerfTest_CustomReduction.cpp uses +# ../../algorithms/src/Kokkos_Random.hpp +# we'll just allow it to be included, but note +# that in TriBITS KokkosAlgorithms can be disabled... +#INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") + +# FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. +IF (KOKKOS_ENABLE_OPENMPTARGET + AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI + OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) + RETURN() +ENDIF() + + +SET(SOURCES + PerfTestMain.cpp + PerfTestGramSchmidt.cpp + PerfTestHexGrad.cpp + PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp + PerfTest_ViewCopy_a123.cpp + PerfTest_ViewCopy_b123.cpp + PerfTest_ViewCopy_c123.cpp + PerfTest_ViewCopy_d123.cpp + PerfTest_ViewCopy_a45.cpp + PerfTest_ViewCopy_b45.cpp + PerfTest_ViewCopy_c45.cpp + PerfTest_ViewCopy_d45.cpp + PerfTest_ViewCopy_a6.cpp + PerfTest_ViewCopy_b6.cpp + PerfTest_ViewCopy_c6.cpp + PerfTest_ViewCopy_d6.cpp + PerfTest_ViewCopy_a7.cpp + PerfTest_ViewCopy_b7.cpp + PerfTest_ViewCopy_c7.cpp + PerfTest_ViewCopy_d7.cpp + PerfTest_ViewCopy_a8.cpp + PerfTest_ViewCopy_b8.cpp + PerfTest_ViewCopy_c8.cpp + PerfTest_ViewCopy_d8.cpp + PerfTest_ViewAllocate.cpp + PerfTest_ViewFill_123.cpp + PerfTest_ViewFill_45.cpp + PerfTest_ViewFill_6.cpp + PerfTest_ViewFill_7.cpp + PerfTest_ViewFill_8.cpp + PerfTest_ViewResize_123.cpp + PerfTest_ViewResize_45.cpp + 
PerfTest_ViewResize_6.cpp + PerfTest_ViewResize_7.cpp + PerfTest_ViewResize_8.cpp + ) + +IF(Kokkos_ENABLE_OPENMPTARGET) +# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction + LIST(REMOVE_ITEM SOURCES + PerfTestGramSchmidt.cpp + PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp + ) +ENDIF() + +# Per #374, we always want to build this test, but we only want to run +# it as a PERFORMANCE test. That's why we separate building the test +# from running the test. + +#leave these as basic includes for now +#I don't need anything transitive +KOKKOS_INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + +# This test currently times out for MSVC +IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerfTestExec + SOURCES ${SOURCES} + CATEGORIES PERFORMANCE + ) +ENDIF() + +KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic + SOURCES test_atomic.cpp + CATEGORIES PERFORMANCE +) + +IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic_MinMax + SOURCES test_atomic_minmax_simple.cpp + CATEGORIES PERFORMANCE + ) +ENDIF() + +KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Mempool + SOURCES test_mempool.cpp + CATEGORIES PERFORMANCE +) + +IF(NOT Kokkos_ENABLE_OPENMPTARGET) +# FIXME OPENMPTARGET needs tasking + KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_TaskDag + SOURCES test_taskdag.cpp + CATEGORIES PERFORMANCE + ) +ENDIF() diff --git a/packages/kokkos/core/perf_test/Makefile b/packages/kokkos/core/perf_test/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ac06c89757d243ae6af6de755cae749bdd959e5b --- /dev/null +++ b/packages/kokkos/core/perf_test/Makefile @@ -0,0 +1,118 @@ +KOKKOS_PATH = ../.. 
+ +GTEST_PATH = ../../tpls/gtest + +vpath %.cpp ${KOKKOS_PATH}/core/perf_test + +default: build_all + echo "End Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) + CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper + KOKKOS_CUDA_OPTIONS=enable_lambda +else + CXX = g++ +endif + +CXXFLAGS = -O3 +#CXXFLAGS += -DGENERIC_REDUCER +LINK ?= $(CXX) +LDFLAGS ?= +override LDFLAGS += -lpthread + +include $(KOKKOS_PATH)/Makefile.kokkos + +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test + +TEST_TARGETS = +TARGETS = + +# + +OBJ_PERF = PerfTestMain.o gtest-all.o +OBJ_PERF += PerfTest_ExecSpacePartitioning.o +OBJ_PERF += PerfTestGramSchmidt.o +OBJ_PERF += PerfTestHexGrad.o +OBJ_PERF += PerfTest_CustomReduction.o +OBJ_PERF += PerfTest_ViewCopy_a123.o PerfTest_ViewCopy_b123.o PerfTest_ViewCopy_c123.o PerfTest_ViewCopy_d123.o +OBJ_PERF += PerfTest_ViewCopy_a45.o PerfTest_ViewCopy_b45.o PerfTest_ViewCopy_c45.o PerfTest_ViewCopy_d45.o +OBJ_PERF += PerfTest_ViewCopy_a6.o PerfTest_ViewCopy_b6.o PerfTest_ViewCopy_c6.o PerfTest_ViewCopy_d6.o +OBJ_PERF += PerfTest_ViewCopy_a7.o PerfTest_ViewCopy_b7.o PerfTest_ViewCopy_c7.o PerfTest_ViewCopy_d7.o +OBJ_PERF += PerfTest_ViewCopy_a8.o PerfTest_ViewCopy_b8.o PerfTest_ViewCopy_c8.o PerfTest_ViewCopy_d8.o +OBJ_PERF += PerfTest_ViewAllocate.o +OBJ_PERF += PerfTest_ViewFill_123.o PerfTest_ViewFill_45.o PerfTest_ViewFill_6.o PerfTest_ViewFill_7.o PerfTest_ViewFill_8.o +OBJ_PERF += PerfTest_ViewResize_123.o PerfTest_ViewResize_45.o PerfTest_ViewResize_6.o PerfTest_ViewResize_7.o PerfTest_ViewResize_8.o +TARGETS += KokkosCore_PerformanceTest +TEST_TARGETS += test-performance + +# + +OBJ_ATOMICS = test_atomic.o +TARGETS += KokkosCore_PerformanceTest_Atomics +TEST_TARGETS += test-atomic + +# + +OBJ_MEMPOOL = test_mempool.o +TARGETS += KokkosCore_PerformanceTest_Mempool +TEST_TARGETS += test-mempool + +# + +OBJ_TASKDAG = test_taskdag.o +TARGETS += KokkosCore_PerformanceTest_TaskDAG +TEST_TARGETS += test-taskdag + +# + +OBJ_ATOMICS_MINMAX = 
test_atomic_minmax_simple.o +TARGETS += KokkosCore_PerformanceTest_Atomics_MinMax +TEST_TARGETS += test-atomic-minmax + +# + +KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest + +KokkosCore_PerformanceTest_Atomics: $(OBJ_ATOMICS) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ATOMICS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest_Atomics + +KokkosCore_PerformanceTest_Mempool: $(OBJ_MEMPOOL) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_MEMPOOL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_Mempool + +KokkosCore_PerformanceTest_TaskDAG: $(OBJ_TASKDAG) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_TASKDAG) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_TaskDAG + +KokkosCore_PerformanceTest_Atomics_MinMax: $(OBJ_ATOMICS_MINMAX) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ATOMICS_MINMAX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest_Atomics_MinMax + +test-performance: KokkosCore_PerformanceTest + ./KokkosCore_PerformanceTest + +test-atomic: KokkosCore_PerformanceTest_Atomics + ./KokkosCore_PerformanceTest_Atomics + +test-mempool: KokkosCore_PerformanceTest_Mempool + ./KokkosCore_PerformanceTest_Mempool + +test-taskdag: KokkosCore_PerformanceTest_TaskDAG + ./KokkosCore_PerformanceTest_TaskDAG + +test-atomic-minmax: KokkosCore_PerformanceTest_Atomics_MinMax + ./KokkosCore_PerformanceTest_Atomics_MinMax + +build_all: $(TARGETS) + +test: $(TEST_TARGETS) + +clean: kokkos-clean + rm -f *.o $(TARGETS) + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + +gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c 
$(GTEST_PATH)/gtest/gtest-all.cc + diff --git a/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp b/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e133dafa368d562a148caf2e3b8adc4ff4a8b77d --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp @@ -0,0 +1,218 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BLAS_KERNELS_HPP +#define KOKKOS_BLAS_KERNELS_HPP + +#include <type_traits> + +namespace Kokkos { + +template <class Type> +struct Dot { + using execution_space = typename Type::execution_space; + + static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1), + "Dot static_assert Fail: Rank != 1"); + + using value_type = double; + +#if 1 + typename Type::const_type X; + typename Type::const_type Y; +#else + Type X; + Type Y; +#endif + + Dot(const Type& arg_x, const Type& arg_y) : X(arg_x), Y(arg_y) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i, value_type& update) const { update += X[i] * Y[i]; } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + const volatile value_type& source) { + update += source; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } +}; + +template <class Type> +struct DotSingle { + using execution_space = typename Type::execution_space; + + static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1), + "DotSingle static_assert Fail: Rank != 1"); + + using value_type = double; + +#if 1 + typename Type::const_type X; +#else + Type X; +#endif + + DotSingle(const Type& arg_x) : X(arg_x) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i, 
value_type& update) const { + const typename Type::value_type& x = X[i]; + update += x * x; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + const volatile value_type& source) { + update += source; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } +}; + +template <class ScalarType, class VectorType> +struct Scale { + using execution_space = typename VectorType::execution_space; + + static_assert(static_cast<unsigned>(ScalarType::Rank) == + static_cast<unsigned>(0), + "Scale static_assert Fail: ScalarType::Rank != 0"); + + static_assert(static_cast<unsigned>(VectorType::Rank) == + static_cast<unsigned>(1), + "Scale static_assert Fail: VectorType::Rank != 1"); + +#if 1 + typename ScalarType::const_type alpha; +#else + ScalarType alpha; +#endif + + VectorType Y; + + Scale(const ScalarType& arg_alpha, const VectorType& arg_Y) + : alpha(arg_alpha), Y(arg_Y) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { Y[i] *= alpha(); } +}; + +template <class ScalarType, class ConstVectorType, class VectorType> +struct AXPBY { + using execution_space = typename VectorType::execution_space; + + static_assert(static_cast<unsigned>(ScalarType::Rank) == + static_cast<unsigned>(0), + "AXPBY static_assert Fail: ScalarType::Rank != 0"); + + static_assert(static_cast<unsigned>(ConstVectorType::Rank) == + static_cast<unsigned>(1), + "AXPBY static_assert Fail: ConstVectorType::Rank != 1"); + + static_assert(static_cast<unsigned>(VectorType::Rank) == + static_cast<unsigned>(1), + "AXPBY static_assert Fail: VectorType::Rank != 1"); + +#if 1 + typename ScalarType::const_type alpha, beta; + typename ConstVectorType::const_type X; +#else + ScalarType alpha, beta; + ConstVectorType X; +#endif + + VectorType Y; + + AXPBY(const ScalarType& arg_alpha, const ConstVectorType& arg_X, + const ScalarType& arg_beta, const VectorType& arg_Y) + : alpha(arg_alpha), beta(arg_beta), X(arg_X), Y(arg_Y) {} + + KOKKOS_INLINE_FUNCTION 
+ void operator()(int i) const { Y[i] = alpha() * X[i] + beta() * Y[i]; } +}; + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +/** \brief Y = alpha * X + beta * Y */ +template <class ConstScalarType, class ConstVectorType, class VectorType> +void axpby(const ConstScalarType& alpha, const ConstVectorType& X, + const ConstScalarType& beta, const VectorType& Y) { + using functor = AXPBY<ConstScalarType, ConstVectorType, VectorType>; + + parallel_for(Y.extent(0), functor(alpha, X, beta, Y)); +} + +/** \brief Y *= alpha */ +template <class ConstScalarType, class VectorType> +void scale(const ConstScalarType& alpha, const VectorType& Y) { + using functor = Scale<ConstScalarType, VectorType>; + + parallel_for(Y.extent(0), functor(alpha, Y)); +} + +template <class ConstVectorType, class Finalize> +void dot(const ConstVectorType& X, const ConstVectorType& Y, + const Finalize& finalize) { + using functor = Dot<ConstVectorType>; + + parallel_reduce(X.extent(0), functor(X, Y), finalize); +} + +template <class ConstVectorType, class Finalize> +void dot(const ConstVectorType& X, const Finalize& finalize) { + using functor = DotSingle<ConstVectorType>; + + parallel_reduce(X.extent(0), functor(X), finalize); +} + +} /* namespace Kokkos */ + +#endif /* #ifndef KOKKOS_BLAS_KERNELS_HPP */ diff --git a/packages/kokkos/core/perf_test/PerfTestDriver.hpp b/packages/kokkos/core/perf_test/PerfTestDriver.hpp new file mode 100644 index 0000000000000000000000000000000000000000..95d5128abf312086edd055b5cce158178d5e3cac --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTestDriver.hpp @@ -0,0 +1,429 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include <string> + +// mfh 06 Jun 2013: This macro doesn't work like one might thing it +// should. 
It doesn't take the template parameter DeviceType and +// print its actual type name; it just literally prints out +// "DeviceType". I've worked around this below without using the +// macro, so I'm commenting out the macro to avoid compiler complaints +// about an unused macro. + +// #define KOKKOS_IMPL_MACRO_TO_STRING( X ) #X +// #define KOKKOS_MACRO_TO_STRING( X ) KOKKOS_IMPL_MACRO_TO_STRING( X ) + +//------------------------------------------------------------------------ + +namespace Test { + +enum { NUMBER_OF_TRIALS = 5 }; + +template <class DeviceType, class LayoutType> +void run_test_mdrange(int exp_beg, int exp_end, const char deviceTypeName[], + int range_offset = 0, int tile_offset = 0) +// exp_beg = 6 => 2^6 = 64 is starting range length +{ +#define MDRANGE_PERFORMANCE_OUTPUT_VERBOSE 0 + + std::string label_mdrange; + label_mdrange.append("\"MDRange< double , "); + label_mdrange.append(deviceTypeName); + label_mdrange.append(" >\""); + + std::string label_range_col2; + label_range_col2.append("\"RangeColTwo< double , "); + label_range_col2.append(deviceTypeName); + label_range_col2.append(" >\""); + + std::string label_range_col_all; + label_range_col_all.append("\"RangeColAll< double , "); + label_range_col_all.append(deviceTypeName); + label_range_col_all.append(" >\""); + + if (std::is_same<LayoutType, Kokkos::LayoutRight>::value) { + std::cout + << "--------------------------------------------------------------\n" + << "Performance tests for MDRange Layout Right" + << "\n--------------------------------------------------------------" + << std::endl; + } else { + std::cout + << "--------------------------------------------------------------\n" + << "Performance tests for MDRange Layout Left" + << "\n--------------------------------------------------------------" + << std::endl; + } + + for (int i = exp_beg; i < exp_end; ++i) { + const int range_length = (1 << i) + range_offset; + + std::cout + << 
"\n--------------------------------------------------------------\n" + << "--------------------------------------------------------------\n" + << "MDRange Test: range bounds: " << range_length << " , " + << range_length << " , " << range_length + << "\n--------------------------------------------------------------\n" + << "--------------------------------------------------------------\n"; + // << std::endl; + + int t0_min = 0, t1_min = 0, t2_min = 0; + double seconds_min = 0.0; + + // Test 1: The MDRange in full + { + int t0 = 1, t1 = 1, t2 = 1; + int counter = 1; +#if !defined(KOKKOS_ENABLE_CUDA) + int min_bnd = 8; + int tfast = range_length; +#else + int min_bnd = 2; + int tfast = 32; +#endif + while (tfast >= min_bnd) { + int tmid = min_bnd; + while (tmid < tfast) { + t0 = min_bnd; + t1 = tmid; + t2 = tfast; + int t2_rev = min_bnd; + int t1_rev = tmid; + int t0_rev = tfast; + +#if defined(KOKKOS_ENABLE_CUDA) + // Note: Product of tile sizes must be < 1024 for Cuda + if (t0 * t1 * t2 >= 1024) { + printf(" Exceeded Cuda tile limits; onto next range set\n\n"); + break; + } +#endif + + // Run 1 with tiles LayoutRight style + double seconds_1 = 0; + { + seconds_1 = + MultiDimRangePerf3D<DeviceType, double, + LayoutType>::test_multi_index(range_length, + range_length, + range_length, + t0, t1, t2); + } + +#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE + std::cout << label_mdrange << " , " << t0 << " , " << t1 << " , " + << t2 << " , " << seconds_1 << std::endl; +#endif + + if (counter == 1) { + seconds_min = seconds_1; + t0_min = t0; + t1_min = t1; + t2_min = t2; + } else { + if (seconds_1 < seconds_min) { + seconds_min = seconds_1; + t0_min = t0; + t1_min = t1; + t2_min = t2; + } + } + + // Run 2 with tiles LayoutLeft style - reverse order of tile dims + double seconds_1rev = 0; + { + seconds_1rev = + MultiDimRangePerf3D<DeviceType, double, + LayoutType>::test_multi_index(range_length, + range_length, + range_length, + t0_rev, + t1_rev, + t2_rev); + } + +#if 
MDRANGE_PERFORMANCE_OUTPUT_VERBOSE + std::cout << label_mdrange << " , " << t0_rev << " , " << t1_rev + << " , " << t2_rev << " , " << seconds_1rev << std::endl; +#endif + + if (seconds_1rev < seconds_min) { + seconds_min = seconds_1rev; + t0_min = t0_rev; + t1_min = t1_rev; + t2_min = t2_rev; + } + + ++counter; + tmid <<= 1; + } // end inner while + tfast >>= 1; + } // end outer while + + std::cout + << "\n" + << "--------------------------------------------------------------\n" + << label_mdrange << "\n Min values " + << "\n Range length per dim (3D): " << range_length + << "\n TileDims: " << t0_min << " , " << t1_min << " , " << t2_min + << "\n Min time: " << seconds_min + << "\n---------------------------------------------------------------" + << std::endl; + } // end scope + +#if !defined(KOKKOS_ENABLE_CUDA) + double seconds_min_c = 0.0; + int t0c_min = 0, t1c_min = 0, t2c_min = 0; + int counter = 1; + { + int min_bnd = 8; + // Test 1_c: MDRange with 0 for 'inner' tile dim; this case will utilize + // the full span in that direction, should be similar to Collapse<2> + if (std::is_same<LayoutType, Kokkos::LayoutRight>::value) { + for (unsigned int T0 = min_bnd; + T0 < static_cast<unsigned int>(range_length); T0 <<= 1) { + for (unsigned int T1 = min_bnd; + T1 < static_cast<unsigned int>(range_length); T1 <<= 1) { + double seconds_c = 0; + { + seconds_c = MultiDimRangePerf3D<DeviceType, double, LayoutType>:: + test_multi_index(range_length, range_length, range_length, T0, + T1, 0); + } + +#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE + std::cout << " MDRange LR with '0' tile - collapse-like \n" + << label_mdrange << " , " << T0 << " , " << T1 << " , " + << range_length << " , " << seconds_c << std::endl; +#endif + + t2c_min = range_length; + if (counter == 1) { + seconds_min_c = seconds_c; + t0c_min = T0; + t1c_min = T1; + } else { + if (seconds_c < seconds_min_c) { + seconds_min_c = seconds_c; + t0c_min = T0; + t1c_min = T1; + } + } + ++counter; + } + } + } else { + 
for (unsigned int T1 = min_bnd; + T1 <= static_cast<unsigned int>(range_length); T1 <<= 1) { + for (unsigned int T2 = min_bnd; + T2 <= static_cast<unsigned int>(range_length); T2 <<= 1) { + double seconds_c = 0; + { + seconds_c = MultiDimRangePerf3D<DeviceType, double, LayoutType>:: + test_multi_index(range_length, range_length, range_length, 0, + T1, T2); + } + +#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE + std::cout << " MDRange LL with '0' tile - collapse-like \n" + << label_mdrange << " , " << range_length << " < " << T1 + << " , " << T2 << " , " << seconds_c << std::endl; +#endif + + t0c_min = range_length; + if (counter == 1) { + seconds_min_c = seconds_c; + t1c_min = T1; + t2c_min = T2; + } else { + if (seconds_c < seconds_min_c) { + seconds_min_c = seconds_c; + t1c_min = T1; + t2c_min = T2; + } + } + ++counter; + } + } + } + + std::cout + // << + // "--------------------------------------------------------------\n" + << label_mdrange << " Collapse<2> style: " + << "\n Min values " + << "\n Range length per dim (3D): " << range_length + << "\n TileDims: " << t0c_min << " , " << t1c_min << " , " << t2c_min + << "\n Min time: " << seconds_min_c + << "\n---------------------------------------------------------------" + << std::endl; + } // end scope test 2 +#endif + + // Test 2: RangePolicy Collapse2 style + double seconds_2 = 0; + { + seconds_2 = RangePolicyCollapseTwo<DeviceType, double, LayoutType>:: + test_index_collapse_two(range_length, range_length, range_length); + } + std::cout << label_range_col2 << " , " << range_length << " , " << seconds_2 + << std::endl; + + // Test 3: RangePolicy Collapse all style - not necessary, always slow + /* + double seconds_3 = 0; + { seconds_3 = RangePolicyCollapseAll< DeviceType , double , LayoutType + >::test_collapse_all(range_length,range_length,range_length) ; } std::cout + << label_range_col_all + << " , " << range_length + << " , " << seconds_3 + << "\n---------------------------------------------------------------" + 
<< std::endl ; + */ + + // Compare fastest times... will never be collapse all so ignore it + // seconds_min = tiled MDRange + // seconds_min_c = collapse<2>-like MDRange (tiledim = span for fast dim) - + // only for non-Cuda, else tile too long seconds_2 = collapse<2>-style + // RangePolicy seconds_3 = collapse<3>-style RangePolicy + +#if !defined(KOKKOS_ENABLE_CUDA) + if (seconds_min < seconds_min_c) { + if (seconds_min < seconds_2) { + std::cout + << "--------------------------------------------------------------" + "\n" + << " Fastest run: MDRange tiled\n" + << " Time: " << seconds_min + << " Difference: " << seconds_2 - seconds_min << " Other times: \n" + << " MDrange collapse-like (tiledim = span on fast dim) type: " + << seconds_min_c << "\n" + << " Collapse2 Range Policy: " << seconds_2 << "\n" + << "\n-------------------------------------------------------------" + "-" + << "\n-------------------------------------------------------------" + "-" + //<< "\n\n" + << std::endl; + } else if (seconds_min > seconds_2) { + std::cout + << " Fastest run: Collapse2 RangePolicy\n" + << " Time: " << seconds_2 + << " Difference: " << seconds_min - seconds_2 << " Other times: \n" + << " MDrange Tiled: " << seconds_min << "\n" + << " MDrange collapse-like (tiledim = span on fast dim) type: " + << seconds_min_c << "\n" + << "\n-------------------------------------------------------------" + "-" + << "\n-------------------------------------------------------------" + "-" + //<< "\n\n" + << std::endl; + } + } else if (seconds_min > seconds_min_c) { + if (seconds_min_c < seconds_2) { + std::cout << "---------------------------------------------------------" + "-----\n" + << " Fastest run: MDRange collapse-like (tiledim = span on " + "fast dim) type\n" + << " Time: " << seconds_min_c + << " Difference: " << seconds_2 - seconds_min_c + << " Other times: \n" + << " MDrange Tiled: " << seconds_min << "\n" + << " Collapse2 Range Policy: " << seconds_2 << "\n" + << 
"\n-------------------------------------------------------" + "-------" + << "\n-------------------------------------------------------" + "-------" + //<< "\n\n" + << std::endl; + } else if (seconds_min_c > seconds_2) { + std::cout + << " Fastest run: Collapse2 RangePolicy\n" + << " Time: " << seconds_2 + << " Difference: " << seconds_min_c - seconds_2 + << " Other times: \n" + << " MDrange Tiled: " << seconds_min << "\n" + << " MDrange collapse-like (tiledim = span on fast dim) type: " + << seconds_min_c << "\n" + << "\n-------------------------------------------------------------" + "-" + << "\n-------------------------------------------------------------" + "-" + //<< "\n\n" + << std::endl; + } + } // end else if +#else + if (seconds_min < seconds_2) { + std::cout + << "--------------------------------------------------------------\n" + << " Fastest run: MDRange tiled\n" + << " Time: " << seconds_min + << " Difference: " << seconds_2 - seconds_min << " Other times: \n" + << " Collapse2 Range Policy: " << seconds_2 << "\n" + << "\n--------------------------------------------------------------" + << "\n--------------------------------------------------------------" + //<< "\n\n" + << std::endl; + } else if (seconds_min > seconds_2) { + std::cout + << " Fastest run: Collapse2 RangePolicy\n" + << " Time: " << seconds_2 + << " Difference: " << seconds_min - seconds_2 << " Other times: \n" + << " MDrange Tiled: " << seconds_min << "\n" + << "\n--------------------------------------------------------------" + << "\n--------------------------------------------------------------" + //<< "\n\n" + << std::endl; + } +#endif + + } // end for + +#undef MDRANGE_PERFORMANCE_OUTPUT_VERBOSE +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dee21fd7a575bd5aa0f6838980c670510f475cab --- /dev/null +++ 
b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp @@ -0,0 +1,251 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> +#include <PerfTest_Category.hpp> + +#include <cmath> +#include <PerfTestBlasKernels.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Test { + +// Reduction : result = dot( Q(:,j) , Q(:,j) ); +// PostProcess : R(j,j) = result ; inv = 1 / result ; +template <class VectorView, class ValueView> +struct InvNorm2 : public Kokkos::DotSingle<VectorView> { + using value_type = typename Kokkos::DotSingle<VectorView>::value_type; + + ValueView Rjj; + ValueView inv; + + InvNorm2(const VectorView& argX, const ValueView& argR, + const ValueView& argInv) + : Kokkos::DotSingle<VectorView>(argX), Rjj(argR), inv(argInv) {} + + KOKKOS_INLINE_FUNCTION + void final(value_type& result) const { + result = Kokkos::Experimental::sqrt(result); + Rjj() = result; + inv() = (0 < result) ? 
1.0 / result : 0; + } +}; + +template <class VectorView, class ValueView> +inline void invnorm2(const VectorView& x, const ValueView& r, + const ValueView& r_inv) { + Kokkos::parallel_reduce(x.extent(0), + InvNorm2<VectorView, ValueView>(x, r, r_inv)); +} + +// PostProcess : tmp = - ( R(j,k) = result ); +template <class VectorView, class ValueView> +struct DotM : public Kokkos::Dot<VectorView> { + using value_type = typename Kokkos::Dot<VectorView>::value_type; + + ValueView Rjk; + ValueView tmp; + + DotM(const VectorView& argX, const VectorView& argY, const ValueView& argR, + const ValueView& argTmp) + : Kokkos::Dot<VectorView>(argX, argY), Rjk(argR), tmp(argTmp) {} + + KOKKOS_INLINE_FUNCTION + void final(value_type& result) const { + Rjk() = result; + tmp() = -result; + } +}; + +template <class VectorView, class ValueView> +inline void dot_neg(const VectorView& x, const VectorView& y, + const ValueView& r, const ValueView& r_neg) { + Kokkos::parallel_reduce(x.extent(0), + DotM<VectorView, ValueView>(x, y, r, r_neg)); +} + +template <typename Scalar, class DeviceType> +struct ModifiedGramSchmidt { + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + + using multivector_type = + Kokkos::View<Scalar**, Kokkos::LayoutLeft, execution_space>; + + using vector_type = + Kokkos::View<Scalar*, Kokkos::LayoutLeft, execution_space>; + + using value_view = Kokkos::View<Scalar, Kokkos::LayoutLeft, execution_space>; + + multivector_type Q; + multivector_type R; + + static double factorization(const multivector_type Q_, + const multivector_type R_) { + const size_type count = Q_.extent(1); + value_view tmp("tmp"); + value_view one("one"); + + Kokkos::deep_copy(one, (Scalar)1); + + Kokkos::Timer timer; + + for (size_type j = 0; j < count; ++j) { + // Reduction : tmp = dot( Q(:,j) , Q(:,j) ); + // PostProcess : tmp = std::sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ; + const vector_type Qj = Kokkos::subview(Q_, Kokkos::ALL(), j); + const 
value_view Rjj = Kokkos::subview(R_, j, j); + + invnorm2(Qj, Rjj, tmp); + + // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ; + Kokkos::scale(tmp, Qj); + + for (size_type k = j + 1; k < count; ++k) { + const vector_type Qk = Kokkos::subview(Q_, Kokkos::ALL(), k); + const value_view Rjk = Kokkos::subview(R_, j, k); + + // Reduction : R(j,k) = dot( Q(:,j) , Q(:,k) ); + // PostProcess : tmp = - R(j,k); + dot_neg(Qj, Qk, Rjk, tmp); + + // Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j) + Kokkos::axpby(tmp, Qj, one, Qk); + } + } + + execution_space().fence(); + + return timer.seconds(); + } + + //-------------------------------------------------------------------------- + + static double test(const size_type length, const size_type count, + const size_t iter = 1) { + multivector_type Q_("Q", length, count); + multivector_type R_("R", count, count); + + typename multivector_type::HostMirror A = Kokkos::create_mirror(Q_); + + // Create and fill A on the host + + for (size_type j = 0; j < count; ++j) { + for (size_type i = 0; i < length; ++i) { + A(i, j) = (i + 1) * (j + 1); + } + } + + double dt_min = 0; + + for (size_t i = 0; i < iter; ++i) { + Kokkos::deep_copy(Q_, A); + + // A = Q * R + + const double dt = factorization(Q_, R_); + + if (0 == i) + dt_min = dt; + else + dt_min = dt < dt_min ? 
dt : dt_min; + } + + return dt_min; + } +}; + +template <class DeviceType> +void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials, + const char deviceTypeName[]) { + std::string label_gramschmidt; + label_gramschmidt.append("\"GramSchmidt< double , "); + label_gramschmidt.append(deviceTypeName); + label_gramschmidt.append(" >\""); + + for (int i = exp_beg; i < exp_end; ++i) { + double min_seconds = 0.0; + double max_seconds = 0.0; + double avg_seconds = 0.0; + + const int parallel_work_length = 1 << i; + + for (int j = 0; j < num_trials; ++j) { + const double seconds = ModifiedGramSchmidt<double, DeviceType>::test( + parallel_work_length, 32); + + if (0 == j) { + min_seconds = seconds; + max_seconds = seconds; + } else { + if (seconds < min_seconds) min_seconds = seconds; + if (seconds > max_seconds) max_seconds = seconds; + } + avg_seconds += seconds; + } + avg_seconds /= num_trials; + + std::cout << label_gramschmidt << " , " << parallel_work_length << " , " + << min_seconds << " , " << (min_seconds / parallel_work_length) + << std::endl; + } +} + +TEST(default_exec, gramschmidt) { + int exp_beg = 10; + int exp_end = 20; + int num_trials = 5; + + if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1)); + if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2)); + if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3)); + + EXPECT_NO_THROW(run_test_gramschmidt<Kokkos::DefaultExecutionSpace>( + exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name())); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c431c2b0c86d30192edc63d7dfbc447887f227cf --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp @@ -0,0 +1,300 @@ +/* +//@HEADER +// ************************************************************************ +// +// 
Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> +#include <PerfTest_Category.hpp> + +namespace Test { + +template <class DeviceType, typename CoordScalarType = double, + typename GradScalarType = float> +struct HexGrad { + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + + using self_type = HexGrad<DeviceType, CoordScalarType, GradScalarType>; + + // 3D array : ( ParallelWork , Space , Node ) + + enum { NSpace = 3, NNode = 8 }; + + using elem_coord_type = + Kokkos::View<CoordScalarType * [NSpace][NNode], execution_space>; + + using elem_grad_type = + Kokkos::View<GradScalarType * [NSpace][NNode], execution_space>; + + elem_coord_type coords; + elem_grad_type grad_op; + + enum { FLOPS = 318 }; // = 3 * ( 18 + 8 * 11 ) }; + enum { READS = 18 }; + enum { WRITES = 18 }; + + HexGrad(const elem_coord_type& arg_coords, const elem_grad_type& arg_grad_op) + : coords(arg_coords), grad_op(arg_grad_op) {} + + KOKKOS_INLINE_FUNCTION static void grad(const CoordScalarType x[], + const CoordScalarType z[], + GradScalarType grad_y[]) { + const GradScalarType R42 = (x[3] - x[1]); + const GradScalarType R52 = (x[4] - x[1]); + const GradScalarType R54 = (x[4] - x[3]); + + const GradScalarType R63 = (x[5] - x[2]); + const GradScalarType R83 = (x[7] - x[2]); + const GradScalarType R86 = (x[7] - x[5]); + + const GradScalarType R31 = (x[2] - x[0]); + const GradScalarType R61 = (x[5] - x[0]); + const GradScalarType R74 = (x[6] - x[3]); + + const GradScalarType R72 = (x[6] - x[1]); + const GradScalarType R75 = (x[6] - x[4]); + const GradScalarType R81 = (x[7] - x[0]); + + const GradScalarType t1 = (R63 + R54); + const GradScalarType t2 = (R61 + R74); + const GradScalarType t3 = (R72 + R81); + + const GradScalarType t4 = (R86 + R42); + const GradScalarType t5 = (R83 + R52); + const GradScalarType t6 = (R75 + R31); + 
+ // Calculate Y gradient from X and Z data + + grad_y[0] = (z[1] * t1) - (z[2] * R42) - (z[3] * t5) + (z[4] * t4) + + (z[5] * R52) - (z[7] * R54); + grad_y[1] = (z[2] * t2) + (z[3] * R31) - (z[0] * t1) - (z[5] * t6) + + (z[6] * R63) - (z[4] * R61); + grad_y[2] = (z[3] * t3) + (z[0] * R42) - (z[1] * t2) - (z[6] * t4) + + (z[7] * R74) - (z[5] * R72); + grad_y[3] = (z[0] * t5) - (z[1] * R31) - (z[2] * t3) + (z[7] * t6) + + (z[4] * R81) - (z[6] * R83); + grad_y[4] = (z[5] * t3) + (z[6] * R86) - (z[7] * t2) - (z[0] * t4) - + (z[3] * R81) + (z[1] * R61); + grad_y[5] = (z[6] * t5) - (z[4] * t3) - (z[7] * R75) + (z[1] * t6) - + (z[0] * R52) + (z[2] * R72); + grad_y[6] = (z[7] * t1) - (z[5] * t5) - (z[4] * R86) + (z[2] * t4) - + (z[1] * R63) + (z[3] * R83); + grad_y[7] = (z[4] * t2) - (z[6] * t1) + (z[5] * R75) - (z[3] * t6) - + (z[2] * R74) + (z[0] * R54); + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type ielem) const { + GradScalarType g[NNode]; + + const CoordScalarType x[NNode] = {coords(ielem, 0, 0), coords(ielem, 0, 1), + coords(ielem, 0, 2), coords(ielem, 0, 3), + coords(ielem, 0, 4), coords(ielem, 0, 5), + coords(ielem, 0, 6), coords(ielem, 0, 7)}; + + const CoordScalarType y[NNode] = {coords(ielem, 1, 0), coords(ielem, 1, 1), + coords(ielem, 1, 2), coords(ielem, 1, 3), + coords(ielem, 1, 4), coords(ielem, 1, 5), + coords(ielem, 1, 6), coords(ielem, 1, 7)}; + + const CoordScalarType z[NNode] = {coords(ielem, 2, 0), coords(ielem, 2, 1), + coords(ielem, 2, 2), coords(ielem, 2, 3), + coords(ielem, 2, 4), coords(ielem, 2, 5), + coords(ielem, 2, 6), coords(ielem, 2, 7)}; + + grad(z, y, g); + + grad_op(ielem, 0, 0) = g[0]; + grad_op(ielem, 0, 1) = g[1]; + grad_op(ielem, 0, 2) = g[2]; + grad_op(ielem, 0, 3) = g[3]; + grad_op(ielem, 0, 4) = g[4]; + grad_op(ielem, 0, 5) = g[5]; + grad_op(ielem, 0, 6) = g[6]; + grad_op(ielem, 0, 7) = g[7]; + + grad(x, z, g); + + grad_op(ielem, 1, 0) = g[0]; + grad_op(ielem, 1, 1) = g[1]; + grad_op(ielem, 1, 2) = g[2]; + 
grad_op(ielem, 1, 3) = g[3]; + grad_op(ielem, 1, 4) = g[4]; + grad_op(ielem, 1, 5) = g[5]; + grad_op(ielem, 1, 6) = g[6]; + grad_op(ielem, 1, 7) = g[7]; + + grad(y, x, g); + + grad_op(ielem, 2, 0) = g[0]; + grad_op(ielem, 2, 1) = g[1]; + grad_op(ielem, 2, 2) = g[2]; + grad_op(ielem, 2, 3) = g[3]; + grad_op(ielem, 2, 4) = g[4]; + grad_op(ielem, 2, 5) = g[5]; + grad_op(ielem, 2, 6) = g[6]; + grad_op(ielem, 2, 7) = g[7]; + } + + //-------------------------------------------------------------------------- + + struct Init { + using execution_space = typename self_type::execution_space; + + elem_coord_type coords; + + Init(const elem_coord_type& arg_coords) : coords(arg_coords) {} + + KOKKOS_INLINE_FUNCTION + void operator()(size_type ielem) const { + coords(ielem, 0, 0) = 0.; + coords(ielem, 1, 0) = 0.; + coords(ielem, 2, 0) = 0.; + + coords(ielem, 0, 1) = 1.; + coords(ielem, 1, 1) = 0.; + coords(ielem, 2, 1) = 0.; + + coords(ielem, 0, 2) = 1.; + coords(ielem, 1, 2) = 1.; + coords(ielem, 2, 2) = 0.; + + coords(ielem, 0, 3) = 0.; + coords(ielem, 1, 3) = 1.; + coords(ielem, 2, 3) = 0.; + + coords(ielem, 0, 4) = 0.; + coords(ielem, 1, 4) = 0.; + coords(ielem, 2, 4) = 1.; + + coords(ielem, 0, 5) = 1.; + coords(ielem, 1, 5) = 0.; + coords(ielem, 2, 5) = 1.; + + coords(ielem, 0, 6) = 1.; + coords(ielem, 1, 6) = 1.; + coords(ielem, 2, 6) = 1.; + + coords(ielem, 0, 7) = 0.; + coords(ielem, 1, 7) = 1.; + coords(ielem, 2, 7) = 1.; + } + }; + + //-------------------------------------------------------------------------- + + static double test(const int count, const int iter = 1) { + elem_coord_type coord("coord", count); + elem_grad_type grad("grad", count); + + // Execute the parallel kernels on the arrays: + + double dt_min = 0; + + Kokkos::parallel_for(count, Init(coord)); + execution_space().fence(); + + for (int i = 0; i < iter; ++i) { + Kokkos::Timer timer; + Kokkos::parallel_for(count, HexGrad<execution_space>(coord, grad)); + execution_space().fence(); + const double dt = 
timer.seconds(); + if (0 == i) + dt_min = dt; + else + dt_min = dt < dt_min ? dt : dt_min; + } + + return dt_min; + } +}; + +template <class DeviceType> +void run_test_hexgrad(int exp_beg, int exp_end, int num_trials, + const char deviceTypeName[]) { + std::string label_hexgrad; + label_hexgrad.append("\"HexGrad< double , "); + label_hexgrad.append(deviceTypeName); + label_hexgrad.append(" >\""); + + for (int i = exp_beg; i < exp_end; ++i) { + double min_seconds = 0.0; + double max_seconds = 0.0; + double avg_seconds = 0.0; + + const int parallel_work_length = 1 << i; + + for (int j = 0; j < num_trials; ++j) { + const double seconds = HexGrad<DeviceType>::test(parallel_work_length); + + if (0 == j) { + min_seconds = seconds; + max_seconds = seconds; + } else { + if (seconds < min_seconds) min_seconds = seconds; + if (seconds > max_seconds) max_seconds = seconds; + } + avg_seconds += seconds; + } + avg_seconds /= num_trials; + + std::cout << label_hexgrad << " , " << parallel_work_length << " , " + << min_seconds << " , " << (min_seconds / parallel_work_length) + << std::endl; + } +} + +TEST(default_exec, hexgrad) { + int exp_beg = 10; + int exp_end = 20; + int num_trials = 5; + + if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1)); + if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2)); + if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3)); + + EXPECT_NO_THROW(run_test_hexgrad<Kokkos::DefaultExecutionSpace>( + exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name())); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTestMDRange.hpp b/packages/kokkos/core/perf_test/PerfTestMDRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ec0452d5f197888d1cdbdf2c189e2c614e1b66d8 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTestMDRange.hpp @@ -0,0 +1,626 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace Test { +template <class DeviceType, typename ScalarType = double, + typename TestLayout = Kokkos::LayoutRight> +struct MultiDimRangePerf3D { + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + + using iterate_type = Kokkos::Iterate; + + using view_type = Kokkos::View<ScalarType ***, TestLayout, DeviceType>; + using host_view_type = typename view_type::HostMirror; + + view_type A; + view_type B; + const long irange; + const long jrange; + const long krange; + + MultiDimRangePerf3D(const view_type &A_, const view_type &B_, + const long &irange_, const long &jrange_, + const long &krange_) + : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const long i, const long j, const long k) const { + A(i, j, k) = + 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + + B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + + B(i, j, k)); + } + + struct InitZeroTag {}; + // struct InitViewTag {}; + + struct Init { + Init(const view_type &input_, const long &irange_, const long &jrange_, + const long &krange_) + : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const long i, const long j, const long k) const { + input(i, j, k) = 1.0; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const InitZeroTag &, const long i, const long j, + const long k) const { + input(i, j, k) = 0; + } + + view_type input; + const long irange; + const long jrange; + const long krange; + }; + + static double test_multi_index(const unsigned int icount, + const unsigned int jcount, + const unsigned int kcount, + const unsigned int Ti = 1, + const unsigned int Tj = 1, + const unsigned int Tk = 1, + const long iter = 1) { + // This test performs multidim range over all dims + view_type 
Atest("Atest", icount, jcount, kcount); + view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2); + using FunctorType = + MultiDimRangePerf3D<execution_space, ScalarType, TestLayout>; + + double dt_min = 0; + + // LayoutRight + if (std::is_same<TestLayout, Kokkos::LayoutRight>::value) { + Kokkos::MDRangePolicy< + Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, + execution_space> + policy_initA({{0, 0, 0}}, {{icount, jcount, kcount}}, {{Ti, Tj, Tk}}); + Kokkos::MDRangePolicy< + Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, + execution_space> + policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}}, + {{Ti, Tj, Tk}}); + + using MDRangeType = typename Kokkos::MDRangePolicy< + Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, + execution_space>; + using tile_type = typename MDRangeType::tile_type; + using point_type = typename MDRangeType::point_type; + + Kokkos::MDRangePolicy< + Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, + execution_space> + policy(point_type{{0, 0, 0}}, point_type{{icount, jcount, kcount}}, + tile_type{{Ti, Tj, Tk}}); + + Kokkos::parallel_for(policy_initA, Init(Atest, icount, jcount, kcount)); + execution_space().fence(); + Kokkos::parallel_for(policy_initB, + Init(Btest, icount + 2, jcount + 2, kcount + 2)); + execution_space().fence(); + + for (int i = 0; i < iter; ++i) { + Kokkos::Timer timer; + Kokkos::parallel_for(policy, + FunctorType(Atest, Btest, icount, jcount, kcount)); + execution_space().fence(); + const double dt = timer.seconds(); + if (0 == i) + dt_min = dt; + else + dt_min = dt < dt_min ? 
dt : dt_min; + + // Correctness check - only the first run + if (0 == i) { + long numErrors = 0; + host_view_type Ahost("Ahost", icount, jcount, kcount); + Kokkos::deep_copy(Ahost, Atest); + host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); + Kokkos::deep_copy(Bhost, Btest); + + // On KNL, this may vectorize - add print statement to prevent + // Also, compare against epsilon, as vectorization can change bitwise + // answer + for (long l = 0; l < static_cast<long>(icount); ++l) { + for (long j = 0; j < static_cast<long>(jcount); ++j) { + for (long k = 0; k < static_cast<long>(kcount); ++k) { + ScalarType check = + 0.25 * + (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + + Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + + Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + + Bhost(l, j, k)); + if (Ahost(l, j, k) - check != 0) { + ++numErrors; + std::cout << " Correctness error at index: " << l << "," << j + << "," << k << "\n" + << " multi Ahost = " << Ahost(l, j, k) + << " expected = " << check + << " multi Bhost(ijk) = " << Bhost(l, j, k) + << " multi Bhost(l+1jk) = " << Bhost(l + 1, j, k) + << " multi Bhost(l+2jk) = " << Bhost(l + 2, j, k) + << " multi Bhost(ij+1k) = " << Bhost(l, j + 1, k) + << " multi Bhost(ij+2k) = " << Bhost(l, j + 2, k) + << " multi Bhost(ijk+1) = " << Bhost(l, j, k + 1) + << " multi Bhost(ijk+2) = " << Bhost(l, j, k + 2) + << std::endl; + // exit(-1); + } + } + } + } + if (numErrors != 0) { + std::cout << "LR multi: errors " << numErrors << " range product " + << icount * jcount * kcount << " LL " << jcount * kcount + << " LR " << icount * jcount << std::endl; + } + // else { std::cout << " multi: No errors!" 
<< std::endl; } + } + } // end for + + } + // LayoutLeft + else { + Kokkos::MDRangePolicy< + Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, + execution_space> + policy_initA({{0, 0, 0}}, {{icount, jcount, kcount}}, {{Ti, Tj, Tk}}); + Kokkos::MDRangePolicy< + Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, + execution_space> + policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}}, + {{Ti, Tj, Tk}}); + + // using MDRangeType = + // typename Kokkos::MDRangePolicy< + // Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, + // execution_space >; + // using tile_type = typename MDRangeType::tile_type; + // using point_type = typename MDRangeType::point_type; + // MDRangeType policy(point_type{{0,0,0}}, + // point_type{{icount,jcount,kcount}}, + // tile_type{{Ti,Tj,Tk}}); + Kokkos::MDRangePolicy< + Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, + execution_space> + policy({{0, 0, 0}}, {{icount, jcount, kcount}}, {{Ti, Tj, Tk}}); + + Kokkos::parallel_for(policy_initA, Init(Atest, icount, jcount, kcount)); + execution_space().fence(); + Kokkos::parallel_for(policy_initB, + Init(Btest, icount + 2, jcount + 2, kcount + 2)); + execution_space().fence(); + + for (int i = 0; i < iter; ++i) { + Kokkos::Timer timer; + Kokkos::parallel_for(policy, + FunctorType(Atest, Btest, icount, jcount, kcount)); + execution_space().fence(); + const double dt = timer.seconds(); + if (0 == i) + dt_min = dt; + else + dt_min = dt < dt_min ? 
dt : dt_min; + + // Correctness check - only the first run + if (0 == i) { + long numErrors = 0; + host_view_type Ahost("Ahost", icount, jcount, kcount); + Kokkos::deep_copy(Ahost, Atest); + host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); + Kokkos::deep_copy(Bhost, Btest); + + // On KNL, this may vectorize - add print statement to prevent + // Also, compare against epsilon, as vectorization can change bitwise + // answer + for (long l = 0; l < static_cast<long>(icount); ++l) { + for (long j = 0; j < static_cast<long>(jcount); ++j) { + for (long k = 0; k < static_cast<long>(kcount); ++k) { + ScalarType check = + 0.25 * + (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + + Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + + Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + + Bhost(l, j, k)); + if (Ahost(l, j, k) - check != 0) { + ++numErrors; + std::cout << " Correctness error at index: " << l << "," << j + << "," << k << "\n" + << " multi Ahost = " << Ahost(l, j, k) + << " expected = " << check + << " multi Bhost(ijk) = " << Bhost(l, j, k) + << " multi Bhost(l+1jk) = " << Bhost(l + 1, j, k) + << " multi Bhost(l+2jk) = " << Bhost(l + 2, j, k) + << " multi Bhost(ij+1k) = " << Bhost(l, j + 1, k) + << " multi Bhost(ij+2k) = " << Bhost(l, j + 2, k) + << " multi Bhost(ijk+1) = " << Bhost(l, j, k + 1) + << " multi Bhost(ijk+2) = " << Bhost(l, j, k + 2) + << std::endl; + // exit(-1); + } + } + } + } + if (numErrors != 0) { + std::cout << " LL multi run: errors " << numErrors + << " range product " << icount * jcount * kcount + << " LL " << jcount * kcount << " LR " + << icount * jcount << std::endl; + } + // else { std::cout << " multi: No errors!" 
<< std::endl; } + } + } // end for + } + + return dt_min; + } +}; + +template <class DeviceType, typename ScalarType = double, + typename TestLayout = Kokkos::LayoutRight> +struct RangePolicyCollapseTwo { + // RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for + // multi-dim; unroll 2 dims in one-dim + + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + using layout = TestLayout; + + using iterate_type = Kokkos::Iterate; + + using view_type = Kokkos::View<ScalarType ***, TestLayout, DeviceType>; + using host_view_type = typename view_type::HostMirror; + + view_type A; + view_type B; + const long irange; + const long jrange; + const long krange; + + RangePolicyCollapseTwo(view_type &A_, const view_type &B_, + const long &irange_, const long &jrange_, + const long &krange_) + : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const long r) const { + if (std::is_same<TestLayout, Kokkos::LayoutRight>::value) { + // id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r + // r = j + i*Nj + long i = int(r / jrange); + long j = int(r - i * jrange); + for (int k = 0; k < krange; ++k) { + A(i, j, k) = + 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + + B(i, j + 2, k) + B(i, j + 1, k) + + B(i, j, k + 2) + B(i, j, k + 1) + B(i, j, k)); + } + } else if (std::is_same<TestLayout, Kokkos::LayoutLeft>::value) { + // id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r + // r = j + k*Nj + long k = int(r / jrange); + long j = int(r - k * jrange); + for (int i = 0; i < irange; ++i) { + A(i, j, k) = + 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + + B(i, j + 2, k) + B(i, j + 1, k) + + B(i, j, k + 2) + B(i, j, k + 1) + B(i, j, k)); + } + } + } + + struct Init { + view_type input; + const long irange; + const long jrange; + const long krange; + + Init(const view_type &input_, const long &irange_, const long &jrange_, + const 
long &krange_) + : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const long r) const { + if (std::is_same<TestLayout, Kokkos::LayoutRight>::value) { + long i = int(r / jrange); + long j = int(r - i * jrange); + for (int k = 0; k < krange; ++k) { + input(i, j, k) = 1; + } + } else if (std::is_same<TestLayout, Kokkos::LayoutLeft>::value) { + long k = int(r / jrange); + long j = int(r - k * jrange); + for (int i = 0; i < irange; ++i) { + input(i, j, k) = 1; + } + } + } + }; + + static double test_index_collapse_two(const unsigned int icount, + const unsigned int jcount, + const unsigned int kcount, + const long iter = 1) { + // This test refers to collapsing two dims while using the RangePolicy + view_type Atest("Atest", icount, jcount, kcount); + view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2); + using FunctorType = + RangePolicyCollapseTwo<execution_space, ScalarType, TestLayout>; + + long collapse_index_rangeA = 0; + long collapse_index_rangeB = 0; + if (std::is_same<TestLayout, Kokkos::LayoutRight>::value) { + collapse_index_rangeA = icount * jcount; + collapse_index_rangeB = (icount + 2) * (jcount + 2); + // std::cout << " LayoutRight " << std::endl; + } else if (std::is_same<TestLayout, Kokkos::LayoutLeft>::value) { + collapse_index_rangeA = kcount * jcount; + collapse_index_rangeB = (kcount + 2) * (jcount + 2); + // std::cout << " LayoutLeft " << std::endl; + } else { + std::cout << " LayoutRight or LayoutLeft required - will pass 0 as " + "range instead " + << std::endl; + exit(-1); + } + + Kokkos::RangePolicy<execution_space> policy(0, (collapse_index_rangeA)); + Kokkos::RangePolicy<execution_space> policy_initB(0, + (collapse_index_rangeB)); + + double dt_min = 0; + + Kokkos::parallel_for(policy, Init(Atest, icount, jcount, kcount)); + execution_space().fence(); + Kokkos::parallel_for(policy_initB, + Init(Btest, icount + 2, jcount + 2, kcount + 2)); + execution_space().fence(); + 
+ for (int i = 0; i < iter; ++i) { + Kokkos::Timer timer; + Kokkos::parallel_for(policy, + FunctorType(Atest, Btest, icount, jcount, kcount)); + execution_space().fence(); + const double dt = timer.seconds(); + if (0 == i) + dt_min = dt; + else + dt_min = dt < dt_min ? dt : dt_min; + + // Correctness check - first iteration only + if (0 == i) { + long numErrors = 0; + host_view_type Ahost("Ahost", icount, jcount, kcount); + Kokkos::deep_copy(Ahost, Atest); + host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); + Kokkos::deep_copy(Bhost, Btest); + + // On KNL, this may vectorize - add print statement to prevent + // Also, compare against epsilon, as vectorization can change bitwise + // answer + for (long l = 0; l < static_cast<long>(icount); ++l) { + for (long j = 0; j < static_cast<long>(jcount); ++j) { + for (long k = 0; k < static_cast<long>(kcount); ++k) { + ScalarType check = + 0.25 * (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + + Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + + Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + + Bhost(l, j, k)); + if (Ahost(l, j, k) - check != 0) { + ++numErrors; + std::cout << " Correctness error at index: " << l << "," << j + << "," << k << "\n" + << " flat Ahost = " << Ahost(l, j, k) + << " expected = " << check << std::endl; + // exit(-1); + } + } + } + } + if (numErrors != 0) { + std::cout << " RP collapse2: errors " << numErrors + << " range product " << icount * jcount * kcount << " LL " + << jcount * kcount << " LR " << icount * jcount + << std::endl; + } + // else { std::cout << " RP collapse2: Pass! 
" << std::endl; } + } + } + + return dt_min; + } +}; + +template <class DeviceType, typename ScalarType = double, + typename TestLayout = Kokkos::LayoutRight> +struct RangePolicyCollapseAll { + // RangePolicy for 3D range, but will collapse all dims + + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + using layout = TestLayout; + + using view_type = Kokkos::View<ScalarType ***, TestLayout, DeviceType>; + using host_view_type = typename view_type::HostMirror; + + view_type A; + view_type B; + const long irange; + const long jrange; + const long krange; + + RangePolicyCollapseAll(view_type &A_, const view_type &B_, + const long &irange_, const long &jrange_, + const long &krange_) + : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const long r) const { + if (std::is_same<TestLayout, Kokkos::LayoutRight>::value) { + long i = int(r / (jrange * krange)); + long j = int((r - i * jrange * krange) / krange); + long k = int(r - i * jrange * krange - j * krange); + A(i, j, k) = + 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + + B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + + B(i, j, k)); + } else if (std::is_same<TestLayout, Kokkos::LayoutLeft>::value) { + long k = int(r / (irange * jrange)); + long j = int((r - k * irange * jrange) / irange); + long i = int(r - k * irange * jrange - j * irange); + A(i, j, k) = + 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + + B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + + B(i, j, k)); + } + } + + struct Init { + view_type input; + const long irange; + const long jrange; + const long krange; + + Init(const view_type &input_, const long &irange_, const long &jrange_, + const long &krange_) + : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const long r) const { + if (std::is_same<TestLayout, 
Kokkos::LayoutRight>::value) { + long i = int(r / (jrange * krange)); + long j = int((r - i * jrange * krange) / krange); + long k = int(r - i * jrange * krange - j * krange); + input(i, j, k) = 1; + } else if (std::is_same<TestLayout, Kokkos::LayoutLeft>::value) { + long k = int(r / (irange * jrange)); + long j = int((r - k * irange * jrange) / irange); + long i = int(r - k * irange * jrange - j * irange); + input(i, j, k) = 1; + } + } + }; + + static double test_collapse_all(const unsigned int icount, + const unsigned int jcount, + const unsigned int kcount, + const long iter = 1) { + // This test refers to collapsing all dims using the RangePolicy + view_type Atest("Atest", icount, jcount, kcount); + view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2); + using FunctorType = + RangePolicyCollapseAll<execution_space, ScalarType, TestLayout>; + + const long flat_index_range = icount * jcount * kcount; + Kokkos::RangePolicy<execution_space> policy(0, flat_index_range); + Kokkos::RangePolicy<execution_space> policy_initB( + 0, (icount + 2) * (jcount + 2) * (kcount + 2)); + + double dt_min = 0; + + Kokkos::parallel_for(policy, Init(Atest, icount, jcount, kcount)); + execution_space().fence(); + Kokkos::parallel_for(policy_initB, + Init(Btest, icount + 2, jcount + 2, kcount + 2)); + execution_space().fence(); + + for (int i = 0; i < iter; ++i) { + Kokkos::Timer timer; + Kokkos::parallel_for(policy, + FunctorType(Atest, Btest, icount, jcount, kcount)); + execution_space().fence(); + const double dt = timer.seconds(); + if (0 == i) + dt_min = dt; + else + dt_min = dt < dt_min ? 
dt : dt_min; + + // Correctness check - first iteration only + if (0 == i) { + long numErrors = 0; + host_view_type Ahost("Ahost", icount, jcount, kcount); + Kokkos::deep_copy(Ahost, Atest); + host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); + Kokkos::deep_copy(Bhost, Btest); + + // On KNL, this may vectorize - add print statement to prevent + // Also, compare against epsilon, as vectorization can change bitwise + // answer + for (long l = 0; l < static_cast<long>(icount); ++l) { + for (long j = 0; j < static_cast<long>(jcount); ++j) { + for (long k = 0; k < static_cast<long>(kcount); ++k) { + ScalarType check = + 0.25 * (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + + Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + + Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + + Bhost(l, j, k)); + if (Ahost(l, j, k) - check != 0) { + ++numErrors; + std::cout << " Callapse ALL Correctness error at index: " << l + << "," << j << "," << k << "\n" + << " flat Ahost = " << Ahost(l, j, k) + << " expected = " << check << std::endl; + // exit(-1); + } + } + } + } + if (numErrors != 0) { + std::cout << " RP collapse all: errors " << numErrors + << " range product " << icount * jcount * kcount << " LL " + << jcount * kcount << " LR " << icount * jcount + << std::endl; + } + // else { std::cout << " RP collapse all: Pass! " << std::endl; } + } + } + + return dt_min; + } +}; + +} // end namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTestMain.cpp b/packages/kokkos/core/perf_test/PerfTestMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f4d48d57bf3f2c21ed78054a22abc6c4694bdcb --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTestMain.cpp @@ -0,0 +1,79 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include <cstdlib> + +#include <Kokkos_Core.hpp> + +namespace Test { +int command_line_num_args(int n = 0) { + static int n_args = 0; + if (n > 0) n_args = n; + return n_args; +} + +const char* command_line_arg(int k, char** input_args = nullptr) { + static char** args; + if (input_args != nullptr) args = input_args; + if (command_line_num_args() > k) + return args[k]; + else + return nullptr; +} + +} // namespace Test + +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + Kokkos::initialize(argc, argv); + + (void)Test::command_line_num_args(argc); + (void)Test::command_line_arg(0, argv); + + int result = RUN_ALL_TESTS(); + + Kokkos::finalize(); + return result; +} diff --git a/packages/kokkos/core/perf_test/PerfTest_Category.hpp b/packages/kokkos/core/perf_test/PerfTest_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0f24490bfeb77df3e86b4af14e9dcfdf5680efbb --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_Category.hpp @@ -0,0 +1,60 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_PERFTEST_CAT_HPP +#define KOKKOS_TEST_PERFTEST_CAT_HPP + +#include <gtest/gtest.h> + +namespace Test { + +extern int command_line_num_args(int n = 0); +extern const char* command_line_arg(int k, char** input_args = nullptr); + +} // namespace Test + +#define TEST_CATEGORY default_exec +#define TEST_EXECSPACE Kokkos::DefaultExecutionSpace + +#endif diff --git a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ab76d6e543996be34b40800c8da31022cd625cd --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp @@ -0,0 +1,138 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> +#include <PerfTest_Category.hpp> +#include <Kokkos_Random.hpp> + +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA +namespace Test { +template <class Scalar> +void custom_reduction_test(int N, int R, int num_trials) { + Kokkos::Random_XorShift64_Pool<> rand_pool(183291); + Kokkos::View<Scalar*> a("A", N); + Kokkos::fill_random(a, rand_pool, 1.0); + + Scalar max; + + int team_size = 32; + if (team_size > Kokkos::DefaultExecutionSpace::concurrency()) + team_size = Kokkos::DefaultExecutionSpace::concurrency(); + // Warm up + Kokkos::parallel_reduce( + Kokkos::TeamPolicy<>(N / 1024, team_size), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team, + Scalar& lmax) { + Scalar team_max = Scalar(0); + for (int rr = 0; rr < R; rr++) { + int i = team.league_rank(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 32), + [&](const int& j, Scalar& thread_max) { + Scalar t_max = Scalar(0); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, 32), + [&](const int& k, Scalar& max_) { + const Scalar 
val = a((i * 32 + j) * 32 + k); + if (val > max_) max_ = val; + if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5; + }, + Kokkos::Max<Scalar>(t_max)); + if (t_max > thread_max) thread_max = t_max; + }, + Kokkos::Max<Scalar>(team_max)); + } + if (team_max > lmax) lmax = team_max; + }, + Kokkos::Max<Scalar>(max)); + + // Timing + Kokkos::Timer timer; + for (int r = 0; r < num_trials; r++) { + Kokkos::parallel_reduce( + Kokkos::TeamPolicy<>(N / 1024, team_size), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team, + Scalar& lmax) { + Scalar team_max = Scalar(0); + for (int rr = 0; rr < R; rr++) { + int i = team.league_rank(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 32), + [&](const int& j, Scalar& thread_max) { + Scalar t_max = Scalar(0); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, 32), + [&](const int& k, Scalar& max_) { + const Scalar val = a((i * 32 + j) * 32 + k); + if (val > max_) max_ = val; + if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5; + }, + Kokkos::Max<Scalar>(t_max)); + if (t_max > thread_max) thread_max = t_max; + }, + Kokkos::Max<Scalar>(team_max)); + } + if (team_max > lmax) lmax = team_max; + }, + Kokkos::Max<Scalar>(max)); + } + double time = timer.seconds(); + printf("%e %e %e\n", time, + 1.0 * N * R * num_trials * sizeof(Scalar) / time / 1024 / 1024 / 1024, + max); +} + +TEST(default_exec, custom_reduction) { + int N = 100000; + int R = 1000; + int num_trials = 1; + + if (command_line_num_args() > 1) N = std::stoi(command_line_arg(1)); + if (command_line_num_args() > 2) R = std::stoi(command_line_arg(2)); + if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3)); + custom_reduction_test<double>(N, R, num_trials); +} +} // namespace Test +#endif diff --git a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..50bbc78a6b75815ad59ea73c0077dc27ae2dccfa --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -0,0 +1,632 @@ +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> +#include <PerfTest_Category.hpp> + +namespace Test { + +namespace { +template <class ExecSpace> +struct SpaceInstance { + static ExecSpace create() { return ExecSpace(); } + static void destroy(ExecSpace&) {} + static bool overlap() { return false; } +}; + +#ifndef KOKKOS_ENABLE_DEBUG +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct SpaceInstance<Kokkos::Cuda> { + static Kokkos::Cuda create() { + cudaStream_t stream; + cudaStreamCreate(&stream); + return Kokkos::Cuda(stream); + } + static void destroy(Kokkos::Cuda& space) { + cudaStream_t stream = space.cuda_stream(); + cudaStreamDestroy(stream); + } + static bool overlap() { + bool value = true; + auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING"); + if (local_rank_str) { + value = (std::stoi(local_rank_str) == 0); + } + return value; + } +}; +#endif +#endif +} // namespace + +struct FunctorRange { + int M, R; + Kokkos::View<double**, TEST_EXECSPACE> a; + FunctorRange(int M_, int R_, Kokkos::View<double**, TEST_EXECSPACE> a_) + : M(M_), R(R_), a(a_) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (int r = 0; r < R; r++) + for (int j = 0; j < M; j++) { + a(i, j) += 1.0; + } + } +}; + +struct FunctorMDRange { + int M, R; + Kokkos::View<double**, TEST_EXECSPACE> a; + FunctorMDRange(int M_, int R_, Kokkos::View<double**, TEST_EXECSPACE> a_) + : M(M_), R(R_), a(a_) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int) const { + for (int j = 0; j < M; j++) a(i, j) += 1.0; + } +}; + +struct FunctorTeam { + int M, R; + Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a; + FunctorTeam(int M_, int R_, + Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a_) + : M(M_), R(R_), a(a_) {} + KOKKOS_INLINE_FUNCTION + void 
operator()( + const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team) const { + int i = team.league_rank(); + for (int r = 0; r < R; r++) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), + [&](const int j) { a(i, j) += 1.0; }); + } + } +}; + +struct FunctorRangeReduce { + int M, R; + Kokkos::View<double**, TEST_EXECSPACE> a; + FunctorRangeReduce(int M_, int R_, Kokkos::View<double**, TEST_EXECSPACE> a_) + : M(M_), R(R_), a(a_) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int i, double& tmp) const { + for (int r = 0; r < R; r++) + for (int j = 0; j < M; j++) { + tmp += a(i, j); + } + } +}; + +struct FunctorMDRangeReduce { + int M, R; + Kokkos::View<double**, TEST_EXECSPACE> a; + FunctorMDRangeReduce(int M_, int R_, + Kokkos::View<double**, TEST_EXECSPACE> a_) + : M(M_), R(R_), a(a_) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int, double& tmp) const { + for (int j = 0; j < M; j++) tmp += a(i, j); + } +}; + +struct FunctorTeamReduce { + int M, R; + Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a; + FunctorTeamReduce( + int M_, int R_, + Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a_) + : M(M_), R(R_), a(a_) {} + KOKKOS_INLINE_FUNCTION + void operator()(const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team, + double& tmp) const { + int i = team.league_rank(); + for (int r = 0; r < R; r++) { + double val; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, M), + [&](const int j, double& tmp2) { tmp2 += a(i, j); }, val); + tmp += val; + } + } +}; + +TEST(default_exec, overlap_range_policy) { + int N = 2000; + int M = 10000; + int R = 10; + + TEST_EXECSPACE space; + TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create(); + TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create(); + + Kokkos::View<double**, TEST_EXECSPACE> a("A", N, M); + FunctorRange f(M, R, a); + FunctorRangeReduce fr(M, R, a); + Kokkos::parallel_for("default_exec::overlap_range_policy::kernel0", + 
Kokkos::RangePolicy<TEST_EXECSPACE>(0, N), + FunctorRange(M, R, a)); + + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel1", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel2", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + Kokkos::Timer timer; + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel3", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel4", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel5", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorRange(M, R, a)); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel6", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorRange(M, R, a)); + Kokkos::fence(); + double time_overlap = timer.seconds(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel7", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel8", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, 
N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + double time_end = timer.seconds(); + + if (SpaceInstance<TEST_EXECSPACE>::overlap()) { + ASSERT_TRUE((time_end > 1.5 * time_overlap)); + } + printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, + time_overlap); + + Kokkos::View<double, TEST_EXECSPACE> result("result"); + Kokkos::View<double, TEST_EXECSPACE> result1("result1"); + Kokkos::View<double, TEST_EXECSPACE> result2("result2"); + Kokkos::View<double, Kokkos::HostSpace> h_result("h_result"); + Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1"); + Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2"); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_fenced = timer.seconds(); + Kokkos::deep_copy(h_result, result); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + double time_not_fenced = timer.seconds(); + Kokkos::fence(); + if (SpaceInstance<TEST_EXECSPACE>::overlap()) { + ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced); + } + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + 
Kokkos::fence(); + double time_no_overlapped_reduce = timer.seconds(); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space1, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result1); + Kokkos::parallel_reduce( + "default_exec::overlap_range_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy<TEST_EXECSPACE>(space2, 0, N), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result2); + Kokkos::fence(); + double time_overlapped_reduce = timer.seconds(); + + Kokkos::deep_copy(h_result2, result2); + Kokkos::deep_copy(h_result1, result1); + + ASSERT_EQ(h_result1(), h_result()); + ASSERT_EQ(h_result2(), h_result()); + + if (SpaceInstance<TEST_EXECSPACE>::overlap()) { + ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + } + printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", + time_no_overlapped_reduce, time_overlapped_reduce); + SpaceInstance<TEST_EXECSPACE>::destroy(space1); + SpaceInstance<TEST_EXECSPACE>::destroy(space2); +} + +TEST(default_exec, overlap_mdrange_policy) { + int N = 200; + int M = 10000; + int R = 10; + + TEST_EXECSPACE space; + TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create(); + TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create(); + + Kokkos::View<double**, TEST_EXECSPACE> a("A", N, M); + FunctorMDRange f(M, R, a); + FunctorMDRangeReduce fr(M, R, a); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel0", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>({0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorMDRange(M, R, a)); + + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel1", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space1, {0, 0}, + 
{N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel2", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space2, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + Kokkos::Timer timer; + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel3", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel4", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel5", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space1, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorMDRange(M, R, a)); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel6", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space2, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorMDRange(M, R, a)); + Kokkos::fence(); + double time_overlap = timer.seconds(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel7", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel8", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0}, + {N, 
R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + double time_end = timer.seconds(); + + if (SpaceInstance<TEST_EXECSPACE>::overlap()) { + ASSERT_TRUE((time_end > 1.5 * time_overlap)); + } + printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, + time_overlap); + + Kokkos::View<double, TEST_EXECSPACE> result("result"); + Kokkos::View<double, TEST_EXECSPACE> result1("result1"); + Kokkos::View<double, TEST_EXECSPACE> result2("result2"); + Kokkos::View<double, Kokkos::HostSpace> h_result("h_result"); + Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1"); + Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2"); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_fenced = timer.seconds(); + Kokkos::deep_copy(h_result, result); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + double time_not_fenced = timer.seconds(); + Kokkos::fence(); + if (SpaceInstance<TEST_EXECSPACE>::overlap()) { + ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced); + } + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + 
Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_no_overlapped_reduce = timer.seconds(); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space1, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result1); + Kokkos::parallel_reduce( + "default_exec::overlap_mdrange_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space2, {0, 0}, + {N, R}), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result2); + Kokkos::fence(); + double time_overlapped_reduce = timer.seconds(); + + Kokkos::deep_copy(h_result2, result2); + Kokkos::deep_copy(h_result1, result1); + + ASSERT_EQ(h_result1(), h_result()); + ASSERT_EQ(h_result2(), h_result()); + + if (SpaceInstance<TEST_EXECSPACE>::overlap()) { + ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + } + printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", + time_no_overlapped_reduce, time_overlapped_reduce); + SpaceInstance<TEST_EXECSPACE>::destroy(space2); + SpaceInstance<TEST_EXECSPACE>::destroy(space1); +} + +TEST(default_exec, overlap_team_policy) { + int N = 20; + int M = 1000000; + int R = 10; + + TEST_EXECSPACE space; + TEST_EXECSPACE space1 = SpaceInstance<TEST_EXECSPACE>::create(); + TEST_EXECSPACE space2 = SpaceInstance<TEST_EXECSPACE>::create(); + + Kokkos::View<double**, Kokkos::LayoutRight, TEST_EXECSPACE> a("A", N, M); + FunctorTeam f(M, R, a); + FunctorTeamReduce fr(M, R, a); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel0", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(N, Kokkos::AUTO), + 
Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorTeam(M, R, a)); + + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel1", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel2", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + Kokkos::Timer timer; + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel3", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel4", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel5", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorTeam(M, R, a)); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel6", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorTeam(M, R, a)); + Kokkos::fence(); + double time_overlap = timer.seconds(); + + timer.reset(); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel7", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::parallel_for( + "default_exec::overlap_range_policy::kernel8", 
+ Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + f); + Kokkos::fence(); + double time_end = timer.seconds(); + + if (SpaceInstance<TEST_EXECSPACE>::overlap()) { + ASSERT_TRUE((time_end > 1.5 * time_overlap)); + } + printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, + time_overlap); + + Kokkos::View<double, TEST_EXECSPACE> result("result"); + Kokkos::View<double, TEST_EXECSPACE> result1("result1"); + Kokkos::View<double, TEST_EXECSPACE> result2("result2"); + Kokkos::View<double, Kokkos::HostSpace> h_result("h_result"); + Kokkos::View<double, Kokkos::HostSpace> h_result1("h_result1"); + Kokkos::View<double, Kokkos::HostSpace> h_result2("h_result2"); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_fenced = timer.seconds(); + Kokkos::deep_copy(h_result, result); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + double time_not_fenced = timer.seconds(); + Kokkos::fence(); + if (SpaceInstance<TEST_EXECSPACE>::overlap()) { + ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced); + } + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + 
Kokkos::TeamPolicy<TEST_EXECSPACE>(space, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result); + Kokkos::fence(); + double time_no_overlapped_reduce = timer.seconds(); + + timer.reset(); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space1, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result1); + Kokkos::parallel_reduce( + "default_exec::overlap_team_policy::kernel_reduce", + Kokkos::Experimental::require( + Kokkos::TeamPolicy<TEST_EXECSPACE>(space2, N, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + fr, result2); + Kokkos::fence(); + double time_overlapped_reduce = timer.seconds(); + + Kokkos::deep_copy(h_result2, result2); + Kokkos::deep_copy(h_result1, result1); + + ASSERT_EQ(h_result1(), h_result()); + ASSERT_EQ(h_result2(), h_result()); + + if (SpaceInstance<TEST_EXECSPACE>::overlap()) { + ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + } + printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", + time_no_overlapped_reduce, time_overlapped_reduce); + SpaceInstance<TEST_EXECSPACE>::destroy(space1); + SpaceInstance<TEST_EXECSPACE>::destroy(space2); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp new file mode 100644 index 0000000000000000000000000000000000000000..550316bec997121a58a8b44f6df8efdced16a623 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp @@ -0,0 +1,160 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> +#include <cstdio> +#include <PerfTest_Category.hpp> + +namespace Test { + +template <class Layout> +void run_allocateview_tests(int N, int R) { + const int N1 = N; + const int N2 = N * N; + const int N3 = N2 * N; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time1, time2, time3, time4, time5, time6, time7, time8, + time_raw = 100000.0; + { + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*, Layout> a("A1", N8); + } + time1 = timer.seconds() / R; + } + { + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double**, Layout> a("A2", N4, N4); + } + time2 = timer.seconds() / R; + } + { + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double***, Layout> a("A3", N3, N3, N2); + } + time3 = timer.seconds() / R; + } + { + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2); + } + time4 = timer.seconds() / R; + } + { + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2); + } + time5 = timer.seconds() / R; + } + { + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2); + } + time6 = timer.seconds() / R; + } + { + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1); + } + time7 = timer.seconds() / R; + } + { + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, + N1); + } + time8 = timer.seconds() / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + double* a_ptr = 
(double*)Kokkos::kokkos_malloc("A", sizeof(double) * N8); + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; }); + Kokkos::fence(); + Kokkos::kokkos_free(a_ptr); + } + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + size / 1024 / time_raw); + printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size, + size / 1024 / time1); + printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size, + size / 1024 / time2); + printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size, + size / 1024 / time3); + printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size, + size / 1024 / time4); + printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size, + size / 1024 / time5); + printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size, + size / 1024 / time6); + printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size, + size / 1024 / time7); + printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size, + size / 1024 / time8); +} + +TEST(default_exec, ViewCreate) { + printf("Create View Performance for LayoutLeft:\n"); + run_allocateview_tests<Kokkos::LayoutLeft>(10, 1); + printf("Create View Performance for LayoutRight:\n"); + run_allocateview_tests<Kokkos::LayoutRight>(10, 1); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy.hpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8e7bf25e809cf7649574c332af9bc35dfb0e1d94 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy.hpp @@ -0,0 +1,261 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> +#include <cstdio> +#include <PerfTest_Category.hpp> + +namespace Test { + +template <class ViewTypeA, class ViewTypeB> +double deepcopy_view(ViewTypeA& a, ViewTypeB& b, int repeat) { + Kokkos::Timer timer; + for (int i = 0; i < repeat; i++) { + Kokkos::deep_copy(a, b); + } + Kokkos::fence(); + return timer.seconds(); +} + +template <class LayoutA, class LayoutB> +void run_deepcopyview_tests123(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N3 = N2 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time1, time2, time3, time_raw = 100000.0; + { + Kokkos::View<double*, LayoutA> a("A1", N8); + Kokkos::View<double*, LayoutB> b("B1", N8); + time1 = deepcopy_view(a, b, R) / R; + } + { + Kokkos::View<double**, LayoutA> a("A2", N4, N4); + Kokkos::View<double**, LayoutB> b("B2", N4, N4); + time2 = deepcopy_view(a, b, R) / R; + } + { + Kokkos::View<double***, LayoutA> a("A3", N3, N3, N2); + Kokkos::View<double***, LayoutB> b("B3", N3, N3, N2); + time3 = deepcopy_view(a, b, R) / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, LayoutA> a("A1", N8); + Kokkos::View<double*, LayoutB> b("B1", N8); + double* const a_ptr = a.data(); + const double* const b_ptr = b.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = b_ptr[i]; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size, + 2.0 * size / 1024 / time1); + printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size, + 2.0 * size / 1024 / time2); + printf(" Rank3: %lf s 
%lf MB %lf GB/s\n", time3, size, + 2.0 * size / 1024 / time3); +} + +template <class LayoutA, class LayoutB> +void run_deepcopyview_tests45(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time4, time5, time_raw = 100000.0; + { + Kokkos::View<double****, LayoutA> a("A4", N2, N2, N2, N2); + Kokkos::View<double****, LayoutB> b("B4", N2, N2, N2, N2); + time4 = deepcopy_view(a, b, R) / R; + } + { + Kokkos::View<double*****, LayoutA> a("A5", N2, N2, N1, N1, N2); + Kokkos::View<double*****, LayoutB> b("B5", N2, N2, N1, N1, N2); + time5 = deepcopy_view(a, b, R) / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, LayoutA> a("A1", N8); + Kokkos::View<double*, LayoutB> b("B1", N8); + double* const a_ptr = a.data(); + const double* const b_ptr = b.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = b_ptr[i]; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size, + 2.0 * size / 1024 / time4); + printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size, + 2.0 * size / 1024 / time5); +} + +template <class LayoutA, class LayoutB> +void run_deepcopyview_tests6(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time6, time_raw = 100000.0; + { + Kokkos::View<double******, LayoutA> a("A6", N2, N1, N1, N1, N1, N2); + Kokkos::View<double******, LayoutB> b("B6", N2, N1, N1, N1, N1, N2); + time6 = deepcopy_view(a, b, R) / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, LayoutA> a("A1", N8); + Kokkos::View<double*, LayoutB> b("B1", N8); + 
double* const a_ptr = a.data(); + const double* const b_ptr = b.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = b_ptr[i]; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size, + 2.0 * size / 1024 / time6); +} + +template <class LayoutA, class LayoutB> +void run_deepcopyview_tests7(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time7, time_raw = 100000.0; + { + Kokkos::View<double*******, LayoutA> a("A7", N2, N1, N1, N1, N1, N1, N1); + Kokkos::View<double*******, LayoutB> b("B7", N2, N1, N1, N1, N1, N1, N1); + time7 = deepcopy_view(a, b, R) / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, LayoutA> a("A1", N8); + Kokkos::View<double*, LayoutB> b("B1", N8); + double* const a_ptr = a.data(); + const double* const b_ptr = b.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = b_ptr[i]; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size, + 2.0 * size / 1024 / time7); +} + +template <class LayoutA, class LayoutB> +void run_deepcopyview_tests8(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time8, time_raw = 100000.0; + { + Kokkos::View<double********, LayoutA> a("A8", N1, N1, N1, N1, N1, N1, N1, + N1); + Kokkos::View<double********, LayoutB> b("B8", N1, N1, N1, N1, N1, N1, N1, + 
N1); + time8 = deepcopy_view(a, b, R) / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, LayoutA> a("A1", N8); + Kokkos::View<double*, LayoutB> b("B1", N8); + double* const a_ptr = a.data(); + const double* const b_ptr = b.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = b_ptr[i]; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size, + 2.0 * size / 1024 / time8); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a123.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dceef801aa8bc295403505c16e8d664d61541bf7 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a123.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftLeft_Rank123) { + printf("DeepCopy Performance for LayoutLeft to LayoutLeft:\n"); + run_deepcopyview_tests123<Kokkos::LayoutLeft, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a45.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f9b694461211933cf4fffc7424b6fa38c47af02 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a45.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftLeft_Rank45) { + printf("DeepCopy Performance for LayoutLeft to LayoutLeft:\n"); + run_deepcopyview_tests45<Kokkos::LayoutLeft, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a6.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac364c31cbc53a002334811817220558cbc225be --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a6.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftLeft_Rank6) { + printf("DeepCopy Performance for LayoutLeft to LayoutLeft:\n"); + run_deepcopyview_tests6<Kokkos::LayoutLeft, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a7.cpp new file mode 100644 index 0000000000000000000000000000000000000000..94f30bac9fb529de334344a48dd20539751a4a6e --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a7.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftLeft_Rank7) { + printf("DeepCopy Performance for LayoutLeft to LayoutLeft:\n"); + run_deepcopyview_tests7<Kokkos::LayoutLeft, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b916169f1b732843d2070232885cfeb87b2e64e1 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_a8.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftLeft_Rank8) { + printf("DeepCopy Performance for LayoutLeft to LayoutLeft:\n"); + run_deepcopyview_tests8<Kokkos::LayoutLeft, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b123.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f314cb0ff43c4d1e68e81574e273b7411c25118a --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b123.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightRight_Rank123) { + printf("DeepCopy Performance for LayoutRight to LayoutRight:\n"); + run_deepcopyview_tests123<Kokkos::LayoutRight, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b45.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5d06f060afa91d7be89c140613f1ef93f9e1d46a --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b45.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightRight_Rank45) { + printf("DeepCopy Performance for LayoutRight to LayoutRight:\n"); + run_deepcopyview_tests45<Kokkos::LayoutRight, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b6.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0e28fee631eb29ec1d70a38b834fc0744d45bb04 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b6.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightRight_Rank6) { + printf("DeepCopy Performance for LayoutRight to LayoutRight:\n"); + run_deepcopyview_tests6<Kokkos::LayoutRight, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b7.cpp new file mode 100644 index 0000000000000000000000000000000000000000..37e1325fc4c0b1928dcc160e8c0a12cd364830ec --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b7.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightRight_Rank7) { + printf("DeepCopy Performance for LayoutRight to LayoutRight:\n"); + run_deepcopyview_tests7<Kokkos::LayoutRight, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..986c39aaf447d4f6fc04e54b06e5222007e4ace9 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_b8.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightRight_Rank8) { + printf("DeepCopy Performance for LayoutRight to LayoutRight:\n"); + run_deepcopyview_tests8<Kokkos::LayoutRight, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c123.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b98563ee429564d1944a45426f3fd5afbc7e2e3d --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c123.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftRight_Rank123) { + printf("DeepCopy Performance for LayoutLeft to LayoutRight:\n"); + run_deepcopyview_tests123<Kokkos::LayoutLeft, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c45.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a0ef11e09b8736009afcebb6fe5e3070fadcbbe0 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c45.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftRight_Rank45) { + printf("DeepCopy Performance for LayoutLeft to LayoutRight:\n"); + run_deepcopyview_tests45<Kokkos::LayoutLeft, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c6.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fea5dde73adf69ec45bd8d6549882102dd4cbddc --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c6.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftRight_Rank6) { + printf("DeepCopy Performance for LayoutLeft to LayoutRight:\n"); + run_deepcopyview_tests6<Kokkos::LayoutLeft, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c7.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a8c8d866f927a40f7d6b29a759a6fadeda7c96e4 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c7.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftRight_Rank7) { + printf("DeepCopy Performance for LayoutLeft to LayoutRight:\n"); + run_deepcopyview_tests7<Kokkos::LayoutLeft, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e5abdaa5d81f9e147a9180b54a0a897b40ee08a0 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_c8.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_LeftRight_Rank8) { + printf("DeepCopy Performance for LayoutLeft to LayoutRight:\n"); + run_deepcopyview_tests8<Kokkos::LayoutLeft, Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d123.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2b58f8dd1f6b2554f2aeaa89052e0d52403da118 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d123.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightLeft_Rank123) { + printf("DeepCopy Performance for LayoutRight to LayoutLeft:\n"); + run_deepcopyview_tests123<Kokkos::LayoutRight, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d45.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fe34e4fd1a84a1960a37893bc0e8dfd26ed0a42f --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d45.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightLeft_Rank45) { + printf("DeepCopy Performance for LayoutRight to LayoutLeft:\n"); + run_deepcopyview_tests45<Kokkos::LayoutRight, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d6.cpp new file mode 100644 index 0000000000000000000000000000000000000000..115b223e68b88b297d96cfe9fe6db7fdd9c5591c --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d6.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightLeft_Rank6) { + printf("DeepCopy Performance for LayoutRight to LayoutLeft:\n"); + run_deepcopyview_tests6<Kokkos::LayoutRight, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d7.cpp new file mode 100644 index 0000000000000000000000000000000000000000..51e88795e73a9b802c6410a99e63190c1cdb91eb --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d7.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightLeft_Rank7) { + printf("DeepCopy Performance for LayoutRight to LayoutLeft:\n"); + run_deepcopyview_tests7<Kokkos::LayoutRight, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2a53cdef213079833e3c1c4b48b760e643dc7f23 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy_d8.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewCopy.hpp> +namespace Test { +TEST(default_exec, ViewDeepCopy_RightLeft_Rank8) { + printf("DeepCopy Performance for LayoutRight to LayoutLeft:\n"); + run_deepcopyview_tests8<Kokkos::LayoutRight, Kokkos::LayoutLeft>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill.hpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill.hpp new file mode 100644 index 0000000000000000000000000000000000000000..38be4bb212c8527ccb246712dc5c2d8b32d53a56 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill.hpp @@ -0,0 +1,243 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> +#include <cstdio> +#include <PerfTest_Category.hpp> + +namespace Test { + +template <class ViewType> +double fill_view(ViewType& a, typename ViewType::const_value_type& val, + int repeat) { + Kokkos::Timer timer; + for (int i = 0; i < repeat; i++) { + Kokkos::deep_copy(a, val); + } + Kokkos::fence(); + return timer.seconds(); +} + +template <class Layout> +void run_fillview_tests123(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N3 = N2 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time1, time2, time3, time_raw = 100000.0; + { + Kokkos::View<double*, Layout> a("A1", N8); + time1 = fill_view(a, 1.1, R) / R; + } + { + Kokkos::View<double**, Layout> a("A2", N4, N4); + time2 = fill_view(a, 1.1, R) / R; + } + { + Kokkos::View<double***, Layout> a("A3", N3, N3, N2); + time3 = fill_view(a, 1.1, R) / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + 
Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + size / 1024 / time_raw); + printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size, + size / 1024 / time1); + printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size, + size / 1024 / time2); + printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size, + size / 1024 / time3); +} + +template <class Layout> +void run_fillview_tests45(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time4, time5, time_raw = 100000.0; + { + Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2); + time4 = fill_view(a, 1.1, R) / R; + } + { + Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2); + time5 = fill_view(a, 1.1, R) / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + size / 1024 / time_raw); + printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size, + size / 1024 / time4); + printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size, + size / 1024 / time5); +} + +template <class Layout> +void run_fillview_tests6(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time6, time_raw = 100000.0; + { + Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2); + time6 = fill_view(a, 1.1, R) / R; + } +#if 
defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + size / 1024 / time_raw); + printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size, + size / 1024 / time6); +} + +template <class Layout> +void run_fillview_tests7(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time7, time_raw = 100000.0; + { + Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1); + time7 = fill_view(a, 1.1, R) / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + size / 1024 / time_raw); + printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size, + size / 1024 / time7); +} + +template <class Layout> +void run_fillview_tests8(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time8, time_raw = 100000.0; + { + Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, + N1); + time8 = fill_view(a, 1.1, R) / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + 
Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 1.1; }); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + size / 1024 / time_raw); + printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size, + size / 1024 / time8); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_123.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0bf8a28329e8993282e407955fe96b5caaabb5e3 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_123.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewFill.hpp> + +namespace Test { +TEST(default_exec, ViewFill_Rank123) { + printf("ViewFill Performance for LayoutLeft:\n"); + run_fillview_tests123<Kokkos::LayoutLeft>(10, 1); + printf("ViewFill Performance for LayoutRight:\n"); + run_fillview_tests123<Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_45.cpp new file mode 100644 index 0000000000000000000000000000000000000000..53ac509da804c3fa6766f956fe9e2eabdbc27e01 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_45.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewFill.hpp> + +namespace Test { +TEST(default_exec, ViewFill_Rank45) { + printf("ViewFill Performance for LayoutLeft:\n"); + run_fillview_tests45<Kokkos::LayoutLeft>(10, 1); + printf("ViewFill Performance for LayoutRight:\n"); + run_fillview_tests45<Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_6.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f0a2e248f238542084917497dec7c203a254a6fc --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_6.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewFill.hpp> + +namespace Test { +TEST(default_exec, ViewFill_Rank6) { + printf("ViewFill Performance for LayoutLeft:\n"); + run_fillview_tests6<Kokkos::LayoutLeft>(10, 1); + printf("ViewFill Performance for LayoutRight:\n"); + run_fillview_tests6<Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_7.cpp new file mode 100644 index 0000000000000000000000000000000000000000..675d9e636f27577b1deaa9202de5f21770163f8d --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_7.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewFill.hpp> + +namespace Test { +TEST(default_exec, ViewFill_Rank7) { + printf("ViewFill Performance for LayoutLeft:\n"); + run_fillview_tests7<Kokkos::LayoutLeft>(10, 1); + printf("ViewFill Performance for LayoutRight:\n"); + run_fillview_tests7<Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewFill_8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewFill_8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35e1e81c43dbc6e36f6760694529d3b175b8d7d1 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewFill_8.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewFill.hpp> + +namespace Test { +TEST(default_exec, ViewFill_Rank8) { + printf("ViewFill Performance for LayoutLeft:\n"); + run_fillview_tests8<Kokkos::LayoutLeft>(10, 1); + printf("ViewFill Performance for LayoutRight:\n"); + run_fillview_tests8<Kokkos::LayoutRight>(10, 1); +} +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize.hpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize.hpp new file mode 100644 index 0000000000000000000000000000000000000000..66a631e3890359bb32fa6f0e2055e1f8d9bb9309 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize.hpp @@ -0,0 +1,388 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> +#include <cstdio> +#include <PerfTest_Category.hpp> + +namespace Test { + +template <class Layout> +void run_resizeview_tests123(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N3 = N2 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time1, time2, time3, time_raw = 100000.0; + double time1_noinit, time2_noinit, time3_noinit; + { + Kokkos::View<double*, Layout> a("A1", N8); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*, Layout> a_(a); + Kokkos::resize(a_, int(N8 * 1.1)); + } + time1 = timer.seconds() / R; + } + { + Kokkos::View<double**, Layout> a("A2", N4, N4); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double**, Layout> a_(a); + Kokkos::resize(a_, int(N4 * 1.1), N4); + } + time2 = timer.seconds() / R; + } + { + Kokkos::View<double***, Layout> a("A3", N3, N3, N2); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double***, Layout> a_(a); + Kokkos::resize(a_, int(N3 * 1.1), N3, N2); + } + time3 = timer.seconds() / R; + } + { + Kokkos::View<double*, Layout> a("A1", N8); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*, Layout> a_(a); + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N8 * 1.1)); + } + time1_noinit = timer.seconds() / R; + } + { + Kokkos::View<double**, Layout> a("A2", N4, N4); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double**, Layout> a_(a); + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N4 * 1.1), N4); + } + time2_noinit = timer.seconds() / R; + } + { + Kokkos::View<double***, Layout> a("A3", N3, N3, N2); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double***, Layout> a_(a); + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N3 * 1.1), N3, N2); + } + 
time3_noinit = timer.seconds() / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*, Layout> a1( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); + double* a1_ptr = a1.data(); + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); + Kokkos::fence(); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank1: %lf s %lf MB %lf GB/s\n", time1, size, + 2.0 * size / 1024 / time1); + printf(" Rank2: %lf s %lf MB %lf GB/s\n", time2, size, + 2.0 * size / 1024 / time2); + printf(" Rank3: %lf s %lf MB %lf GB/s\n", time3, size, + 2.0 * size / 1024 / time3); + printf(" Rank1 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", + time1_noinit, size, 2.0 * size / 1024 / time1_noinit); + printf(" Rank2 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", + time2_noinit, size, 2.0 * size / 1024 / time2_noinit); + printf(" Rank3 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", + time3_noinit, size, 2.0 * size / 1024 / time3_noinit); +} + +template <class Layout> +void run_resizeview_tests45(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time4, time5, time_raw = 100000.0; + double time4_noinit, time5_noinit; + { + Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double****, Layout> a_(a); + Kokkos::resize(a_, int(N2 * 1.1), N2, N2, N2); + } + time4 = timer.seconds() / R; + } + { + Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*****, Layout> a_(a); + 
Kokkos::resize(a_, int(N2 * 1.1), N2, N1, N1, N2); + } + time5 = timer.seconds() / R; + } + { + Kokkos::View<double****, Layout> a("A4", N2, N2, N2, N2); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double****, Layout> a_(a); + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N2, + N2); + } + time4_noinit = timer.seconds() / R; + } + { + Kokkos::View<double*****, Layout> a("A5", N2, N2, N1, N1, N2); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*****, Layout> a_(a); + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N2, N1, N1, + N2); + } + time5_noinit = timer.seconds() / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*, Layout> a1( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); + double* a1_ptr = a1.data(); + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); + Kokkos::fence(); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank4: %lf s %lf MB %lf GB/s\n", time4, size, + 2.0 * size / 1024 / time4); + printf(" Rank5: %lf s %lf MB %lf GB/s\n", time5, size, + 2.0 * size / 1024 / time5); + printf(" Rank4 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", + time4_noinit, size, 2.0 * size / 1024 / time4_noinit); + printf(" Rank5 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", + time5_noinit, size, 2.0 * size / 1024 / time5_noinit); +} + +template <class Layout> +void run_resizeview_tests6(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time6, time6_noinit, time_raw = 100000.0; + { + 
Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double******, Layout> a_(a); + Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N2); + } + time6 = timer.seconds() / R; + } + { + Kokkos::View<double******, Layout> a("A6", N2, N1, N1, N1, N1, N2); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double******, Layout> a_(a); + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1, + N1, N2); + } + time6_noinit = timer.seconds() / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*, Layout> a1( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); + double* a1_ptr = a1.data(); + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); + Kokkos::fence(); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank6: %lf s %lf MB %lf GB/s\n", time6, size, + 2.0 * size / 1024 / time6); + printf(" Rank6 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", + time6_noinit, size, 2.0 * size / 1024 / time6_noinit); +} + +template <class Layout> +void run_resizeview_tests7(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time7, time7_noinit, time_raw = 100000.0; + { + Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, N1, N1, N1, N1); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*******, Layout> a_(a); + Kokkos::resize(a_, int(N2 * 1.1), N1, N1, N1, N1, N1, N1); + } + time7 = timer.seconds() / R; + } + { + Kokkos::View<double*******, Layout> a("A7", N2, N1, N1, 
N1, N1, N1, N1); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*******, Layout> a_(a); + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N2 * 1.1), N1, N1, N1, + N1, N1, N1); + } + time7_noinit = timer.seconds() / R; + } +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*, Layout> a1( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); + double* a1_ptr = a1.data(); + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); + Kokkos::fence(); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank7: %lf s %lf MB %lf GB/s\n", time7, size, + 2.0 * size / 1024 / time7); + printf(" Rank7 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", + time7_noinit, size, 2.0 * size / 1024 / time7_noinit); +} + +template <class Layout> +void run_resizeview_tests8(int N, int R) { + const int N1 = N; + const int N2 = N1 * N1; + const int N4 = N2 * N2; + const int N8 = N4 * N4; + + double time8, time8_noinit, time_raw = 100000.0; + { + Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, + N1); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double********, Layout> a_(a); + Kokkos::resize(a_, int(N1 * 1.1), N1, N1, N1, N1, N1, N1, N1); + } + time8 = timer.seconds() / R; + } + { + Kokkos::View<double********, Layout> a("A8", N1, N1, N1, N1, N1, N1, N1, + N1); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double********, Layout> a_(a); + Kokkos::resize(Kokkos::WithoutInitializing, a_, int(N1 * 1.1), N1, N1, N1, + N1, N1, N1, N1); + } + time8_noinit = timer.seconds() / R; + } +#if 
defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) + { + Kokkos::View<double*, Layout> a("A1", N8); + double* a_ptr = a.data(); + Kokkos::Timer timer; + for (int r = 0; r < R; r++) { + Kokkos::View<double*, Layout> a1( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A1"), int(N8 * 1.1)); + double* a1_ptr = a1.data(); + Kokkos::parallel_for( + N8, KOKKOS_LAMBDA(const int& i) { a1_ptr[i] = a_ptr[i]; }); + Kokkos::fence(); + } + Kokkos::fence(); + time_raw = timer.seconds() / R; + } +#endif + double size = 1.0 * N8 * 8 / 1024 / 1024; + printf(" Raw: %lf s %lf MB %lf GB/s\n", time_raw, size, + 2.0 * size / 1024 / time_raw); + printf(" Rank8: %lf s %lf MB %lf GB/s\n", time8, size, + 2.0 * size / 1024 / time8); + printf(" Rank8 (WithoutInitializing): %lf s %lf MB %lf GB/s\n", + time8_noinit, size, 2.0 * size / 1024 / time8_noinit); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_123.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_123.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1dc4f285f56479b4987a0e3adf39664d8590ddc0 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_123.cpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewResize.hpp> + +namespace Test { + +TEST(default_exec, ViewResize_Rank123) { + printf("Resize View Performance for LayoutLeft:\n"); + run_resizeview_tests123<Kokkos::LayoutLeft>(10, 1); + printf("Resize View Performance for LayoutRight:\n"); + run_resizeview_tests123<Kokkos::LayoutRight>(10, 1); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_45.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_45.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3754a5bb147b8d5a057d3c385179aa7645d63e11 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_45.cpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewResize.hpp> + +namespace Test { + +TEST(default_exec, ViewResize_Rank_45) { + printf("Resize View Performance for LayoutLeft:\n"); + run_resizeview_tests45<Kokkos::LayoutLeft>(10, 1); + printf("Resize View Performance for LayoutRight:\n"); + run_resizeview_tests45<Kokkos::LayoutRight>(10, 1); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_6.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_6.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1b8d6fbc8a04528f6ea30d6c52bd17e55ff2a683 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_6.cpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewResize.hpp> + +namespace Test { + +TEST(default_exec, ViewResize_Rank6) { + printf("Resize View Performance for LayoutLeft:\n"); + run_resizeview_tests6<Kokkos::LayoutLeft>(10, 1); + printf("Resize View Performance for LayoutRight:\n"); + run_resizeview_tests6<Kokkos::LayoutRight>(10, 1); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_7.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_7.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f8efa195fc8e90eb9cbbde39a002b142192800a6 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_7.cpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewResize.hpp> + +namespace Test { + +TEST(default_exec, ViewResize_Rank7) { + printf("Resize View Performance for LayoutLeft:\n"); + run_resizeview_tests7<Kokkos::LayoutLeft>(10, 1); + printf("Resize View Performance for LayoutRight:\n"); + run_resizeview_tests7<Kokkos::LayoutRight>(10, 1); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..afeeb643569ecd1a981132bed08944288ec3ca72 --- /dev/null +++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <PerfTest_ViewResize.hpp> + +namespace Test { + +TEST(default_exec, ViewResize_Rank8) { + printf("Resize View Performance for LayoutLeft:\n"); + run_resizeview_tests8<Kokkos::LayoutLeft>(10, 1); + printf("Resize View Performance for LayoutRight:\n"); + run_resizeview_tests8<Kokkos::LayoutRight>(10, 1); +} + +} // namespace Test diff --git a/packages/kokkos/core/perf_test/run_mempool.sh b/packages/kokkos/core/perf_test/run_mempool.sh new file mode 100755 index 0000000000000000000000000000000000000000..e9b42c5a53fad43e2fc486fbf500660d1e932c97 --- /dev/null +++ b/packages/kokkos/core/perf_test/run_mempool.sh @@ -0,0 +1,25 @@ +#!/bin/bash -e +NT=$1 +PROG="./KokkosCore_PerformanceTest_Mempool" +COMMON_ARGS="--kokkos-threads=$NT --fill_stride=1 --fill_level=70 --chunk_span=5 --repeat_inner=100" + +postproc() { +cat log | head -n 1 | rev | cut -d ' ' -f 1 | rev >> xvals +cat log | tail -n 1 | rev | cut -d ' ' -f 1 | rev >> yvals +} + +for yset in 1 2 3 +do + rm -f xvals yvals + for x in 1 2 4 8 16 32 + do + echo "yset $yset x factor $x" + $PROG $COMMON_ARGS --alloc_size=`expr $x \* 1000000` --super_size=`expr $x \* 100000` > log + postproc + done + rm -f yvals$yset + mv yvals yvals$yset +done + +rm -f datapoints +paste -d',' xvals yvals1 yvals2 yvals3 > datapoints diff --git a/packages/kokkos/core/perf_test/run_mempool_fill.sh b/packages/kokkos/core/perf_test/run_mempool_fill.sh new file mode 100755 index 0000000000000000000000000000000000000000..cdd756b4873915a99d4531e260704640e7749fee --- /dev/null +++ b/packages/kokkos/core/perf_test/run_mempool_fill.sh @@ -0,0 +1,21 @@ +#!/bin/bash -e +NT=$1 +PROG="./KokkosCore_PerformanceTest_Mempool" +COMMON_ARGS="--kokkos-threads=$NT --fill_stride=1 --alloc_size=10027008 --super_size=65536 --repeat_inner=100 --chunk_span=4 --repeat_outer=10" + +postproc() { +cat log | grep "fill ops per second" | 
rev | cut -d ' ' -f 2 | rev >> yvals_fill +cat log | grep "cycle ops per second" | rev | cut -d ' ' -f 2 | rev >> yvals_cycle +} + +rm -f xvals yvals_fill yvals_cycle +for x in 75 95 +do + echo "test fill level $x" + echo $x >> xvals + $PROG $COMMON_ARGS --fill_level=$x 2>&1 | tee log + postproc +done + +rm -f datapoints.txt +paste xvals yvals_fill yvals_cycle > datapoints.txt diff --git a/packages/kokkos/core/perf_test/run_taskdag.sh b/packages/kokkos/core/perf_test/run_taskdag.sh new file mode 100755 index 0000000000000000000000000000000000000000..dcb016c9d54cc5a8111f07b47c6d769098681253 --- /dev/null +++ b/packages/kokkos/core/perf_test/run_taskdag.sh @@ -0,0 +1,21 @@ +#!/bin/bash -e +NT=$1 +PROG="./KokkosCore_PerformanceTest_TaskDAG" +COMMON_ARGS="--kokkos-threads=$NT --alloc_size=10027008 --super_size=65536 --repeat_outer=10" + +postproc() { +cat log | grep "tasks per second" | rev | cut -d ' ' -f 2 | rev >> yvals +} + +rm -f xvals yvals +for x in 21 23 +do + echo "test input $x" + echo $x >> xvals + $PROG $COMMON_ARGS --input=$x 2>&1 | tee log + postproc +done + +rm -f datapoints.txt +paste xvals yvals > datapoints.txt + diff --git a/packages/kokkos/core/perf_test/test_atomic.cpp b/packages/kokkos/core/perf_test/test_atomic.cpp new file mode 100644 index 0000000000000000000000000000000000000000..59820f3bdd2e83291dfd524325d3b7be6ba918ef --- /dev/null +++ b/packages/kokkos/core/perf_test/test_atomic.cpp @@ -0,0 +1,501 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> +#include <cstring> +#include <cstdlib> + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> + +using exec_space = Kokkos::DefaultExecutionSpace; + +#define RESET 0 +#define BRIGHT 1 +#define DIM 2 +#define UNDERLINE 3 +#define BLINK 4 +#define REVERSE 7 +#define HIDDEN 8 + +#define BLACK 0 +#define RED 1 +#define GREEN 2 +#define YELLOW 3 +#define BLUE 4 +#define MAGENTA 5 +#define CYAN 6 +#define GREY 7 +#define WHITE 8 + +void textcolor(int attr, int fg, int bg) { + char command[40]; + + /* Command is the control command to the terminal */ + sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40); + printf("%s", command); +} +void textcolor_standard() { textcolor(RESET, BLACK, WHITE); } + +template <class T, class DEVICE_TYPE> +struct ZeroFunctor { + using execution_space = DEVICE_TYPE; + using type = typename Kokkos::View<T, execution_space>; + using h_type = typename Kokkos::View<T, execution_space>::HostMirror; + type data; + KOKKOS_INLINE_FUNCTION + void operator()(int) const { data() = 0; } +}; + +//--------------------------------------------------- +//--------------atomic_fetch_add--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct AddFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_add(&data(), (T)1); } +}; + +template <class T> +T AddLoop(int loop) { + struct ZeroFunctor<T, exec_space> f_zero; + typename ZeroFunctor<T, exec_space>::type data("Data"); + typename ZeroFunctor<T, exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1, f_zero); + exec_space().fence(); + + struct AddFunctor<T, exec_space> f_add; + f_add.data = data; + Kokkos::parallel_for(loop, 
f_add); + exec_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + return val; +} + +template <class T, class DEVICE_TYPE> +struct AddNonAtomicFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { data() += (T)1; } +}; + +template <class T> +T AddLoopNonAtomic(int loop) { + struct ZeroFunctor<T, exec_space> f_zero; + typename ZeroFunctor<T, exec_space>::type data("Data"); + typename ZeroFunctor<T, exec_space>::h_type h_data("HData"); + + f_zero.data = data; + Kokkos::parallel_for(1, f_zero); + exec_space().fence(); + + struct AddNonAtomicFunctor<T, exec_space> f_add; + f_add.data = data; + Kokkos::parallel_for(loop, f_add); + exec_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T AddLoopSerial(int loop) { + T* data = new T[1]; + data[0] = 0; + + for (int i = 0; i < loop; i++) *data += (T)1; + + T val = *data; + delete[] data; + return val; +} + +template <class T, class DEVICE_TYPE> +struct CASFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + T old = data(); + T newval, assumed; + do { + assumed = old; + newval = assumed + (T)1; + old = Kokkos::atomic_compare_exchange(&data(), assumed, newval); + } while (old != assumed); + } +}; + +template <class T> +T CASLoop(int loop) { + struct ZeroFunctor<T, exec_space> f_zero; + typename ZeroFunctor<T, exec_space>::type data("Data"); + typename ZeroFunctor<T, exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1, f_zero); + exec_space().fence(); + + struct CASFunctor<T, exec_space> f_cas; + f_cas.data = data; + Kokkos::parallel_for(loop, f_cas); + exec_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T, class 
DEVICE_TYPE> +struct CASNonAtomicFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + volatile T assumed; + volatile T newval; + bool fail = 1; + do { + assumed = data(); + newval = assumed + (T)1; + if (data() == assumed) { + data() = newval; + fail = 0; + } + } while (fail); + } +}; + +template <class T> +T CASLoopNonAtomic(int loop) { + struct ZeroFunctor<T, exec_space> f_zero; + typename ZeroFunctor<T, exec_space>::type data("Data"); + typename ZeroFunctor<T, exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1, f_zero); + exec_space().fence(); + + struct CASNonAtomicFunctor<T, exec_space> f_cas; + f_cas.data = data; + Kokkos::parallel_for(loop, f_cas); + exec_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T CASLoopSerial(int loop) { + T* data = new T[1]; + data[0] = 0; + + for (int i = 0; i < loop; i++) { + T assumed; + T newval; + T old; + do { + assumed = *data; + newval = assumed + (T)1; + old = *data; + *data = newval; + } while (!(assumed == old)); + } + + T val = *data; + delete[] data; + return val; +} + +template <class T, class DEVICE_TYPE> +struct ExchFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + type data, data2; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + T old = Kokkos::atomic_exchange(&data(), (T)i); + Kokkos::atomic_fetch_add(&data2(), old); + } +}; + +template <class T> +T ExchLoop(int loop) { + struct ZeroFunctor<T, exec_space> f_zero; + typename ZeroFunctor<T, exec_space>::type data("Data"); + typename ZeroFunctor<T, exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1, f_zero); + exec_space().fence(); + + typename ZeroFunctor<T, exec_space>::type data2("Data"); + typename ZeroFunctor<T, exec_space>::h_type h_data2("HData"); + 
f_zero.data = data2; + Kokkos::parallel_for(1, f_zero); + exec_space().fence(); + + struct ExchFunctor<T, exec_space> f_exch; + f_exch.data = data; + f_exch.data2 = data2; + Kokkos::parallel_for(loop, f_exch); + exec_space().fence(); + + Kokkos::deep_copy(h_data, data); + Kokkos::deep_copy(h_data2, data2); + T val = h_data() + h_data2(); + + return val; +} + +template <class T, class DEVICE_TYPE> +struct ExchNonAtomicFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + type data, data2; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + T old = data(); + data() = (T)i; + data2() += old; + } +}; + +template <class T> +T ExchLoopNonAtomic(int loop) { + struct ZeroFunctor<T, exec_space> f_zero; + typename ZeroFunctor<T, exec_space>::type data("Data"); + typename ZeroFunctor<T, exec_space>::h_type h_data("HData"); + f_zero.data = data; + Kokkos::parallel_for(1, f_zero); + exec_space().fence(); + + typename ZeroFunctor<T, exec_space>::type data2("Data"); + typename ZeroFunctor<T, exec_space>::h_type h_data2("HData"); + f_zero.data = data2; + Kokkos::parallel_for(1, f_zero); + exec_space().fence(); + + struct ExchNonAtomicFunctor<T, exec_space> f_exch; + f_exch.data = data; + f_exch.data2 = data2; + Kokkos::parallel_for(loop, f_exch); + exec_space().fence(); + + Kokkos::deep_copy(h_data, data); + Kokkos::deep_copy(h_data2, data2); + T val = h_data() + h_data2(); + + return val; +} + +template <class T> +T ExchLoopSerial(int loop) { + T* data = new T[1]; + T* data2 = new T[1]; + data[0] = 0; + data2[0] = 0; + for (int i = 0; i < loop; i++) { + T old = *data; + *data = (T)i; + *data2 += old; + } + + T val = *data2 + *data; + delete[] data; + delete[] data2; + return val; +} + +template <class T> +T LoopVariant(int loop, int test) { + switch (test) { + case 1: return AddLoop<T>(loop); + case 2: return CASLoop<T>(loop); + case 3: return ExchLoop<T>(loop); + } + return 0; +} + +template <class T> +T 
LoopVariantSerial(int loop, int test) { + switch (test) { + case 1: return AddLoopSerial<T>(loop); + case 2: return CASLoopSerial<T>(loop); + case 3: return ExchLoopSerial<T>(loop); + } + return 0; +} + +template <class T> +T LoopVariantNonAtomic(int loop, int test) { + switch (test) { + case 1: return AddLoopNonAtomic<T>(loop); + case 2: return CASLoopNonAtomic<T>(loop); + case 3: return ExchLoopNonAtomic<T>(loop); + } + return 0; +} + +template <class T> +void Loop(int loop, int test, const char* type_name) { + LoopVariant<T>(loop, test); + + Kokkos::Impl::Timer timer; + T res = LoopVariant<T>(loop, test); + double time = timer.seconds(); + + timer.reset(); + T resNonAtomic = LoopVariantNonAtomic<T>(loop, test); + double timeNonAtomic = timer.seconds(); + + timer.reset(); + T resSerial = LoopVariantSerial<T>(loop, test); + double timeSerial = timer.seconds(); + + time *= 1e6 / loop; + timeNonAtomic *= 1e6 / loop; + timeSerial *= 1e6 / loop; + // textcolor_standard(); + bool passed = true; + if (resSerial != res) passed = false; + // if(!passed) textcolor(RESET,BLACK,YELLOW); + printf( + "%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e " + "%7.4e Size of Type %i)", + type_name, test, passed ? 
"PASSED" : "FAILED", loop, 1.0 * resSerial, + 1.0 * res, 1.0 * resNonAtomic, timeSerial, time, timeNonAtomic, + (int)sizeof(T)); + // if(!passed) textcolor_standard(); + printf("\n"); +} + +template <class T> +void Test(int loop, int test, const char* type_name) { + if (test == -1) { + Loop<T>(loop, 1, type_name); + Loop<T>(loop, 2, type_name); + Loop<T>(loop, 3, type_name); + + } else + Loop<T>(loop, test, type_name); +} + +int main(int argc, char* argv[]) { + int type = -1; + int loop = 100000; + int test = -1; + + for (int i = 0; i < argc; i++) { + if ((strcmp(argv[i], "--test") == 0)) { + test = std::stoi(argv[++i]); + continue; + } + if ((strcmp(argv[i], "--type") == 0)) { + type = std::stoi(argv[++i]); + continue; + } + if ((strcmp(argv[i], "-l") == 0) || (strcmp(argv[i], "--loop") == 0)) { + loop = std::stoi(argv[++i]); + continue; + } + } + + Kokkos::initialize(argc, argv); + + printf("Using %s\n", Kokkos::atomic_query_version()); + bool all_tests = false; + if (type == -1) all_tests = true; + while (type < 100) { + if (type == 1) { + Test<int>(loop, test, "int "); + } + if (type == 2) { + Test<long int>(loop, test, "long int "); + } + if (type == 3) { + Test<long long int>(loop, test, "long long int "); + } + if (type == 4) { + Test<unsigned int>(loop, test, "unsigned int "); + } + if (type == 5) { + Test<unsigned long int>(loop, test, "unsigned long int "); + } + if (type == 6) { + Test<unsigned long long int>(loop, test, "unsigned long long int "); + } + if (type == 10) { + // Test<float>(loop,test,"float "); + } + if (type == 11) { + Test<double>(loop, test, "double "); + } + if (!all_tests) + type = 100; + else + type++; + } + + Kokkos::finalize(); +} diff --git a/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eec1c8eacc7779121fd54e23d1ad7e4efa80902c --- /dev/null +++ 
b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp @@ -0,0 +1,244 @@ +// export OMP_PROC_BIND=spread ; export OMP_PLACES=threads +// c++ -O2 -g -DNDEBUG -fopenmp +// ../core/perf_test/test_atomic_minmax_simple.cpp -I../core/src/ -I. -o +// test_atomic_minmax_simple.x containers/src/libkokkoscontainers.a +// core/src/libkokkoscore.a -ldl && OMP_NUM_THREADS=1 +// ./test_atomic_minmax_simple.x 10000000 + +#include <cstdio> +#include <cstdlib> + +#include <iostream> +#include <typeinfo> + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> + +using exec_space = Kokkos::DefaultExecutionSpace; + +template <typename T> +void test(const int length) { + Kokkos::Impl::Timer timer; + + using vector = Kokkos::View<T*, exec_space>; + + vector inp("input", length); + T max = std::numeric_limits<T>::max(); + T min = std::numeric_limits<T>::lowest(); + + // input is max values - all min atomics will replace + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_fetch_min(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% min replacements: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + } + std::cout << "Time for 100% min replacements: " << time << std::endl; + } + + // input is min values - all max atomics will replace + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + 
Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% max replacements: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + } + std::cout << "Time for 100% max replacements: " << time << std::endl; + } + + // input is max values - all max atomics will early exit + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { + T ref = max; + inner += (inp(i) != ref); + }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% max early exits: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + } + std::cout << "Time for 100% max early exits: " << time << std::endl; + } + + // input is min values - all min atomics will early exit + { + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_min_fetch(&(inp(i)), (T)i); + }); + Kokkos::fence(); + double time = timer.seconds(); + + int errors(0); + Kokkos::parallel_reduce( + length, + KOKKOS_LAMBDA(const int i, int& inner) { + T ref = min; + inner += (inp(i) != ref); + }, + errors); + Kokkos::fence(); + + if (errors) { + std::cerr << "Error in 100% min early exits: " << errors << std::endl; + std::cerr << "inp(0)=" << inp(0) << std::endl; + if (length > 9) std::cout << "inp(9)=" << inp(9) << std::endl; + } + std::cout << "Time for 100% min early exits: " << time << std::endl; + } + + // limit iterations for contentious test, takes ~50x longer for 
same length + auto con_length = length / 5; + // input is min values - some max atomics will replace + { + Kokkos::parallel_for( + 1, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); + Kokkos::fence(); + + T current(0); + timer.reset(); + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + inner = Kokkos::atomic_max_fetch(&(inp(0)), inner + 1); + if (i == con_length - 1) { + Kokkos::atomic_max_fetch(&(inp(0)), max); + inner = max; + } + }, + Kokkos::Max<T>(current)); + Kokkos::fence(); + double time = timer.seconds(); + + if (current < max) { + std::cerr << "Error in contentious max replacements: " << std::endl; + std::cerr << "final=" << current << " inp(0)=" << inp(0) << " max=" << max + << std::endl; + } + std::cout << "Time for contentious max " << con_length + << " replacements: " << time << std::endl; + } + + // input is max values - some min atomics will replace + { + Kokkos::parallel_for( + 1, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); + Kokkos::fence(); + + timer.reset(); + T current(100000000); + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + inner = Kokkos::atomic_min_fetch(&(inp(0)), inner - 1); + if (i == con_length - 1) { + Kokkos::atomic_min_fetch(&(inp(0)), min); + inner = min; + } + }, + Kokkos::Min<T>(current)); + Kokkos::fence(); + double time = timer.seconds(); + + if (current > min) { + std::cerr << "Error in contentious min replacements: " << std::endl; + std::cerr << "final=" << current << " inp(0)=" << inp(0) << " min=" << min + << std::endl; + } + std::cout << "Time for contentious min " << con_length + << " replacements: " << time << std::endl; + } +} + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + int length = 1000000; + if (argc == 2) { + length = std::stoi(argv[1]); + } + + if (length < 1) { + throw std::invalid_argument(""); + } + + std::cout << "================ int" << std::endl; + test<int>(length); + std::cout << "================ long" 
<< std::endl; + test<long>(length); + std::cout << "================ long long" << std::endl; + test<long long>(length); + + std::cout << "================ unsigned int" << std::endl; + test<unsigned int>(length); + std::cout << "================ unsigned long" << std::endl; + test<unsigned long>(length); + std::cout << "================ unsigned long long" << std::endl; + test<unsigned long long>(length); + + std::cout << "================ float" << std::endl; + test<float>(length); + std::cout << "================ double" << std::endl; + test<double>(length); + } + Kokkos::finalize(); + return 0; +} diff --git a/packages/kokkos/core/perf_test/test_mempool.cpp b/packages/kokkos/core/perf_test/test_mempool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9aab119774c49d99ec112c527c79364a9c02ddc6 --- /dev/null +++ b/packages/kokkos/core/perf_test/test_mempool.cpp @@ -0,0 +1,320 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> +#include <cstring> +#include <cstdlib> +#include <limits> + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> + +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space; + +using MemoryPool = Kokkos::MemoryPool<ExecSpace>; + +struct TestFunctor { + using ptrs_type = Kokkos::View<uintptr_t*, ExecSpace>; + + enum : unsigned { chunk = 32 }; + + MemoryPool pool; + ptrs_type ptrs; + unsigned chunk_span; + unsigned fill_stride; + unsigned range_iter; + unsigned repeat_inner; + + TestFunctor(size_t total_alloc_size, unsigned min_superblock_size, + unsigned number_alloc, unsigned arg_stride_alloc, + unsigned arg_chunk_span, unsigned arg_repeat) + : pool(), ptrs(), chunk_span(0), fill_stride(0), repeat_inner(0) { + MemorySpace m; + + const unsigned min_block_size = chunk; + const unsigned max_block_size = chunk * arg_chunk_span; + pool = MemoryPool(m, total_alloc_size, min_block_size, max_block_size, + min_superblock_size); + + ptrs = ptrs_type(Kokkos::view_alloc(m, "ptrs"), 
number_alloc); + fill_stride = arg_stride_alloc; + chunk_span = arg_chunk_span; + range_iter = fill_stride * number_alloc; + repeat_inner = arg_repeat; + } + + //---------------------------------------- + + using value_type = long; + + //---------------------------------------- + + struct TagFill {}; + + KOKKOS_INLINE_FUNCTION + void operator()(TagFill, int i, value_type& update) const noexcept { + if (0 == i % fill_stride) { + const int j = i / fill_stride; + + const unsigned size_alloc = chunk * (1 + (j % chunk_span)); + + ptrs(j) = (uintptr_t)pool.allocate(size_alloc); + + if (ptrs(j)) ++update; + } + } + + bool test_fill() { + using policy = Kokkos::RangePolicy<ExecSpace, TagFill>; + + long result = 0; + + Kokkos::parallel_reduce(policy(0, range_iter), *this, result); + + if (result == long(ptrs.extent(0))) return true; + pool.print_state(std::cerr); + return false; + } + + //---------------------------------------- + + struct TagDel {}; + + KOKKOS_INLINE_FUNCTION + void operator()(TagDel, int i) const noexcept { + if (0 == i % fill_stride) { + const int j = i / fill_stride; + + const unsigned size_alloc = chunk * (1 + (j % chunk_span)); + + pool.deallocate((void*)ptrs(j), size_alloc); + } + } + + void test_del() { + using policy = Kokkos::RangePolicy<ExecSpace, TagDel>; + + Kokkos::parallel_for(policy(0, range_iter), *this); + Kokkos::fence(); + } + + //---------------------------------------- + + struct TagAllocDealloc {}; + + KOKKOS_INLINE_FUNCTION + void operator()(TagAllocDealloc, int i, long& update) const noexcept { + if (0 == i % fill_stride) { + const int j = i / fill_stride; + + if (0 == j % 3) { + for (unsigned k = 0; k < repeat_inner; ++k) { + const unsigned size_alloc = chunk * (1 + (j % chunk_span)); + + pool.deallocate((void*)ptrs(j), size_alloc); + + ptrs(j) = (uintptr_t)pool.allocate(size_alloc); + + if (0 == ptrs(j)) update++; + } + } + } + } + + bool test_alloc_dealloc() { + using policy = Kokkos::RangePolicy<ExecSpace, TagAllocDealloc>; + + 
long error_count = 0; + + Kokkos::parallel_reduce(policy(0, range_iter), *this, error_count); + + return 0 == error_count; + } +}; + +int main(int argc, char* argv[]) { + static const char help_flag[] = "--help"; + static const char alloc_size_flag[] = "--alloc_size="; + static const char super_size_flag[] = "--super_size="; + static const char chunk_span_flag[] = "--chunk_span="; + static const char fill_stride_flag[] = "--fill_stride="; + static const char fill_level_flag[] = "--fill_level="; + static const char repeat_outer_flag[] = "--repeat_outer="; + static const char repeat_inner_flag[] = "--repeat_inner="; + + long total_alloc_size = 1000000; + int min_superblock_size = 10000; + int chunk_span = 5; + int fill_stride = 1; + int fill_level = 70; + int repeat_outer = 1; + int repeat_inner = 1; + + int ask_help = 0; + + for (int i = 1; i < argc; i++) { + const char* const a = argv[i]; + + if (!strncmp(a, help_flag, strlen(help_flag))) ask_help = 1; + + if (!strncmp(a, alloc_size_flag, strlen(alloc_size_flag))) + total_alloc_size = atol(a + strlen(alloc_size_flag)); + + if (!strncmp(a, super_size_flag, strlen(super_size_flag))) + min_superblock_size = std::stoi(a + strlen(super_size_flag)); + + if (!strncmp(a, fill_stride_flag, strlen(fill_stride_flag))) + fill_stride = std::stoi(a + strlen(fill_stride_flag)); + + if (!strncmp(a, fill_level_flag, strlen(fill_level_flag))) + fill_level = std::stoi(a + strlen(fill_level_flag)); + + if (!strncmp(a, chunk_span_flag, strlen(chunk_span_flag))) + chunk_span = std::stoi(a + strlen(chunk_span_flag)); + + if (!strncmp(a, repeat_outer_flag, strlen(repeat_outer_flag))) + repeat_outer = std::stoi(a + strlen(repeat_outer_flag)); + + if (!strncmp(a, repeat_inner_flag, strlen(repeat_inner_flag))) + repeat_inner = std::stoi(a + strlen(repeat_inner_flag)); + } + + int chunk_span_bytes = 0; + for (int i = 0; i < chunk_span; ++i) { + auto chunk_bytes = TestFunctor::chunk * (1 + i); + if (chunk_bytes < 64) chunk_bytes = 64; + auto 
block_bytes_lg2 = + Kokkos::Impl::integral_power_of_two_that_contains(chunk_bytes); + auto block_bytes = (1 << block_bytes_lg2); + chunk_span_bytes += block_bytes; + } + auto actual_superblock_bytes_lg2 = + Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size); + auto actual_superblock_bytes = (1 << actual_superblock_bytes_lg2); + auto superblock_mask = actual_superblock_bytes - 1; + auto nsuperblocks = + (total_alloc_size + superblock_mask) >> actual_superblock_bytes_lg2; + auto actual_total_bytes = nsuperblocks * actual_superblock_bytes; + auto bytes_wanted = (actual_total_bytes * fill_level) / 100; + auto chunk_spans = bytes_wanted / chunk_span_bytes; + auto number_alloc = int(chunk_spans * chunk_span); + + if (ask_help) { + std::cout << "command line options:" + << " " << help_flag << " " << alloc_size_flag << "##" + << " " << super_size_flag << "##" + << " " << fill_stride_flag << "##" + << " " << fill_level_flag << "##" + << " " << chunk_span_flag << "##" + << " " << repeat_outer_flag << "##" + << " " << repeat_inner_flag << "##" << std::endl; + return 0; + } + + Kokkos::initialize(argc, argv); + + double sum_fill_time = 0; + double sum_cycle_time = 0; + double sum_both_time = 0; + double min_fill_time = std::numeric_limits<double>::max(); + double min_cycle_time = std::numeric_limits<double>::max(); + double min_both_time = std::numeric_limits<double>::max(); + // one alloc in fill, alloc/dealloc pair in repeat_inner + for (int i = 0; i < repeat_outer; ++i) { + TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, + fill_stride, chunk_span, repeat_inner); + + Kokkos::Impl::Timer timer; + + if (!functor.test_fill()) { + Kokkos::abort("fill "); + } + + auto t0 = timer.seconds(); + + if (!functor.test_alloc_dealloc()) { + Kokkos::abort("alloc/dealloc "); + } + + auto t1 = timer.seconds(); + auto this_fill_time = t0; + auto this_cycle_time = t1 - t0; + auto this_both_time = t1; + sum_fill_time += this_fill_time; + 
sum_cycle_time += this_cycle_time; + sum_both_time += this_both_time; + min_fill_time = std::min(min_fill_time, this_fill_time); + min_cycle_time = std::min(min_cycle_time, this_cycle_time); + min_both_time = std::min(min_both_time, this_both_time); + } + + Kokkos::finalize(); + + printf( + "\"mempool: alloc super stride level span inner outer number\" %ld %d %d " + "%d %d %d %d %d\n", + total_alloc_size, min_superblock_size, fill_stride, fill_level, + chunk_span, repeat_inner, repeat_outer, number_alloc); + + auto avg_fill_time = sum_fill_time / repeat_outer; + auto avg_cycle_time = sum_cycle_time / repeat_outer; + auto avg_both_time = sum_both_time / repeat_outer; + + printf("\"mempool: fill time (min, avg)\" %.8f %.8f\n", min_fill_time, + avg_fill_time); + + printf("\"mempool: cycle time (min, avg)\" %.8f %.8f\n", min_cycle_time, + avg_cycle_time); + + printf("\"mempool: test time (min, avg)\" %.8f %.8f\n", min_both_time, + avg_both_time); + + printf("\"mempool: fill ops per second (max, avg)\" %g %g\n", + number_alloc / min_fill_time, number_alloc / avg_fill_time); + + printf("\"mempool: cycle ops per second (max, avg)\" %g %g\n", + (2 * number_alloc * repeat_inner) / min_cycle_time, + (2 * number_alloc * repeat_inner) / avg_cycle_time); +} diff --git a/packages/kokkos/core/perf_test/test_taskdag.cpp b/packages/kokkos/core/perf_test/test_taskdag.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2f936a955eca4a6d8a3c0eec928e01c5de66e51 --- /dev/null +++ b/packages/kokkos/core/perf_test/test_taskdag.cpp @@ -0,0 +1,252 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_TASKDAG) || \ + defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS) + +int main() { return 0; } + +#else + +#include <cstdio> +#include <cstring> +#include <cstdlib> +#include <limits> + +#include <impl/Kokkos_Timer.hpp> + +using ExecSpace = Kokkos::DefaultExecutionSpace; + +inline long eval_fib(long n) { + constexpr long mask = 0x03; + + long fib[4] = {0, 1, 0, 0}; + + for (long i = 2; i <= n; ++i) { + fib[i & mask] = fib[(i - 1) & mask] + fib[(i - 2) & mask]; + } + + return fib[n & mask]; +} + +inline long fib_alloc_count(long n) { + constexpr long mask = 0x03; + + long count[4] = {1, 1, 0, 0}; + + for (long i = 2; i <= n; ++i) { + count[i & mask] = 2 // this task plus the 'when_all' task + + count[(i - 1) & mask] + count[(i - 2) & mask]; + } + + return count[n & mask]; +} + +template <class Scheduler> +struct TestFib { + using MemorySpace = typename Scheduler::memory_space; + using MemberType = typename Scheduler::member_type; + using FutureType = Kokkos::BasicFuture<long, Scheduler>; + + using value_type = long; + + FutureType dep[2]; + const value_type n; + + KOKKOS_INLINE_FUNCTION + TestFib(const value_type arg_n) : dep{}, n(arg_n) {} + + KOKKOS_INLINE_FUNCTION + void operator()(MemberType& member, value_type& result) noexcept { + auto& sched = member.scheduler(); + if (n < 2) { + result = n; + } else if (!dep[0].is_null() && !dep[1].is_null()) { + result = dep[0].get() + dep[1].get(); + } else { + // Spawn new children and respawn myself to sum their results. + // Spawn lower value at higher priority as it has a shorter + // path to completion. 
+ + dep[1] = Kokkos::task_spawn( + Kokkos::TaskSingle(sched, Kokkos::TaskPriority::High), + TestFib(n - 2)); + + dep[0] = Kokkos::task_spawn(Kokkos::TaskSingle(sched), TestFib(n - 1)); + + auto fib_all = sched.when_all(dep, 2); + + if (!dep[0].is_null() && !dep[1].is_null() && !fib_all.is_null()) { + // High priority to retire this branch. + Kokkos::respawn(this, fib_all, Kokkos::TaskPriority::High); + } else { + Kokkos::abort("Failed nested task spawn (allocation)"); + } + } + } +}; + +int main(int argc, char* argv[]) { + static const char help[] = "--help"; + static const char alloc_size[] = "--alloc_size="; + static const char super_size[] = "--super_size="; + static const char repeat_outer[] = "--repeat_outer="; + static const char input_value[] = "--input="; + + long total_alloc_size = 1000000; + int min_superblock_size = 10000; + int test_repeat_outer = 1; + int fib_input = 4; + + int ask_help = 0; + + for (int i = 1; i < argc; i++) { + const char* const a = argv[i]; + + if (!strncmp(a, help, strlen(help))) ask_help = 1; + + if (!strncmp(a, alloc_size, strlen(alloc_size))) + total_alloc_size = atol(a + strlen(alloc_size)); + + if (!strncmp(a, super_size, strlen(super_size))) + min_superblock_size = std::stoi(a + strlen(super_size)); + + if (!strncmp(a, repeat_outer, strlen(repeat_outer))) + test_repeat_outer = std::stoi(a + strlen(repeat_outer)); + + if (!strncmp(a, input_value, strlen(input_value))) + fib_input = std::stoi(a + strlen(input_value)); + } + + const long fib_output = eval_fib(fib_input); + const long number_alloc = fib_alloc_count(fib_input); + + const unsigned min_block_size = 32; + const unsigned max_block_size = 128; + + long task_count_max = 0; + long task_count_accum = 0; + long test_result = 0; + + if (ask_help) { + std::cout << "command line options:" + << " " << help << " " << alloc_size << "##" + << " " << super_size << "##" + << " " << input_value << "##" + << " " << repeat_outer << "##" << std::endl; + return -1; + } + + using 
Scheduler = Kokkos::TaskSchedulerMultiple<ExecSpace>; + + using Functor = TestFib<Scheduler>; + + Kokkos::initialize(argc, argv); + + { + Scheduler sched(Functor::MemorySpace(), total_alloc_size, min_block_size, + max_block_size, min_superblock_size); + + Functor::FutureType f = + Kokkos::host_spawn(Kokkos::TaskSingle(sched), Functor(fib_input)); + + Kokkos::wait(sched); + + test_result = f.get(); + + // task_count_max = sched.allocated_task_count_max(); + // task_count_accum = sched.allocated_task_count_accum(); + + // if ( number_alloc != task_count_accum ) { + // std::cout << " number_alloc( " << number_alloc << " )" + // << " != task_count_accum( " << task_count_accum << " )" + // << std::endl ; + //} + + if (fib_output != test_result) { + std::cout << " answer( " << fib_output << " )" + << " != result( " << test_result << " )" << std::endl; + } + + if (fib_output != test_result) { // || number_alloc != task_count_accum ) { + printf(" TEST FAILED\n"); + return -1; + } + + double min_time = std::numeric_limits<double>::max(); + double time_sum = 0; + + for (int i = 0; i < test_repeat_outer; ++i) { + Kokkos::Impl::Timer timer; + + Functor::FutureType ftmp = + Kokkos::host_spawn(Kokkos::TaskSingle(sched), Functor(fib_input)); + + Kokkos::wait(sched); + auto this_time = timer.seconds(); + min_time = std::min(min_time, this_time); + time_sum += this_time; + } + + auto avg_time = time_sum / test_repeat_outer; + + printf( + "\"taskdag: alloc super repeat input output task-accum task-max\" %ld " + "%d %d %d %ld %ld %ld\n", + total_alloc_size, min_superblock_size, test_repeat_outer, fib_input, + fib_output, task_count_accum, task_count_max); + + printf("\"taskdag: time (min, avg)\" %g %g\n", min_time, avg_time); + printf("\"taskdag: tasks per second (max, avg)\" %g %g\n", + number_alloc / min_time, number_alloc / avg_time); + } // end scope to destroy scheduler prior to finalize + + Kokkos::finalize(); + + return 0; +} + +#endif diff --git 
a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ab0989805723ce32115d379dd39708b5edd8209 --- /dev/null +++ b/packages/kokkos/core/src/CMakeLists.txt @@ -0,0 +1,88 @@ +#I have to leave these here for tribits +KOKKOS_INCLUDE_DIRECTORIES( + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} + ${KOKKOS_TOP_BUILD_DIR} +) + +INSTALL (DIRECTORY + "${CMAKE_CURRENT_SOURCE_DIR}/" + DESTINATION ${KOKKOS_HEADER_DIR} + FILES_MATCHING + PATTERN "*.hpp" + PATTERN "*.h" +) + +SET(KOKKOS_CORE_SRCS) +APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +SET(KOKKOS_CORE_HEADERS) +APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) + +IF (KOKKOS_ENABLE_CUDA) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) +ENDIF() + +IF (KOKKOS_ENABLE_OPENMP) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) +ENDIF() + +IF (KOKKOS_ENABLE_OPENMPTARGET) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) +ENDIF() + +IF (KOKKOS_ENABLE_PTHREAD) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) +ENDIF() + +IF (KOKKOS_ENABLE_HIP) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) +ENDIF() + +IF (KOKKOS_ENABLE_HPX) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) +ENDIF() + +IF (NOT KOKKOS_ENABLE_MEMKIND) + 
LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_HBWSpace.cpp) +ENDIF() + +IF (KOKKOS_ENABLE_SERIAL) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) +ELSE() + LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial.cpp) + LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial_task.cpp) +ENDIF() + +IF (KOKKOS_ENABLE_SYCL) + APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) + APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) +ENDIF() + +KOKKOS_ADD_LIBRARY( + kokkoscore + SOURCES ${KOKKOS_CORE_SRCS} + HEADERS ${KOKKOS_CORE_HEADERS} + ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags +) + +KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore + ${KOKKOS_TOP_BUILD_DIR} + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} +) + +KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) +KOKKOS_LINK_TPL(kokkoscore PUBLIC MEMKIND) +KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) +KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) +KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) +KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) +KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD) +KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..916f109758de4ba3cf469659d7458ae77cf464da --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -0,0 +1,628 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA + +#include <Kokkos_Core.hpp> +#include <Kokkos_Cuda.hpp> +#include <Kokkos_CudaSpace.hpp> + +#include <cstdlib> +#include <iostream> +#include <sstream> +#include <stdexcept> +#include <algorithm> +#include <atomic> + +//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_MemorySpace.hpp> + +#include <impl/Kokkos_Tools.hpp> + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() { + static cudaStream_t s = nullptr; + if (s == nullptr) { + cudaStreamCreate(&s); + } + return s; +} + +const std::unique_ptr<Kokkos::Cuda> &Kokkos::Impl::cuda_get_deep_copy_space( + bool initialize) { + static std::unique_ptr<Cuda> space = nullptr; + if (!space && initialize) + space = std::make_unique<Cuda>(Kokkos::Impl::cuda_get_deep_copy_stream()); + return space; +} + +namespace Kokkos { +namespace Impl { + +namespace { + +static std::atomic<int> num_uvm_allocations(0); + +} // namespace + +DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src, + size_t n) { + CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); +} + +DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src, + size_t n) { + CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); +} + +DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(void *dst, const void *src, + size_t n) { + CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); +} + +DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst, + const void *src, size_t n) { + CUDA_SAFE_CALL( + cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream())); +} + +DeepCopy<HostSpace, 
CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst, + const void *src, size_t n) { + CUDA_SAFE_CALL( + cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream())); +} + +DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst, + const void *src, size_t n) { + CUDA_SAFE_CALL( + cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream())); +} + +void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { + cudaStream_t s = cuda_get_deep_copy_stream(); + CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s)); + cudaStreamSynchronize(s); +} + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +KOKKOS_DEPRECATED void CudaSpace::access_error() { + const std::string msg( + "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " + "non-Cuda space"); + Kokkos::Impl::throw_runtime_exception(msg); +} + +KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) { + const std::string msg( + "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " + "non-Cuda space"); + Kokkos::Impl::throw_runtime_exception(msg); +} + +/*--------------------------------------------------------------------------*/ + +bool CudaUVMSpace::available() { +#if defined(CUDA_VERSION) && !defined(__APPLE__) + enum : bool { UVM_available = true }; +#else + enum : bool { UVM_available = false }; +#endif + return UVM_available; +} + +/*--------------------------------------------------------------------------*/ + +int CudaUVMSpace::number_of_allocations() { + return Kokkos::Impl::num_uvm_allocations.load(); +} +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST +// The purpose of the following variable is to allow a state-based choice +// for pinning UVM allocations to the CPU. 
For now this is considered +// an experimental debugging capability - with the potential to work around +// some CUDA issues. +bool CudaUVMSpace::kokkos_impl_cuda_pin_uvm_to_host_v = false; + +bool CudaUVMSpace::cuda_pin_uvm_to_host() { + return CudaUVMSpace::kokkos_impl_cuda_pin_uvm_to_host_v; +} +void CudaUVMSpace::cuda_set_pin_uvm_to_host(bool val) { + CudaUVMSpace::kokkos_impl_cuda_pin_uvm_to_host_v = val; +} +#endif +} // namespace Kokkos + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST +bool kokkos_impl_cuda_pin_uvm_to_host() { + return Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host(); +} + +void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) { + Kokkos::CudaUVMSpace::cuda_set_pin_uvm_to_host(val); +} +#endif + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {} + +CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {} + +CudaHostPinnedSpace::CudaHostPinnedSpace() {} + +//============================================================================== +// <editor-fold desc="allocate()"> {{{1 + +void *CudaSpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} + +void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *CudaSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + void *ptr = nullptr; + + auto error_code = cudaMalloc(&ptr, arg_alloc_size); + if (error_code != cudaSuccess) { // TODO tag as unlikely branch + cudaGetLastError(); // This is the only way to clear the last error, which + // we should do here since we're turning it into an + // 
exception here + throw Experimental::CudaRawMemoryAllocationFailure( + arg_alloc_size, error_code, + Experimental::RawMemoryAllocationFailure::AllocationMechanism:: + CudaMalloc); + } + + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t reported_size = + (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); + } + return ptr; +} + +void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} +void *CudaUVMSpace::allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *CudaUVMSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + void *ptr = nullptr; + + Cuda::impl_static_fence(); + if (arg_alloc_size > 0) { + Kokkos::Impl::num_uvm_allocations++; + + auto error_code = + cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) + cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, + cudaCpuDeviceId); +#endif + + if (error_code != cudaSuccess) { // TODO tag as unlikely branch + cudaGetLastError(); // This is the only way to clear the last error, + // which we should do here since we're turning it + // into an exception here + throw Experimental::CudaRawMemoryAllocationFailure( + arg_alloc_size, error_code, + Experimental::RawMemoryAllocationFailure::AllocationMechanism:: + CudaMallocManaged); + } + } + Cuda::impl_static_fence(); + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t reported_size = + (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); + } + return ptr; +} +void *CudaHostPinnedSpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} +void *CudaHostPinnedSpace::allocate(const char *arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *CudaHostPinnedSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + void *ptr = nullptr; + + auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault); + if (error_code != cudaSuccess) { // TODO tag as unlikely branch + cudaGetLastError(); // This is the only way to clear the last error, which + // we should do here since we're turning it into an + // exception here + throw Experimental::CudaRawMemoryAllocationFailure( + arg_alloc_size, error_code, + Experimental::RawMemoryAllocationFailure::AllocationMechanism:: + CudaHostAlloc); + } + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t reported_size = + (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); + } + return ptr; +} + +// </editor-fold> end allocate() }}}1 +//============================================================================== +void CudaSpace::deallocate(void *const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); +} +void CudaSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void CudaSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t reported_size = + (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); + } + + try { + CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); + } catch (...) { + } +} +void CudaUVMSpace::deallocate(void *const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); +} + +void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size + + , + const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void CudaUVMSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size + + , + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + Cuda::impl_static_fence(); + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t reported_size = + (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); + } + try { + if (arg_alloc_ptr != nullptr) { + Kokkos::Impl::num_uvm_allocations--; + CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); + } + } catch (...) { + } + Cuda::impl_static_fence(); +} + +void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); +} +void CudaHostPinnedSpace::deallocate(const char *arg_label, + void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} + +void CudaHostPinnedSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t reported_size = + (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); + } + try { + CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); + } catch (...) 
{ + } +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +#ifdef KOKKOS_ENABLE_DEBUG +SharedAllocationRecord<void, void> + SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record; + +SharedAllocationRecord<void, void> + SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record; + +SharedAllocationRecord<void, void> + SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::s_root_record; +#endif + +::cudaTextureObject_t +SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object( + const unsigned sizeof_alias, void *const alloc_ptr, + size_t const alloc_size) { + enum { TEXTURE_BOUND_1D = 1u << 27 }; + + if ((alloc_ptr == nullptr) || + (sizeof_alias * TEXTURE_BOUND_1D <= alloc_size)) { + std::ostringstream msg; + msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to" + << " alloc_ptr(" << alloc_ptr << ")" + << " alloc_size(" << alloc_size << ")" + << " max_size(" << (sizeof_alias * TEXTURE_BOUND_1D) << ")"; + std::cerr << msg.str() << std::endl; + std::cerr.flush(); + Kokkos::Impl::throw_runtime_exception(msg.str()); + } + + ::cudaTextureObject_t tex_obj; + + struct cudaResourceDesc resDesc; + struct cudaTextureDesc texDesc; + + memset(&resDesc, 0, sizeof(resDesc)); + memset(&texDesc, 0, sizeof(texDesc)); + + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.desc = + (sizeof_alias == 4 + ? cudaCreateChannelDesc<int>() + : (sizeof_alias == 8 + ? 
cudaCreateChannelDesc< ::int2>() + : + /* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4>())); + resDesc.res.linear.sizeInBytes = alloc_size; + resDesc.res.linear.devPtr = alloc_ptr; + + CUDA_SAFE_CALL( + cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr)); + + return tex_obj; +} + +//============================================================================== +// <editor-fold desc="SharedAllocationRecord destructors"> {{{1 + +SharedAllocationRecord<Kokkos::CudaSpace, void>::~SharedAllocationRecord() { + const char *label = nullptr; + if (Kokkos::Profiling::profileLibraryLoaded()) { + SharedAllocationHeader header; + Kokkos::Impl::DeepCopy<Kokkos::CudaSpace, HostSpace>( + &header, RecordBase::m_alloc_ptr, sizeof(SharedAllocationHeader)); + label = header.label(); + } + auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; + m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr, + alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); +} + +SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::~SharedAllocationRecord() { + const char *label = nullptr; + if (Kokkos::Profiling::profileLibraryLoaded()) { + label = RecordBase::m_alloc_ptr->m_label; + } + m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr, + SharedAllocationRecord<void, void>::m_alloc_size, + (SharedAllocationRecord<void, void>::m_alloc_size - + sizeof(SharedAllocationHeader))); +} + +SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, + void>::~SharedAllocationRecord() { + m_space.deallocate(RecordBase::m_alloc_ptr->m_label, + SharedAllocationRecord<void, void>::m_alloc_ptr, + SharedAllocationRecord<void, void>::m_alloc_size, + (SharedAllocationRecord<void, void>::m_alloc_size - + sizeof(SharedAllocationHeader))); +} + +// </editor-fold> end SharedAllocationRecord destructors }}}1 +//============================================================================== + 
+//============================================================================== +// <editor-fold desc="SharedAllocationRecord constructors"> {{{1 + +SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord( + const Kokkos::CudaSpace &arg_space, const std::string &arg_label, + const size_t arg_alloc_size, + const SharedAllocationRecord<void, void>::function_type arg_dealloc) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : base_t( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record, +#endif + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), + m_tex_obj(0), + m_space(arg_space) { + + SharedAllocationHeader header; + + this->base_t::_fill_host_accessible_header_info(header, arg_label); + + // Copy to device memory + Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(RecordBase::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); +} + +SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord( + const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, + const size_t arg_alloc_size, + const SharedAllocationRecord<void, void>::function_type arg_dealloc) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : base_t( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record, +#endif + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), + m_tex_obj(0), + m_space(arg_space) { + this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, + arg_label); +} + +SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>:: + SharedAllocationRecord( + const Kokkos::CudaHostPinnedSpace &arg_space, + const std::string &arg_label, const size_t 
arg_alloc_size, + const SharedAllocationRecord<void, void>::function_type arg_dealloc) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : base_t( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, + void>::s_root_record, +#endif + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), + m_space(arg_space) { + this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, + arg_label); +} + +// </editor-fold> end SharedAllocationRecord constructors }}}1 +//============================================================================== + +void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, + bool to_device) { + if ((ptr == nullptr) || (bytes == 0)) return; + cudaPointerAttributes attr; + CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr)); + // I measured this and it turns out prefetching towards the host slows + // DualView syncs down. Probably because the latency is not too bad in the + // first place for the pull down. If we want to change that provde + // cudaCpuDeviceId as the device if to_device is false +#if CUDA_VERSION < 10000 + bool is_managed = attr.isManaged; +#else + bool is_managed = attr.type == cudaMemoryTypeManaged; +#endif + if (to_device && is_managed && + space.cuda_device_prop().concurrentManagedAccess) { + CUDA_SAFE_CALL(cudaMemPrefetchAsync(ptr, bytes, space.cuda_device(), + space.cuda_stream())); + } +} + +} // namespace Impl +} // namespace Kokkos + +//============================================================================== +// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1 + +#include <impl/Kokkos_SharedAlloc_timpl.hpp> + +namespace Kokkos { +namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) 
not +// performance sensitive, we explicity instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. +template class SharedAllocationRecordCommon<Kokkos::CudaSpace>; +template class HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>; +template class SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>; +template class SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>; + +} // end namespace Impl +} // end namespace Kokkos + +// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== + +#else +void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {} +#endif // KOKKOS_ENABLE_CUDA diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e76133fae8702484874cc6afe5b7aa92934cba02 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp @@ -0,0 +1,153 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP +#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA + +#include <impl/Kokkos_Traits.hpp> + +namespace Kokkos { +namespace Impl { + +template <class DestructFunctor> +SharedAllocationRecord* shared_allocation_record( + Kokkos::CudaSpace const& arg_space, void* const arg_alloc_ptr, + DestructFunctor const& arg_destruct) { + SharedAllocationRecord* const record = + SharedAllocationRecord::get_record(arg_alloc_ptr); + + // assert: record != 0 + + // assert: sizeof(DestructFunctor) <= record->m_destruct_size + + // assert: record->m_destruct_function == 0 + + DestructFunctor* const functor = reinterpret_cast<DestructFunctor*>( + reinterpret_cast<uintptr_t>(record) + sizeof(SharedAllocationRecord)); + + new (functor) DestructFunctor(arg_destruct); + + record->m_destruct_functor = 
&shared_allocation_destroy<DestructFunctor>; + + return record; +} + +/// class CudaUnmanagedAllocator +/// does nothing when deallocate(ptr,size) is called +struct CudaUnmanagedAllocator { + static const char* name() { return "Cuda Unmanaged Allocator"; } + + static void deallocate(void* /*ptr*/, size_t /*size*/) {} + + static bool support_texture_binding() { return true; } +}; + +/// class CudaUnmanagedAllocator +/// does nothing when deallocate(ptr,size) is called +struct CudaUnmanagedUVMAllocator { + static const char* name() { return "Cuda Unmanaged UVM Allocator"; } + + static void deallocate(void* /*ptr*/, size_t /*size*/) {} + + static bool support_texture_binding() { return true; } +}; + +/// class CudaUnmanagedHostAllocator +/// does nothing when deallocate(ptr,size) is called +class CudaUnmanagedHostAllocator { + public: + static const char* name() { return "Cuda Unmanaged Host Allocator"; } + // Unmanaged deallocate does nothing + static void deallocate(void* /*ptr*/, size_t /*size*/) {} +}; + +/// class CudaMallocAllocator +class CudaMallocAllocator { + public: + static const char* name() { return "Cuda Malloc Allocator"; } + + static void* allocate(size_t size); + + static void deallocate(void* ptr, size_t); + + static void* reallocate(void* old_ptr, size_t old_size, size_t new_size); + + static bool support_texture_binding() { return true; } +}; + +/// class CudaUVMAllocator +class CudaUVMAllocator { + public: + static const char* name() { return "Cuda UVM Allocator"; } + + static void* allocate(size_t size); + + static void deallocate(void* ptr, size_t); + + static void* reallocate(void* old_ptr, size_t old_size, size_t new_size); + + static bool support_texture_binding() { return true; } +}; + +/// class CudaHostAllocator +class CudaHostAllocator { + public: + static const char* name() { return "Cuda Host Allocator"; } + + static void* allocate(size_t size); + + static void deallocate(void* ptr, size_t); + + static void* reallocate(void* old_ptr, 
size_t old_size, size_t new_size); +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_ENABLE_CUDA + +#endif // #ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8a6c0433c8d848633457c98845b7758e63fae52d --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp @@ -0,0 +1,1012 @@ +/* +@HEADER +================================================================================ + +ORIGINAL LICENSE +---------------- + +Copyright (c) 2018, NVIDIA Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+ +================================================================================ + +LICENSE ASSOCIATED WITH SUBSEQUENT MODIFICATIONS +------------------------------------------------ + +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) + +#include <cassert> + +#ifndef _SIMT_DETAILS_CONFIG +#define _SIMT_DETAILS_CONFIG + +namespace Kokkos { +namespace Impl { + +#ifndef __simt_scope +// Modification: Kokkos GPU atomics should default to `gpu` scope +#define __simt_scope "gpu" +#endif + +#define __simt_fence_signal_() asm volatile("" ::: "memory") +#define __simt_fence_sc_() \ + asm volatile("fence.sc." __simt_scope ";" ::: "memory") +#define __simt_fence_() asm volatile("fence." __simt_scope ";" ::: "memory") + +#define __simt_load_acquire_8_as_32(ptr, ret) \ + asm volatile("ld.acquire." __simt_scope ".b8 %0, [%1];" \ + : "=r"(ret) \ + : "l"(ptr) \ + : "memory") +#define __simt_load_relaxed_8_as_32(ptr, ret) \ + asm volatile("ld.relaxed." __simt_scope ".b8 %0, [%1];" \ + : "=r"(ret) \ + : "l"(ptr) \ + : "memory") +#define __simt_store_release_8_as_32(ptr, desired) \ + asm volatile("st.release." __simt_scope ".b8 [%0], %1;" ::"l"(ptr), \ + "r"(desired) \ + : "memory") +#define __simt_store_relaxed_8_as_32(ptr, desired) \ + asm volatile("st.relaxed." __simt_scope ".b8 [%0], %1;" ::"l"(ptr), \ + "r"(desired) \ + : "memory") + +#define __simt_load_acquire_16(ptr, ret) \ + asm volatile("ld.acquire." __simt_scope ".b16 %0, [%1];" \ + : "=h"(ret) \ + : "l"(ptr) \ + : "memory") +#define __simt_load_relaxed_16(ptr, ret) \ + asm volatile("ld.relaxed." __simt_scope ".b16 %0, [%1];" \ + : "=h"(ret) \ + : "l"(ptr) \ + : "memory") +#define __simt_store_release_16(ptr, desired) \ + asm volatile("st.release." __simt_scope ".b16 [%0], %1;" ::"l"(ptr), \ + "h"(desired) \ + : "memory") +#define __simt_store_relaxed_16(ptr, desired) \ + asm volatile("st.relaxed." 
__simt_scope ".b16 [%0], %1;" ::"l"(ptr), \ + "h"(desired) \ + : "memory") + +#define __simt_load_acquire_32(ptr, ret) \ + asm volatile("ld.acquire." __simt_scope ".b32 %0, [%1];" \ + : "=r"(ret) \ + : "l"(ptr) \ + : "memory") +#define __simt_load_relaxed_32(ptr, ret) \ + asm volatile("ld.relaxed." __simt_scope ".b32 %0, [%1];" \ + : "=r"(ret) \ + : "l"(ptr) \ + : "memory") +#define __simt_store_release_32(ptr, desired) \ + asm volatile("st.release." __simt_scope ".b32 [%0], %1;" ::"l"(ptr), \ + "r"(desired) \ + : "memory") +#define __simt_store_relaxed_32(ptr, desired) \ + asm volatile("st.relaxed." __simt_scope ".b32 [%0], %1;" ::"l"(ptr), \ + "r"(desired) \ + : "memory") +#define __simt_exch_release_32(ptr, old, desired) \ + asm volatile("atom.exch.release." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(desired) \ + : "memory") +#define __simt_exch_acquire_32(ptr, old, desired) \ + asm volatile("atom.exch.acquire." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(desired) \ + : "memory") +#define __simt_exch_acq_rel_32(ptr, old, desired) \ + asm volatile("atom.exch.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(desired) \ + : "memory") +#define __simt_exch_relaxed_32(ptr, old, desired) \ + asm volatile("atom.exch.relaxed." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(desired) \ + : "memory") +#define __simt_cas_release_32(ptr, old, expected, desired) \ + asm volatile("atom.cas.release." __simt_scope ".b32 %0, [%1], %2, %3;" \ + : "=r"(old) \ + : "l"(ptr), "r"(expected), "r"(desired) \ + : "memory") +#define __simt_cas_acquire_32(ptr, old, expected, desired) \ + asm volatile("atom.cas.acquire." __simt_scope ".b32 %0, [%1], %2, %3;" \ + : "=r"(old) \ + : "l"(ptr), "r"(expected), "r"(desired) \ + : "memory") +#define __simt_cas_acq_rel_32(ptr, old, expected, desired) \ + asm volatile("atom.cas.acq_rel." 
__simt_scope ".b32 %0, [%1], %2, %3;" \ + : "=r"(old) \ + : "l"(ptr), "r"(expected), "r"(desired) \ + : "memory") +#define __simt_cas_relaxed_32(ptr, old, expected, desired) \ + asm volatile("atom.cas.relaxed." __simt_scope ".b32 %0, [%1], %2, %3;" \ + : "=r"(old) \ + : "l"(ptr), "r"(expected), "r"(desired) \ + : "memory") +#define __simt_add_release_32(ptr, old, addend) \ + asm volatile("atom.add.release." __simt_scope ".u32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(addend) \ + : "memory") +#define __simt_add_acquire_32(ptr, old, addend) \ + asm volatile("atom.add.acquire." __simt_scope ".u32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(addend) \ + : "memory") +#define __simt_add_acq_rel_32(ptr, old, addend) \ + asm volatile("atom.add.acq_rel." __simt_scope ".u32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(addend) \ + : "memory") +#define __simt_add_relaxed_32(ptr, old, addend) \ + asm volatile("atom.add.relaxed." __simt_scope ".u32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(addend) \ + : "memory") +#define __simt_and_release_32(ptr, old, andend) \ + asm volatile("atom.and.release." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(andend) \ + : "memory") +#define __simt_and_acquire_32(ptr, old, andend) \ + asm volatile("atom.and.acquire." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(andend) \ + : "memory") +#define __simt_and_acq_rel_32(ptr, old, andend) \ + asm volatile("atom.and.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(andend) \ + : "memory") +#define __simt_and_relaxed_32(ptr, old, andend) \ + asm volatile("atom.and.relaxed." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(andend) \ + : "memory") +#define __simt_or_release_32(ptr, old, orend) \ + asm volatile("atom.or.release." 
__simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(orend) \ + : "memory") +#define __simt_or_acquire_32(ptr, old, orend) \ + asm volatile("atom.or.acquire." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(orend) \ + : "memory") +#define __simt_or_acq_rel_32(ptr, old, orend) \ + asm volatile("atom.or.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(orend) \ + : "memory") +#define __simt_or_relaxed_32(ptr, old, orend) \ + asm volatile("atom.or.relaxed." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(orend) \ + : "memory") +#define __simt_xor_release_32(ptr, old, xorend) \ + asm volatile("atom.xor.release." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(xorend) \ + : "memory") +#define __simt_xor_acquire_32(ptr, old, xorend) \ + asm volatile("atom.xor.acquire." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(xorend) \ + : "memory") +#define __simt_xor_acq_rel_32(ptr, old, xorend) \ + asm volatile("atom.xor.acq_rel." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(xorend) \ + : "memory") +#define __simt_xor_relaxed_32(ptr, old, xorend) \ + asm volatile("atom.xor.relaxed." __simt_scope ".b32 %0, [%1], %2;" \ + : "=r"(old) \ + : "l"(ptr), "r"(xorend) \ + : "memory") + +#define __simt_load_acquire_64(ptr, ret) \ + asm volatile("ld.acquire." __simt_scope ".b64 %0, [%1];" \ + : "=l"(ret) \ + : "l"(ptr) \ + : "memory") +#define __simt_load_relaxed_64(ptr, ret) \ + asm volatile("ld.relaxed." __simt_scope ".b64 %0, [%1];" \ + : "=l"(ret) \ + : "l"(ptr) \ + : "memory") +#define __simt_store_release_64(ptr, desired) \ + asm volatile("st.release." __simt_scope ".b64 [%0], %1;" ::"l"(ptr), \ + "l"(desired) \ + : "memory") +#define __simt_store_relaxed_64(ptr, desired) \ + asm volatile("st.relaxed." 
__simt_scope ".b64 [%0], %1;" ::"l"(ptr), \ + "l"(desired) \ + : "memory") +#define __simt_exch_release_64(ptr, old, desired) \ + asm volatile("atom.exch.release." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(desired) \ + : "memory") +#define __simt_exch_acquire_64(ptr, old, desired) \ + asm volatile("atom.exch.acquire." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(desired) \ + : "memory") +#define __simt_exch_acq_rel_64(ptr, old, desired) \ + asm volatile("atom.exch.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(desired) \ + : "memory") +#define __simt_exch_relaxed_64(ptr, old, desired) \ + asm volatile("atom.exch.relaxed." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(desired) \ + : "memory") +#define __simt_cas_release_64(ptr, old, expected, desired) \ + asm volatile("atom.cas.release." __simt_scope ".b64 %0, [%1], %2, %3;" \ + : "=l"(old) \ + : "l"(ptr), "l"(expected), "l"(desired) \ + : "memory") +#define __simt_cas_acquire_64(ptr, old, expected, desired) \ + asm volatile("atom.cas.acquire." __simt_scope ".b64 %0, [%1], %2, %3;" \ + : "=l"(old) \ + : "l"(ptr), "l"(expected), "l"(desired) \ + : "memory") +#define __simt_cas_acq_rel_64(ptr, old, expected, desired) \ + asm volatile("atom.cas.acq_rel." __simt_scope ".b64 %0, [%1], %2, %3;" \ + : "=l"(old) \ + : "l"(ptr), "l"(expected), "l"(desired) \ + : "memory") +#define __simt_cas_relaxed_64(ptr, old, expected, desired) \ + asm volatile("atom.cas.relaxed." __simt_scope ".b64 %0, [%1], %2, %3;" \ + : "=l"(old) \ + : "l"(ptr), "l"(expected), "l"(desired) \ + : "memory") +#define __simt_add_release_64(ptr, old, addend) \ + asm volatile("atom.add.release." __simt_scope ".u64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(addend) \ + : "memory") +#define __simt_add_acquire_64(ptr, old, addend) \ + asm volatile("atom.add.acquire." 
__simt_scope ".u64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(addend) \ + : "memory") +#define __simt_add_acq_rel_64(ptr, old, addend) \ + asm volatile("atom.add.acq_rel." __simt_scope ".u64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(addend) \ + : "memory") +#define __simt_add_relaxed_64(ptr, old, addend) \ + asm volatile("atom.add.relaxed." __simt_scope ".u64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(addend) \ + : "memory") +#define __simt_and_release_64(ptr, old, andend) \ + asm volatile("atom.and.release." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(andend) \ + : "memory") +#define __simt_and_acquire_64(ptr, old, andend) \ + asm volatile("atom.and.acquire." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(andend) \ + : "memory") +#define __simt_and_acq_rel_64(ptr, old, andend) \ + asm volatile("atom.and.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(andend) \ + : "memory") +#define __simt_and_relaxed_64(ptr, old, andend) \ + asm volatile("atom.and.relaxed." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(andend) \ + : "memory") +#define __simt_or_release_64(ptr, old, orend) \ + asm volatile("atom.or.release." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(orend) \ + : "memory") +#define __simt_or_acquire_64(ptr, old, orend) \ + asm volatile("atom.or.acquire." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(orend) \ + : "memory") +#define __simt_or_acq_rel_64(ptr, old, orend) \ + asm volatile("atom.or.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(orend) \ + : "memory") +#define __simt_or_relaxed_64(ptr, old, orend) \ + asm volatile("atom.or.relaxed." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(orend) \ + : "memory") +#define __simt_xor_release_64(ptr, old, xorend) \ + asm volatile("atom.xor.release." 
__simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(xorend) \ + : "memory") +#define __simt_xor_acquire_64(ptr, old, xorend) \ + asm volatile("atom.xor.acquire." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(xorend) \ + : "memory") +#define __simt_xor_acq_rel_64(ptr, old, xorend) \ + asm volatile("atom.xor.acq_rel." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(xorend) \ + : "memory") +#define __simt_xor_relaxed_64(ptr, old, xorend) \ + asm volatile("atom.xor.relaxed." __simt_scope ".b64 %0, [%1], %2;" \ + : "=l"(old) \ + : "l"(ptr), "l"(xorend) \ + : "memory") + +#define __simt_nanosleep(timeout) \ + asm volatile("nanosleep.u32 %0;" ::"r"(unsigned(timeout)) :) + +/* + definitions +*/ + +#ifndef __GCC_ATOMIC_BOOL_LOCK_FREE +#define __GCC_ATOMIC_BOOL_LOCK_FREE 2 +#define __GCC_ATOMIC_CHAR_LOCK_FREE 2 +#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2 +#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2 +#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2 +#define __GCC_ATOMIC_SHORT_LOCK_FREE 2 +#define __GCC_ATOMIC_INT_LOCK_FREE 2 +#define __GCC_ATOMIC_LONG_LOCK_FREE 2 +#define __GCC_ATOMIC_LLONG_LOCK_FREE 2 +#define __GCC_ATOMIC_POINTER_LOCK_FREE 2 +#endif + +#ifndef __ATOMIC_RELAXED +#define __ATOMIC_RELAXED 0 +#define __ATOMIC_CONSUME 1 +#define __ATOMIC_ACQUIRE 2 +#define __ATOMIC_RELEASE 3 +#define __ATOMIC_ACQ_REL 4 +#define __ATOMIC_SEQ_CST 5 +#endif + +inline __device__ int __stronger_order_simt_(int a, int b) { + if (b == __ATOMIC_SEQ_CST) return __ATOMIC_SEQ_CST; + if (b == __ATOMIC_RELAXED) return a; + switch (a) { + case __ATOMIC_SEQ_CST: + case __ATOMIC_ACQ_REL: return a; + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: + if (b != __ATOMIC_ACQUIRE) + return __ATOMIC_ACQ_REL; + else + return __ATOMIC_ACQUIRE; + case __ATOMIC_RELEASE: + if (b != __ATOMIC_RELEASE) + return __ATOMIC_ACQ_REL; + else + return __ATOMIC_RELEASE; + case __ATOMIC_RELAXED: return b; + default: assert(0); + } + return __ATOMIC_SEQ_CST; +} + +/* 
+ base +*/ + +#define DO__atomic_load_simt_(bytes, bits) \ + template <class type, \ + typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \ + void __device__ __atomic_load_simt_(const type *ptr, type *ret, \ + int memorder) { \ + int##bits##_t tmp = 0; \ + switch (memorder) { \ + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ + case __ATOMIC_CONSUME: \ + case __ATOMIC_ACQUIRE: __simt_load_acquire_##bits(ptr, tmp); break; \ + case __ATOMIC_RELAXED: __simt_load_relaxed_##bits(ptr, tmp); break; \ + default: assert(0); \ + } \ + memcpy(ret, &tmp, bytes); \ + } +DO__atomic_load_simt_(1, 32) DO__atomic_load_simt_(2, 16) + DO__atomic_load_simt_(4, 32) DO__atomic_load_simt_(8, 64) + + template <class type> + type __device__ __atomic_load_n_simt_(const type *ptr, int memorder) { + type ret; + __atomic_load_simt_(ptr, &ret, memorder); + return ret; +} + +#define DO__atomic_store_simt_(bytes, bits) \ + template <class type, \ + typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \ + void __device__ __atomic_store_simt_(type *ptr, type *val, int memorder) { \ + int##bits##_t tmp = 0; \ + memcpy(&tmp, val, bytes); \ + switch (memorder) { \ + case __ATOMIC_RELEASE: __simt_store_release_##bits(ptr, tmp); break; \ + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ + case __ATOMIC_RELAXED: __simt_store_relaxed_##bits(ptr, tmp); break; \ + default: assert(0); \ + } \ + } +DO__atomic_store_simt_(1, 32) DO__atomic_store_simt_(2, 16) + DO__atomic_store_simt_(4, 32) DO__atomic_store_simt_(8, 64) + + template <class type> + void __device__ + __atomic_store_n_simt_(type *ptr, type val, int memorder) { + __atomic_store_simt_(ptr, &val, memorder); +} + +#define DO__atomic_compare_exchange_simt_(bytes, bits) \ + template <class type, \ + typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \ + bool __device__ __atomic_compare_exchange_simt_( \ + type *ptr, type *expected, const type *desired, bool, \ + int success_memorder, int failure_memorder) { \ + int##bits##_t 
tmp = 0, old = 0, old_tmp; \ + memcpy(&tmp, desired, bytes); \ + memcpy(&old, expected, bytes); \ + old_tmp = old; \ + switch (__stronger_order_simt_(success_memorder, failure_memorder)) { \ + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ + case __ATOMIC_CONSUME: \ + case __ATOMIC_ACQUIRE: \ + __simt_cas_acquire_##bits(ptr, old, old_tmp, tmp); \ + break; \ + case __ATOMIC_ACQ_REL: \ + __simt_cas_acq_rel_##bits(ptr, old, old_tmp, tmp); \ + break; \ + case __ATOMIC_RELEASE: \ + __simt_cas_release_##bits(ptr, old, old_tmp, tmp); \ + break; \ + case __ATOMIC_RELAXED: \ + __simt_cas_relaxed_##bits(ptr, old, old_tmp, tmp); \ + break; \ + default: assert(0); \ + } \ + bool const ret = old == old_tmp; \ + memcpy(expected, &old, bytes); \ + return ret; \ + } +DO__atomic_compare_exchange_simt_(4, 32) + DO__atomic_compare_exchange_simt_(8, 64) + + template <class type, + typename std::enable_if<sizeof(type) <= 2, int>::type = 0> + bool __device__ + __atomic_compare_exchange_simt_(type *ptr, type *expected, + const type *desired, bool, + int success_memorder, + int failure_memorder) { + using R = typename std::conditional<std::is_volatile<type>::value, + volatile uint32_t, uint32_t>::type; + auto const aligned = (R *)((intptr_t)ptr & ~(sizeof(uint32_t) - 1)); + auto const offset = uint32_t((intptr_t)ptr & (sizeof(uint32_t) - 1)) * 8; + auto const mask = ((1 << sizeof(type) * 8) - 1) << offset; + + uint32_t old = *expected << offset, old_value; + while (1) { + old_value = (old & mask) >> offset; + if (old_value != *expected) break; + uint32_t const attempt = (old & ~mask) | (*desired << offset); + if (__atomic_compare_exchange_simt_(aligned, &old, &attempt, true, + success_memorder, failure_memorder)) + return true; + } + *expected = old_value; + return false; +} + +template <class type> +bool __device__ __atomic_compare_exchange_n_simt_(type *ptr, type *expected, + type desired, bool weak, + int success_memorder, + int failure_memorder) { + return 
__atomic_compare_exchange_simt_(ptr, expected, &desired, weak, + success_memorder, failure_memorder); +} + +#define DO__atomic_exchange_simt_(bytes, bits) \ + template <class type, \ + typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \ + void __device__ __atomic_exchange_simt_(type *ptr, type *val, type *ret, \ + int memorder) { \ + int##bits##_t tmp = 0; \ + memcpy(&tmp, val, bytes); \ + switch (memorder) { \ + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ + case __ATOMIC_CONSUME: \ + case __ATOMIC_ACQUIRE: __simt_exch_acquire_##bits(ptr, tmp, tmp); break; \ + case __ATOMIC_ACQ_REL: __simt_exch_acq_rel_##bits(ptr, tmp, tmp); break; \ + case __ATOMIC_RELEASE: __simt_exch_release_##bits(ptr, tmp, tmp); break; \ + case __ATOMIC_RELAXED: __simt_exch_relaxed_##bits(ptr, tmp, tmp); break; \ + default: assert(0); \ + } \ + memcpy(ret, &tmp, bytes); \ + } +DO__atomic_exchange_simt_(4, 32) DO__atomic_exchange_simt_(8, 64) + + template <class type, + typename std::enable_if<sizeof(type) <= 2, int>::type = 0> + void __device__ + __atomic_exchange_simt_(type *ptr, type *val, type *ret, int memorder) { + type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); + while (!__atomic_compare_exchange_simt_(ptr, &expected, val, true, memorder, + memorder)) + ; + *ret = expected; +} + +template <class type> +type __device__ __atomic_exchange_n_simt_(type *ptr, type val, int memorder) { + type ret; + __atomic_exchange_simt_(ptr, &val, &ret, memorder); + return ret; +} + +#define DO__atomic_fetch_add_simt_(bytes, bits) \ + template <class type, class delta, \ + typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \ + type __device__ __atomic_fetch_add_simt_(type *ptr, delta val, \ + int memorder) { \ + type ret; \ + switch (memorder) { \ + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ + case __ATOMIC_CONSUME: \ + case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, val); break; \ + case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, val); break; 
\ + case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, val); break; \ + case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, val); break; \ + default: assert(0); \ + } \ + return ret; \ + } +DO__atomic_fetch_add_simt_(4, 32) DO__atomic_fetch_add_simt_(8, 64) + + template <class type, class delta, + typename std::enable_if<sizeof(type) <= 2, int>::type = 0> + type __device__ + __atomic_fetch_add_simt_(type *ptr, delta val, int memorder) { + type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); + type const desired = expected + val; + while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, + memorder, memorder)) + ; + return expected; +} + +#define DO__atomic_fetch_sub_simt_(bytes, bits) \ + template <class type, class delta, \ + typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \ + type __device__ __atomic_fetch_sub_simt_(type *ptr, delta val, \ + int memorder) { \ + type ret; \ + switch (memorder) { \ + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ + case __ATOMIC_CONSUME: \ + case __ATOMIC_ACQUIRE: __simt_add_acquire_##bits(ptr, ret, -val); break; \ + case __ATOMIC_ACQ_REL: __simt_add_acq_rel_##bits(ptr, ret, -val); break; \ + case __ATOMIC_RELEASE: __simt_add_release_##bits(ptr, ret, -val); break; \ + case __ATOMIC_RELAXED: __simt_add_relaxed_##bits(ptr, ret, -val); break; \ + default: assert(0); \ + } \ + return ret; \ + } +DO__atomic_fetch_sub_simt_(4, 32) DO__atomic_fetch_sub_simt_(8, 64) + + template <class type, class delta, + typename std::enable_if<sizeof(type) <= 2, int>::type = 0> + type __device__ + __atomic_fetch_sub_simt_(type *ptr, delta val, int memorder) { + type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); + type const desired = expected - val; + while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, + memorder, memorder)) + ; + return expected; +} + +#define DO__atomic_fetch_and_simt_(bytes, bits) \ + template <class type, \ + typename std::enable_if<sizeof(type) == 
bytes, int>::type = 0> \ + type __device__ __atomic_fetch_and_simt_(type *ptr, type val, \ + int memorder) { \ + type ret; \ + switch (memorder) { \ + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ + case __ATOMIC_CONSUME: \ + case __ATOMIC_ACQUIRE: __simt_and_acquire_##bits(ptr, ret, val); break; \ + case __ATOMIC_ACQ_REL: __simt_and_acq_rel_##bits(ptr, ret, val); break; \ + case __ATOMIC_RELEASE: __simt_and_release_##bits(ptr, ret, val); break; \ + case __ATOMIC_RELAXED: __simt_and_relaxed_##bits(ptr, ret, val); break; \ + default: assert(0); \ + } \ + return ret; \ + } +DO__atomic_fetch_and_simt_(4, 32) DO__atomic_fetch_and_simt_(8, 64) + + template <class type, class delta, + typename std::enable_if<sizeof(type) <= 2, int>::type = 0> + type __device__ + __atomic_fetch_and_simt_(type *ptr, delta val, int memorder) { + type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); + type const desired = expected & val; + while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, + memorder, memorder)) + ; + return expected; +} + +#define DO__atomic_fetch_xor_simt_(bytes, bits) \ + template <class type, \ + typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \ + type __device__ __atomic_fetch_xor_simt_(type *ptr, type val, \ + int memorder) { \ + type ret; \ + switch (memorder) { \ + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ + case __ATOMIC_CONSUME: \ + case __ATOMIC_ACQUIRE: __simt_xor_acquire_##bits(ptr, ret, val); break; \ + case __ATOMIC_ACQ_REL: __simt_xor_acq_rel_##bits(ptr, ret, val); break; \ + case __ATOMIC_RELEASE: __simt_xor_release_##bits(ptr, ret, val); break; \ + case __ATOMIC_RELAXED: __simt_xor_relaxed_##bits(ptr, ret, val); break; \ + default: assert(0); \ + } \ + return ret; \ + } +DO__atomic_fetch_xor_simt_(4, 32) DO__atomic_fetch_xor_simt_(8, 64) + + template <class type, class delta, + typename std::enable_if<sizeof(type) <= 2, int>::type = 0> + type __device__ + __atomic_fetch_xor_simt_(type *ptr, delta val, int 
memorder) { + type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); + type const desired = expected ^ val; + while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, + memorder, memorder)) + ; + return expected; +} + +#define DO__atomic_fetch_or_simt_(bytes, bits) \ + template <class type, \ + typename std::enable_if<sizeof(type) == bytes, int>::type = 0> \ + type __device__ __atomic_fetch_or_simt_(type *ptr, type val, int memorder) { \ + type ret; \ + switch (memorder) { \ + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); \ + case __ATOMIC_CONSUME: \ + case __ATOMIC_ACQUIRE: __simt_or_acquire_##bits(ptr, ret, val); break; \ + case __ATOMIC_ACQ_REL: __simt_or_acq_rel_##bits(ptr, ret, val); break; \ + case __ATOMIC_RELEASE: __simt_or_release_##bits(ptr, ret, val); break; \ + case __ATOMIC_RELAXED: __simt_or_relaxed_##bits(ptr, ret, val); break; \ + default: assert(0); \ + } \ + return ret; \ + } +DO__atomic_fetch_or_simt_(4, 32) DO__atomic_fetch_or_simt_(8, 64) + + template <class type, class delta, + typename std::enable_if<sizeof(type) <= 2, int>::type = 0> + type __device__ + __atomic_fetch_or_simt_(type *ptr, delta val, int memorder) { + type expected = __atomic_load_n_simt_(ptr, __ATOMIC_RELAXED); + type const desired = expected | val; + while (!__atomic_compare_exchange_simt_(ptr, &expected, &desired, true, + memorder, memorder)) + ; + return expected; +} + +template <class type> +inline bool __device__ __atomic_test_and_set_simt_(type *ptr, int memorder) { + return __atomic_exchange_n_simt_((char *)ptr, (char)1, memorder) == 1; +} +template <class type> +inline void __device__ __atomic_clear_simt_(type *ptr, int memorder) { + return __atomic_store_n_simt_((char *)ptr, (char)0, memorder); +} + +inline constexpr __device__ bool __atomic_always_lock_free_simt_(size_t size, + void *) { + return size <= 8; +} +inline __device__ bool __atomic_is_lock_free_simt_(size_t size, void *ptr) { + return __atomic_always_lock_free_simt_(size, ptr); +} + +/* 
+ fences +*/ + +inline void __device__ __atomic_thread_fence_simt(int memorder) { + switch (memorder) { + case __ATOMIC_SEQ_CST: __simt_fence_sc_(); break; + case __ATOMIC_CONSUME: + case __ATOMIC_ACQUIRE: + case __ATOMIC_ACQ_REL: + case __ATOMIC_RELEASE: __simt_fence_(); break; + case __ATOMIC_RELAXED: break; + default: assert(0); + } +} +inline void __device__ __atomic_signal_fence_simt(int memorder) { + __atomic_thread_fence_simt(memorder); +} + +/* + non-volatile +*/ + +template <class type> +type __device__ __atomic_load_n_simt(const type *ptr, int memorder) { + return __atomic_load_n_simt_(const_cast<const type *>(ptr), memorder); +} +template <class type> +void __device__ __atomic_load_simt(const type *ptr, type *ret, int memorder) { + __atomic_load_simt_(const_cast<const type *>(ptr), ret, memorder); +} +template <class type> +void __device__ __atomic_store_n_simt(type *ptr, type val, int memorder) { + __atomic_store_n_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +void __device__ __atomic_store_simt(type *ptr, type *val, int memorder) { + __atomic_store_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +type __device__ __atomic_exchange_n_simt(type *ptr, type val, int memorder) { + return __atomic_exchange_n_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +void __device__ __atomic_exchange_simt(type *ptr, type *val, type *ret, + int memorder) { + __atomic_exchange_simt_(const_cast<type *>(ptr), val, ret, memorder); +} +template <class type> +bool __device__ __atomic_compare_exchange_n_simt(type *ptr, type *expected, + type desired, bool weak, + int success_memorder, + int failure_memorder) { + return __atomic_compare_exchange_n_simt_(const_cast<type *>(ptr), expected, + desired, weak, success_memorder, + failure_memorder); +} +template <class type> +bool __device__ __atomic_compare_exchange_simt(type *ptr, type *expected, + type *desired, bool weak, + int success_memorder, + int 
failure_memorder) { + return __atomic_compare_exchange_simt_(const_cast<type *>(ptr), expected, + desired, weak, success_memorder, + failure_memorder); +} +template <class type, class delta> +type __device__ __atomic_fetch_add_simt(type *ptr, delta val, int memorder) { + return __atomic_fetch_add_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type, class delta> +type __device__ __atomic_fetch_sub_simt(type *ptr, delta val, int memorder) { + return __atomic_fetch_sub_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +type __device__ __atomic_fetch_and_simt(type *ptr, type val, int memorder) { + return __atomic_fetch_and_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +type __device__ __atomic_fetch_xor_simt(type *ptr, type val, int memorder) { + return __atomic_fetch_xor_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +type __device__ __atomic_fetch_or_simt(type *ptr, type val, int memorder) { + return __atomic_fetch_or_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +bool __device__ __atomic_test_and_set_simt(void *ptr, int memorder) { + return __atomic_test_and_set_simt_(const_cast<void *>(ptr), memorder); +} +template <class type> +void __device__ __atomic_clear_simt(void *ptr, int memorder) { + return __atomic_clear_simt_(const_cast<void *>(ptr), memorder); +} +inline bool __device__ __atomic_always_lock_free_simt(size_t size, void *ptr) { + return __atomic_always_lock_free_simt_(size, const_cast<void *>(ptr)); +} +inline bool __device__ __atomic_is_lock_free_simt(size_t size, void *ptr) { + return __atomic_is_lock_free_simt_(size, const_cast<void *>(ptr)); +} + +/* + volatile +*/ + +template <class type> +type __device__ __atomic_load_n_simt(const volatile type *ptr, int memorder) { + return __atomic_load_n_simt_(const_cast<const type *>(ptr), memorder); +} +template <class type> +void __device__ __atomic_load_simt(const volatile type *ptr, type 
*ret, + int memorder) { + __atomic_load_simt_(const_cast<const type *>(ptr), ret, memorder); +} +template <class type> +void __device__ __atomic_store_n_simt(volatile type *ptr, type val, + int memorder) { + __atomic_store_n_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +void __device__ __atomic_store_simt(volatile type *ptr, type *val, + int memorder) { + __atomic_store_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +type __device__ __atomic_exchange_n_simt(volatile type *ptr, type val, + int memorder) { + return __atomic_exchange_n_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +void __device__ __atomic_exchange_simt(volatile type *ptr, type *val, type *ret, + int memorder) { + __atomic_exchange_simt_(const_cast<type *>(ptr), val, ret, memorder); +} +template <class type> +bool __device__ __atomic_compare_exchange_n_simt(volatile type *ptr, + type *expected, type desired, + bool weak, + int success_memorder, + int failure_memorder) { + return __atomic_compare_exchange_n_simt_(const_cast<type *>(ptr), expected, + desired, weak, success_memorder, + failure_memorder); +} +template <class type> +bool __device__ __atomic_compare_exchange_simt(volatile type *ptr, + type *expected, type *desired, + bool weak, int success_memorder, + int failure_memorder) { + return __atomic_compare_exchange_simt_(const_cast<type *>(ptr), expected, + desired, weak, success_memorder, + failure_memorder); +} +template <class type, class delta> +type __device__ __atomic_fetch_add_simt(volatile type *ptr, delta val, + int memorder) { + return __atomic_fetch_add_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type, class delta> +type __device__ __atomic_fetch_sub_simt(volatile type *ptr, delta val, + int memorder) { + return __atomic_fetch_sub_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +type __device__ __atomic_fetch_and_simt(volatile type *ptr, type val, + int 
memorder) { + return __atomic_fetch_and_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +type __device__ __atomic_fetch_xor_simt(volatile type *ptr, type val, + int memorder) { + return __atomic_fetch_xor_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +type __device__ __atomic_fetch_or_simt(volatile type *ptr, type val, + int memorder) { + return __atomic_fetch_or_simt_(const_cast<type *>(ptr), val, memorder); +} +template <class type> +bool __device__ __atomic_test_and_set_simt(volatile void *ptr, int memorder) { + return __atomic_test_and_set_simt_(const_cast<void *>(ptr), memorder); +} +template <class type> +void __device__ __atomic_clear_simt(volatile void *ptr, int memorder) { + return __atomic_clear_simt_(const_cast<void *>(ptr), memorder); +} + +} // end namespace Impl +} // end namespace Kokkos + +#endif //_SIMT_DETAILS_CONFIG + +#ifndef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED +/* + builtins +*/ + +#define __atomic_load_n __atomic_load_n_simt +#define __atomic_load __atomic_load_simt +#define __atomic_store_n __atomic_store_n_simt +#define __atomic_store __atomic_store_simt +#define __atomic_exchange_n __atomic_exchange_n_simt +#define __atomic_exchange __atomic_exchange_simt +#define __atomic_compare_exchange_n __atomic_compare_exchange_n_simt +#define __atomic_compare_exchange __atomic_compare_exchange_simt +#define __atomic_fetch_add __atomic_fetch_add_simt +#define __atomic_fetch_sub __atomic_fetch_sub_simt +#define __atomic_fetch_and __atomic_fetch_and_simt +#define __atomic_fetch_xor __atomic_fetch_xor_simt +#define __atomic_fetch_or __atomic_fetch_or_simt +#define __atomic_test_and_set __atomic_test_and_set_simt +#define __atomic_clear __atomic_clear_simt +#define __atomic_always_lock_free __atomic_always_lock_free_simt +#define __atomic_is_lock_free __atomic_is_lock_free_simt +#define __atomic_thread_fence __atomic_thread_fence_simt +#define __atomic_signal_fence __atomic_signal_fence_simt 
+ +#define KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED + +#endif //__CUDA_ARCH__ && KOKKOS_ENABLE_CUDA_ASM_ATOMICS +#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d7cd1bab1303a76e08723c07d6ff32cb613c0245 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp @@ -0,0 +1,68 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifdef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED + +#undef __atomic_load_n +#undef __atomic_load +#undef __atomic_store_n +#undef __atomic_store +#undef __atomic_exchange_n +#undef __atomic_exchange +#undef __atomic_compare_exchange_n +#undef __atomic_compare_exchange +#undef __atomic_fetch_add +#undef __atomic_fetch_sub +#undef __atomic_fetch_and +#undef __atomic_fetch_xor +#undef __atomic_fetch_or +#undef __atomic_test_and_set +#undef __atomic_clear +#undef __atomic_always_lock_free +#undef __atomic_is_lock_free +#undef __atomic_thread_fence +#undef __atomic_signal_fence + +#undef KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED + +#endif // KOKKOS_SIMT_ATOMIC_BUILTIN_REPLACEMENTS_DEFINED diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0f4259072d97f26c0032e674bdf60b9031fcee11 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -0,0 +1,239 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_INTERNAL_HPP +#define KOKKOS_CUDA_INTERNAL_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA + +#include <Cuda/Kokkos_Cuda_Error.hpp> + +namespace Kokkos { +namespace Impl { + +inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties, + cudaFuncAttributes const& attributes, + int block_size, size_t dynamic_shmem) { + // Limits due to registers/SM + int const regs_per_sm = properties.regsPerMultiprocessor; + int const regs_per_thread = attributes.numRegs; + int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size); + + // Limits due to shared memory/SM + size_t const shmem_per_sm = properties.sharedMemPerMultiprocessor; + size_t const shmem_per_block = properties.sharedMemPerBlock; + size_t const static_shmem = attributes.sharedSizeBytes; + size_t const dynamic_shmem_per_block = attributes.maxDynamicSharedSizeBytes; + size_t const total_shmem = static_shmem + dynamic_shmem; + + int const max_blocks_shmem = + total_shmem > shmem_per_block || dynamic_shmem > dynamic_shmem_per_block + ? 0 + : (total_shmem > 0 ? (int)shmem_per_sm / total_shmem + : max_blocks_regs); + + // Limits due to blocks/SM +#if CUDA_VERSION >= 11000 + int const max_blocks_per_sm = properties.maxBlocksPerMultiProcessor; +#else + int const max_blocks_per_sm = [&properties]() { + switch (properties.major) { + case 3: return 16; + case 5: + case 6: return 32; + case 7: { + int isTuring = properties.minor == 5; + return (isTuring) ? 
16 : 32; + } + default: + throw_runtime_exception("Unknown device in cuda block size deduction"); + return 0; + } + }(); +#endif + + // Overall occupancy in blocks + return std::min({max_blocks_regs, max_blocks_shmem, max_blocks_per_sm}); +} + +template <typename UnaryFunction, typename LaunchBounds> +inline int cuda_deduce_block_size(bool early_termination, + cudaDeviceProp const& properties, + cudaFuncAttributes const& attributes, + UnaryFunction block_size_to_dynamic_shmem, + LaunchBounds) { + // Limits + int const max_threads_per_sm = properties.maxThreadsPerMultiProcessor; + // unsure if I need to do that or if this is already accounted for in the + // functor attributes + int const max_threads_per_block = + std::min(LaunchBounds::maxTperB == 0 ? (int)properties.maxThreadsPerBlock + : (int)LaunchBounds::maxTperB, + attributes.maxThreadsPerBlock); + int const min_blocks_per_sm = + LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM; + + // Recorded maximum + int opt_block_size = 0; + int opt_threads_per_sm = 0; + + for (int block_size = max_threads_per_block; block_size > 0; + block_size -= 32) { + size_t const dynamic_shmem = block_size_to_dynamic_shmem(block_size); + + int blocks_per_sm = cuda_max_active_blocks_per_sm( + properties, attributes, block_size, dynamic_shmem); + + int threads_per_sm = blocks_per_sm * block_size; + + if (threads_per_sm > max_threads_per_sm) { + blocks_per_sm = max_threads_per_sm / block_size; + threads_per_sm = blocks_per_sm * block_size; + } + + if (blocks_per_sm >= min_blocks_per_sm) { + if (threads_per_sm >= opt_threads_per_sm) { + opt_block_size = block_size; + opt_threads_per_sm = threads_per_sm; + } + } + + if (early_termination && opt_block_size != 0) break; + } + + return opt_block_size; +} + +template <class FunctorType, class LaunchBounds> +int cuda_get_max_block_size(const CudaInternal* cuda_instance, + const cudaFuncAttributes& attr, + const FunctorType& f, const size_t vector_length, + const size_t shmem_block, 
+ const size_t shmem_thread) { + (void)cuda_instance; + + auto const& prop = Kokkos::Cuda().cuda_device_prop(); + + auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block, + shmem_thread](int block_size) { + size_t const functor_shmem = + Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value( + f, block_size / vector_length); + + size_t const dynamic_shmem = shmem_block + + shmem_thread * (block_size / vector_length) + + functor_shmem; + return dynamic_shmem; + }; + + return cuda_deduce_block_size(true, prop, attr, block_size_to_dynamic_shmem, + LaunchBounds{}); +} + +template <class FunctorType, class LaunchBounds> +int cuda_get_opt_block_size(const CudaInternal* cuda_instance, + const cudaFuncAttributes& attr, + const FunctorType& f, const size_t vector_length, + const size_t shmem_block, + const size_t shmem_thread) { + (void)cuda_instance; + + auto const& prop = Kokkos::Cuda().cuda_device_prop(); + + auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block, + shmem_thread](int block_size) { + size_t const functor_shmem = + Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value( + f, block_size / vector_length); + + size_t const dynamic_shmem = shmem_block + + shmem_thread * (block_size / vector_length) + + functor_shmem; + return dynamic_shmem; + }; + + return cuda_deduce_block_size(false, prop, attr, block_size_to_dynamic_shmem, + LaunchBounds{}); +} + +// Assuming cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1) +// NOTE these number can be obtained several ways: +// * One option is to download the CUDA Occupancy Calculator spreadsheet, select +// "Compute Capability" first and check what is the smallest "Shared Memory +// Size Config" that is available. The "Shared Memory Per Multiprocessor" in +// bytes is then to be found below in the summary. 
+// * Another option would be to look for the information in the "Tuning +// Guide(s)" of the CUDA Toolkit Documentation for each GPU architecture, in +// the "Shared Memory" section (more tedious) +inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { + int const compute_capability = properties.major * 10 + properties.minor; + return [compute_capability]() { + switch (compute_capability) { + case 30: + case 32: + case 35: return 16; + case 37: return 80; + case 50: + case 53: + case 60: + case 62: return 64; + case 52: + case 61: return 96; + case 70: + case 80: + case 86: return 8; + case 75: return 32; + default: + Kokkos::Impl::throw_runtime_exception( + "Unknown device in cuda block size deduction"); + } + return 0; + }() * 1024; +} +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_ENABLE_CUDA +#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */ diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4759001d81f99afc0a1e2aa6cf64462d9e7fcdc9 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -0,0 +1,124 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_ERROR_HPP +#define KOKKOS_CUDA_ERROR_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA + +#include <impl/Kokkos_Error.hpp> + +#include <iosfwd> + +namespace Kokkos { +namespace Impl { + +void cuda_device_synchronize(); + +void cuda_internal_error_throw(cudaError e, const char* name, + const char* file = nullptr, const int line = 0); + +inline void cuda_internal_safe_call(cudaError e, const char* name, + const char* file = nullptr, + const int line = 0) { + if (cudaSuccess != e) { + cuda_internal_error_throw(e, name, file, line); + } +} + +#define CUDA_SAFE_CALL(call) \ + Kokkos::Impl::cuda_internal_safe_call(call, #call, __FILE__, __LINE__) + +} // namespace Impl + +namespace Experimental { + +class CudaRawMemoryAllocationFailure : public RawMemoryAllocationFailure { + private: + using base_t = RawMemoryAllocationFailure; + + cudaError_t m_error_code = cudaSuccess; + + static FailureMode get_failure_mode(cudaError_t error_code) { + switch (error_code) { + case cudaErrorMemoryAllocation: return FailureMode::OutOfMemoryError; + case cudaErrorInvalidValue: return FailureMode::InvalidAllocationSize; + // TODO handle cudaErrorNotSupported for cudaMallocManaged + default: return FailureMode::Unknown; + } + } + + public: + // using base_t::base_t; + // would trigger + // + // error: cannot determine the exception specification of the default + // constructor due to a circular dependency + // + // using NVCC 9.1 and gcc 7.4 + CudaRawMemoryAllocationFailure( + size_t arg_attempted_size, size_t arg_attempted_alignment, + FailureMode arg_failure_mode = FailureMode::OutOfMemoryError, + AllocationMechanism arg_mechanism = + AllocationMechanism::StdMalloc) noexcept + : base_t(arg_attempted_size, arg_attempted_alignment, arg_failure_mode, + arg_mechanism) {} + + CudaRawMemoryAllocationFailure(size_t arg_attempted_size, + 
cudaError_t arg_error_code, + AllocationMechanism arg_mechanism) noexcept + : base_t(arg_attempted_size, /* CudaSpace doesn't handle alignment? */ 1, + get_failure_mode(arg_error_code), arg_mechanism), + m_error_code(arg_error_code) {} + + void append_additional_error_information(std::ostream& o) const override; +}; + +} // end namespace Experimental + +} // namespace Kokkos + +#endif // KOKKOS_ENABLE_CUDA +#endif // KOKKOS_CUDA_ERROR_HPP diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d6fadd82c0fbbb2d3927afe2b6b5b4566710dc1a --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -0,0 +1,210 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_CUDA_GRAPHNODEKERNEL_IMPL_HPP +#define KOKKOS_KOKKOS_CUDA_GRAPHNODEKERNEL_IMPL_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_CUDA_ENABLE_GRAPHS) + +#include <Kokkos_Graph_fwd.hpp> + +#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess needs to be complete +#include <impl/Kokkos_SharedAlloc.hpp> // SharedAllocationRecord + +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Parallel_Reduce.hpp> +#include <Kokkos_PointerOwnership.hpp> + +#include <Kokkos_Cuda.hpp> +#include <cuda_runtime_api.h> + +namespace Kokkos { +namespace Impl { + +template <class PolicyType, class Functor, class PatternTag, class... 
Args> +class GraphNodeKernelImpl<Kokkos::Cuda, PolicyType, Functor, PatternTag, + Args...> + : public PatternImplSpecializationFromTag<PatternTag, Functor, PolicyType, + Args..., Kokkos::Cuda>::type { + private: + using base_t = + typename PatternImplSpecializationFromTag<PatternTag, Functor, PolicyType, + Args..., Kokkos::Cuda>::type; + using size_type = Kokkos::Cuda::size_type; + // These are really functioning as optional references, though I'm not sure + // that the cudaGraph_t one needs to be since it's a pointer under the + // covers and we're not modifying it + Kokkos::ObservingRawPtr<const cudaGraph_t> m_graph_ptr = nullptr; + Kokkos::ObservingRawPtr<cudaGraphNode_t> m_graph_node_ptr = nullptr; + // Note: owned pointer to CudaSpace memory (used for global memory launches), + // which we're responsible for deallocating, but not responsible for calling + // its destructor. + using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; + // Basically, we have to make this mutable for the same reasons that the + // global kernel buffers in the Cuda instance are mutable... + mutable Kokkos::OwningRawPtr<base_t> m_driver_storage = nullptr; + + public: + using Policy = PolicyType; + using graph_kernel = GraphNodeKernelImpl; + + // TODO Ensure the execution space of the graph is the same as the one + // attached to the policy? + // TODO @graph kernel name info propagation + template <class PolicyDeduced, class... ArgsDeduced> + GraphNodeKernelImpl(std::string, Kokkos::Cuda const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + // This is super ugly, but it works for now and is the most minimal change + // to the codebase for now... + : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, + (ArgsDeduced &&) args...) 
{} + + // FIXME @graph Forward through the instance once that works in the backends + template <class PolicyDeduced> + GraphNodeKernelImpl(Kokkos::Cuda const& ex, Functor arg_functor, + PolicyDeduced&& arg_policy) + : GraphNodeKernelImpl("", ex, std::move(arg_functor), + (PolicyDeduced &&) arg_policy) {} + + ~GraphNodeKernelImpl() { + if (m_driver_storage) { + // We should be the only owner, but this is still the easiest way to + // allocate and deallocate aligned memory for these sorts of things + Record::decrement(Record::get_record(m_driver_storage)); + } + } + + void set_cuda_graph_ptr(cudaGraph_t* arg_graph_ptr) { + m_graph_ptr = arg_graph_ptr; + } + void set_cuda_graph_node_ptr(cudaGraphNode_t* arg_node_ptr) { + m_graph_node_ptr = arg_node_ptr; + } + cudaGraphNode_t* get_cuda_graph_node_ptr() const { return m_graph_node_ptr; } + cudaGraph_t const* get_cuda_graph_ptr() const { return m_graph_ptr; } + + Kokkos::ObservingRawPtr<base_t> allocate_driver_memory_buffer() const { + KOKKOS_EXPECTS(m_driver_storage == nullptr) + + auto* record = Record::allocate( + Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage", + sizeof(base_t)); + + Record::increment(record); + m_driver_storage = reinterpret_cast<base_t*>(record->data()); + KOKKOS_ENSURES(m_driver_storage != nullptr) + return m_driver_storage; + } +}; + +struct CudaGraphNodeAggregateKernel { + using graph_kernel = CudaGraphNodeAggregateKernel; + + // Aggregates don't need a policy, but for the purposes of checking the static + // assertions about graph kernels, + struct Policy { + using is_graph_kernel = std::true_type; + }; +}; + +template <class KernelType, + class Tag = + typename PatternTagFromImplSpecialization<KernelType>::type> +struct get_graph_node_kernel_type + : identity<GraphNodeKernelImpl<Kokkos::Cuda, typename KernelType::Policy, + typename KernelType::functor_type, Tag>> {}; +template <class KernelType> +struct get_graph_node_kernel_type<KernelType, Kokkos::ParallelReduceTag> + : 
identity<GraphNodeKernelImpl<Kokkos::Cuda, typename KernelType::Policy, + typename KernelType::functor_type, + Kokkos::ParallelReduceTag, + typename KernelType::reducer_type>> {}; + +//============================================================================== +// <editor-fold desc="get_cuda_graph_*() helper functions"> {{{1 + +template <class KernelType> +auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type<KernelType>::type; + auto const& kernel_as_graph_kernel = + static_cast<graph_node_kernel_t const&>(kernel); + // TODO @graphs we need to somehow indicate the need for a fence in the + // destructor of the GraphImpl object (so that we don't have to + // just always do it) + return kernel_as_graph_kernel.allocate_driver_memory_buffer(); +} + +template <class KernelType> +auto const& get_cuda_graph_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type<KernelType>::type; + auto const& kernel_as_graph_kernel = + static_cast<graph_node_kernel_t const&>(kernel); + cudaGraph_t const* graph_ptr = kernel_as_graph_kernel.get_cuda_graph_ptr(); + KOKKOS_EXPECTS(graph_ptr != nullptr); + return *graph_ptr; +} + +template <class KernelType> +auto& get_cuda_graph_node_from_kernel(KernelType const& kernel) { + using graph_node_kernel_t = + typename get_graph_node_kernel_type<KernelType>::type; + auto const& kernel_as_graph_kernel = + static_cast<graph_node_kernel_t const&>(kernel); + auto* graph_node_ptr = kernel_as_graph_kernel.get_cuda_graph_node_ptr(); + KOKKOS_EXPECTS(graph_node_ptr != nullptr); + return *graph_node_ptr; +} + +// </editor-fold> end get_cuda_graph_*() helper functions }}}1 +//============================================================================== + +} // end namespace Impl +} // end namespace Kokkos + +#endif // defined(KOKKOS_ENABLE_CUDA) +#endif // KOKKOS_KOKKOS_CUDA_GRAPHNODEKERNEL_IMPL_HPP diff --git 
a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f4539cd2ca378a95c845d4e189512cf9ba21a200 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNode_Impl.hpp @@ -0,0 +1,103 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_CUDA_GRAPHNODE_IMPL_HPP +#define KOKKOS_KOKKOS_CUDA_GRAPHNODE_IMPL_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_CUDA_ENABLE_GRAPHS) + +#include <Kokkos_Graph_fwd.hpp> + +#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess needs to be complete + +#include <Kokkos_Cuda.hpp> +#include <cuda_runtime_api.h> + +namespace Kokkos { +namespace Impl { + +template <> +struct GraphNodeBackendSpecificDetails<Kokkos::Cuda> { + cudaGraphNode_t node = nullptr; + + //---------------------------------------------------------------------------- + // <editor-fold desc="Ctors, destructor, and assignment"> {{{2 + + explicit GraphNodeBackendSpecificDetails() = default; + + explicit GraphNodeBackendSpecificDetails( + _graph_node_is_root_ctor_tag) noexcept {} + + // </editor-fold> end Ctors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- +}; + +template <class Kernel, class PredecessorRef> +struct GraphNodeBackendDetailsBeforeTypeErasure<Kokkos::Cuda, Kernel, + PredecessorRef> { + protected: + //---------------------------------------------------------------------------- + // <editor-fold desc="ctors, destructor, and assignment"> {{{2 + + 
GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::Cuda const&, Kernel&, PredecessorRef const&, + GraphNodeBackendSpecificDetails<Kokkos::Cuda>&) noexcept {} + + GraphNodeBackendDetailsBeforeTypeErasure( + Kokkos::Cuda const&, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails<Kokkos::Cuda>&) noexcept {} + + // </editor-fold> end ctors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- +}; + +} // end namespace Impl +} // end namespace Kokkos + +#include <Cuda/Kokkos_Cuda_GraphNodeKernel.hpp> + +#endif // defined(KOKKOS_ENABLE_CUDA) +#endif // KOKKOS_KOKKOS_CUDA_GRAPHNODE_IMPL_HPP diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3de7a69916130de41077bae684df0cbc87daea4b --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -0,0 +1,219 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP +#define KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_CUDA_ENABLE_GRAPHS) + +#include <Kokkos_Graph_fwd.hpp> + +#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess needs to be complete + +// GraphNodeImpl needs to be complete because GraphImpl here is a full +// specialization and not just a partial one +#include <impl/Kokkos_GraphNodeImpl.hpp> +#include <Cuda/Kokkos_Cuda_GraphNode_Impl.hpp> + +#include <Kokkos_Cuda.hpp> +#include <cuda_runtime_api.h> + +namespace Kokkos { +namespace Impl { + +template <> +struct GraphImpl<Kokkos::Cuda> { + public: + using execution_space = Kokkos::Cuda; + + private: + execution_space m_execution_space; + cudaGraph_t m_graph = nullptr; + cudaGraphExec_t m_graph_exec = nullptr; + + using cuda_graph_flags_t = unsigned int; + 
+ using node_details_t = GraphNodeBackendSpecificDetails<Kokkos::Cuda>; + + void _instantiate_graph() { + constexpr size_t error_log_size = 256; + cudaGraphNode_t error_node = nullptr; + char error_log[error_log_size]; + CUDA_SAFE_CALL(cudaGraphInstantiate(&m_graph_exec, m_graph, &error_node, + error_log, error_log_size)); + // TODO @graphs print out errors + } + + public: + using root_node_impl_t = + GraphNodeImpl<Kokkos::Cuda, Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag>; + using aggregate_kernel_impl_t = CudaGraphNodeAggregateKernel; + using aggregate_node_impl_t = + GraphNodeImpl<Kokkos::Cuda, aggregate_kernel_impl_t, + Kokkos::Experimental::TypeErasedTag>; + + // Not moveable or copyable; it spends its whole life as a shared_ptr in the + // Graph object + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl const&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() { + // TODO @graphs we need to somehow indicate the need for a fence in the + // destructor of the GraphImpl object (so that we don't have to + // just always do it) + m_execution_space.fence(); + KOKKOS_EXPECTS(bool(m_graph)) + if (bool(m_graph_exec)) { + CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec)); + } + CUDA_SAFE_CALL(cudaGraphDestroy(m_graph)); + }; + + explicit GraphImpl(Kokkos::Cuda arg_instance) + : m_execution_space(std::move(arg_instance)) { + CUDA_SAFE_CALL(cudaGraphCreate(&m_graph, cuda_graph_flags_t{0})); + } + + void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) { + // All of the predecessors are just added as normal, so all we need to + // do here is add an empty node + CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), + m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + } + + template <class NodeImpl> + // requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl + // 
Also requires that the kernel has the graph node tag in it's policy + void add_node(std::shared_ptr<NodeImpl> const& arg_node_ptr) { + static_assert( + NodeImpl::kernel_type::Policy::is_graph_kernel::value, + "Something has gone horribly wrong, but it's too complicated to " + "explain here. Buy Daisy a coffee and she'll explain it to you."); + KOKKOS_EXPECTS(bool(arg_node_ptr)); + // The Kernel launch from the execute() method has been shimmed to insert + // the node into the graph + auto& kernel = arg_node_ptr->get_kernel(); + // note: using arg_node_ptr->node_details_t::node caused an ICE in NVCC 10.1 + auto& cuda_node = static_cast<node_details_t*>(arg_node_ptr.get())->node; + KOKKOS_EXPECTS(!bool(cuda_node)); + kernel.set_cuda_graph_ptr(&m_graph); + kernel.set_cuda_graph_node_ptr(&cuda_node); + kernel.execute(); + KOKKOS_ENSURES(bool(cuda_node)); + } + + template <class NodeImplPtr, class PredecessorRef> + // requires PredecessorRef is a specialization of GraphNodeRef that has + // already been added to this graph and NodeImpl is a specialization of + // GraphNodeImpl that has already been added to this graph. 
+ void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { + KOKKOS_EXPECTS(bool(arg_node_ptr)) + auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); + KOKKOS_EXPECTS(bool(pred_ptr)) + + // clang-format off + // NOTE const-qualifiers below are commented out because of an API break + // from CUDA 10.0 to CUDA 10.1 + // cudaGraphAddDependencies(cudaGraph_t, cudaGraphNode_t*, cudaGraphNode_t*, size_t) + // cudaGraphAddDependencies(cudaGraph_t, const cudaGraphNode_t*, const cudaGraphNode_t*, size_t) + // clang-format on + auto /*const*/& pred_cuda_node = pred_ptr->node_details_t::node; + KOKKOS_EXPECTS(bool(pred_cuda_node)) + + auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node; + KOKKOS_EXPECTS(bool(cuda_node)) + + CUDA_SAFE_CALL( + cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1)); + } + + void submit() { + if (!bool(m_graph_exec)) { + _instantiate_graph(); + } + CUDA_SAFE_CALL( + cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream())); + } + + execution_space const& get_execution_space() const noexcept { + return m_execution_space; + } + + auto create_root_node_ptr() { + KOKKOS_EXPECTS(bool(m_graph)) + KOKKOS_EXPECTS(!bool(m_graph_exec)) + auto rv = std::make_shared<root_node_impl_t>( + get_execution_space(), _graph_node_is_root_ctor_tag{}); + CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + KOKKOS_ENSURES(bool(rv->node_details_t::node)) + return rv; + } + + template <class... PredecessorRefs> + // See requirements/expectations in GraphBuilder + auto create_aggregate_ptr(PredecessorRefs&&...) { + // The attachment to predecessors, which is all we really need, happens + // in the generic layer, which calls through to add_predecessor for + // each predecessor ref, so all we need to do here is create the (trivial) + // aggregate node. 
+ return std::make_shared<aggregate_node_impl_t>( + m_execution_space, _graph_node_kernel_ctor_tag{}, + aggregate_kernel_impl_t{}); + } +}; + +} // end namespace Impl +} // end namespace Kokkos + +#endif // defined(KOKKOS_ENABLE_CUDA) +#endif // KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ec9c434fe663900a5d5029896a5c98ce13266605 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp @@ -0,0 +1,951 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_HALF_HPP_ +#define KOKKOS_CUDA_HALF_HPP_ + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA +#if !(defined(KOKKOS_COMPILER_CLANG) && KOKKOS_COMPILER_CLANG < 900) && \ + !(defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL50) || \ + defined(KOKKOS_ARCH_MAXWELL52)) +#include <cuda_fp16.h> + +#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED +// Make sure no one else tries to define half_t +#define KOKKOS_IMPL_HALF_TYPE_DEFINED + +namespace Kokkos { +namespace Impl { +struct half_impl_t { + using type = __half; +}; +} // namespace Impl +namespace Experimental { + +// Forward declarations +class half_t; + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(bool val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(double val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val); +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val); + 
+template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, bool>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<std::is_same<T, unsigned short>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<std::is_same<T, unsigned long>::value, T> + cast_from_half(half_t); +template <class T> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<std::is_same<T, unsigned long long>::value, T> + cast_from_half(half_t); + +class half_t { + public: + using impl_type = Kokkos::Impl::half_impl_t::type; + + private: + impl_type val; + + public: + KOKKOS_FUNCTION + half_t() : val(0.0F) {} + + // Don't support implicit conversion back to impl_type. + // impl_type is a storage only type on host. 
+ KOKKOS_FUNCTION + explicit operator impl_type() const { return val; } + KOKKOS_FUNCTION + explicit operator float() const { return cast_from_half<float>(*this); } + KOKKOS_FUNCTION + explicit operator bool() const { return cast_from_half<bool>(*this); } + KOKKOS_FUNCTION + explicit operator double() const { return cast_from_half<double>(*this); } + KOKKOS_FUNCTION + explicit operator short() const { return cast_from_half<short>(*this); } + KOKKOS_FUNCTION + explicit operator int() const { return cast_from_half<int>(*this); } + KOKKOS_FUNCTION + explicit operator long() const { return cast_from_half<long>(*this); } + KOKKOS_FUNCTION + explicit operator long long() const { + return cast_from_half<long long>(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned short() const { + return cast_from_half<unsigned short>(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned int() const { + return cast_from_half<unsigned int>(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned long() const { + return cast_from_half<unsigned long>(*this); + } + KOKKOS_FUNCTION + explicit operator unsigned long long() const { + return cast_from_half<unsigned long long>(*this); + } + + /** + * Conversion constructors. + * + * Support implicit conversions from impl_type, float, double -> half_t + * Mixed precision expressions require upcasting which is done in the + * "// Binary Arithmetic" operator overloads below. + * + * Support implicit conversions from integral types -> half_t. + * Expressions involving half_t with integral types require downcasting + * the integral types to half_t. Existing operator overloads can handle this + * with the addition of the below implicit conversion constructors. 
+ */ + KOKKOS_FUNCTION + half_t(impl_type rhs) : val(rhs) {} + KOKKOS_FUNCTION + half_t(float rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(double rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + explicit half_t(bool rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(short rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(int rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(long rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(long long rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {} + + // Unary operators + KOKKOS_FUNCTION + half_t operator+() const { + half_t tmp = *this; +#ifdef __CUDA_ARCH__ + tmp.val = +tmp.val; +#else + tmp.val = __float2half(+__half2float(tmp.val)); +#endif + return tmp; + } + + KOKKOS_FUNCTION + half_t operator-() const { + half_t tmp = *this; +#ifdef __CUDA_ARCH__ + tmp.val = -tmp.val; +#else + tmp.val = __float2half(-__half2float(tmp.val)); +#endif + return tmp; + } + + // Prefix operators + KOKKOS_FUNCTION + half_t& operator++() { +#ifdef __CUDA_ARCH__ + ++val; +#else + float tmp = __half2float(val); + ++tmp; + val = __float2half(tmp); +#endif + return *this; + } + + KOKKOS_FUNCTION + half_t& operator--() { +#ifdef __CUDA_ARCH__ + --val; +#else + float tmp = __half2float(val); + --tmp; + val = __float2half(tmp); +#endif + return *this; + } + + // Postfix operators + KOKKOS_FUNCTION + half_t operator++(int) { + half_t tmp = *this; + operator++(); + return tmp; + } + + KOKKOS_FUNCTION + half_t operator--(int) { + half_t tmp = *this; + operator--(); + return tmp; + } + + // Binary operators + KOKKOS_FUNCTION + half_t& operator=(impl_type rhs) { + val = rhs; + 
return *this; + } + + template <class T> + KOKKOS_FUNCTION half_t& operator=(T rhs) { + val = cast_to_half(rhs).val; + return *this; + } + + template <class T> + KOKKOS_FUNCTION void operator=(T rhs) volatile { + val = cast_to_half(rhs).val; + } + + // Compound operators + KOKKOS_FUNCTION + half_t& operator+=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val += rhs.val; +#else + val = __float2half(__half2float(val) + __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + volatile half_t& operator+=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast<impl_type&>(val) + rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast<impl_type&>(val); + val_ref = __float2half(__half2float(const_cast<impl_type&>(val)) + + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for += + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator+=(T& lhs, half_t rhs) { + lhs += static_cast<T>(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator+=(float rhs) { + float result = static_cast<float>(val) + rhs; + val = static_cast<impl_type>(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator+=(double rhs) { + double result = static_cast<double>(val) + rhs; + val = static_cast<impl_type>(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator-=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val -= rhs.val; +#else + val = __float2half(__half2float(val) - __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + volatile half_t& operator-=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 
supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast<impl_type&>(val) - rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast<impl_type&>(val); + val_ref = __float2half(__half2float(const_cast<impl_type&>(val)) - + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for -= + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator-=(T& lhs, half_t rhs) { + lhs -= static_cast<T>(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator-=(float rhs) { + float result = static_cast<float>(val) - rhs; + val = static_cast<impl_type>(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator-=(double rhs) { + double result = static_cast<double>(val) - rhs; + val = static_cast<impl_type>(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator*=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val *= rhs.val; +#else + val = __float2half(__half2float(val) * __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + volatile half_t& operator*=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. 
+ val = const_cast<impl_type&>(val) * rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast<impl_type&>(val); + val_ref = __float2half(__half2float(const_cast<impl_type&>(val)) * + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for *= + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator*=(T& lhs, half_t rhs) { + lhs *= static_cast<T>(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator*=(float rhs) { + float result = static_cast<float>(val) * rhs; + val = static_cast<impl_type>(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator*=(double rhs) { + double result = static_cast<double>(val) * rhs; + val = static_cast<impl_type>(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator/=(half_t rhs) { +#ifdef __CUDA_ARCH__ + val /= rhs.val; +#else + val = __float2half(__half2float(val) / __half2float(rhs.val)); +#endif + return *this; + } + + KOKKOS_FUNCTION + volatile half_t& operator/=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. 
+ val = const_cast<impl_type&>(val) / rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast<impl_type&>(val); + val_ref = __float2half(__half2float(const_cast<impl_type&>(val)) / + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for /= + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator/=(T& lhs, half_t rhs) { + lhs /= static_cast<T>(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator/=(float rhs) { + float result = static_cast<float>(val) / rhs; + val = static_cast<impl_type>(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator/=(double rhs) { + double result = static_cast<double>(val) / rhs; + val = static_cast<impl_type>(result); + return *this; + } + + // Binary Arithmetic + KOKKOS_FUNCTION + half_t friend operator+(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val += rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); +#endif + return lhs; + } + + // Binary Arithmetic upcast operators for + + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator+(half_t lhs, T rhs) { + return T(lhs) + rhs; + } + + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator+(T lhs, half_t rhs) { + return lhs + T(rhs); + } + + KOKKOS_FUNCTION + half_t friend operator-(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val -= rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); +#endif + return lhs; + } + + // Binary Arithmetic upcast operators for - + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, 
float>::value || std::is_same<T, double>::value, T> friend + operator-(half_t lhs, T rhs) { + return T(lhs) - rhs; + } + + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator-(T lhs, half_t rhs) { + return lhs - T(rhs); + } + + KOKKOS_FUNCTION + half_t friend operator*(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val *= rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); +#endif + return lhs; + } + + // Binary Arithmetic upcast operators for * + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator*(half_t lhs, T rhs) { + return T(lhs) * rhs; + } + + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator*(T lhs, half_t rhs) { + return lhs * T(rhs); + } + + KOKKOS_FUNCTION + half_t friend operator/(half_t lhs, half_t rhs) { +#ifdef __CUDA_ARCH__ + lhs.val /= rhs.val; +#else + lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); +#endif + return lhs; + } + + // Binary Arithmetic upcast operators for / + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator/(half_t lhs, T rhs) { + return T(lhs) / rhs; + } + + template <class T> + KOKKOS_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend + operator/(T lhs, half_t rhs) { + return lhs / T(rhs); + } + + // Logical operators + KOKKOS_FUNCTION + bool operator!() const { +#ifdef __CUDA_ARCH__ + return static_cast<bool>(!val); +#else + return !__half2float(val); +#endif + } + + // NOTE: Loses short-circuit evaluation + KOKKOS_FUNCTION + bool operator&&(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast<bool>(val && rhs.val); +#else + return 
__half2float(val) && __half2float(rhs.val); +#endif + } + + // NOTE: Loses short-circuit evaluation + KOKKOS_FUNCTION + bool operator||(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast<bool>(val || rhs.val); +#else + return __half2float(val) || __half2float(rhs.val); +#endif + } + + // Comparison operators + KOKKOS_FUNCTION + bool operator==(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast<bool>(val == rhs.val); +#else + return __half2float(val) == __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator!=(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast<bool>(val != rhs.val); +#else + return __half2float(val) != __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator<(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast<bool>(val < rhs.val); +#else + return __half2float(val) < __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator>(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast<bool>(val > rhs.val); +#else + return __half2float(val) > __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator<=(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast<bool>(val <= rhs.val); +#else + return __half2float(val) <= __half2float(rhs.val); +#endif + } + + KOKKOS_FUNCTION + bool operator>=(half_t rhs) const { +#ifdef __CUDA_ARCH__ + return static_cast<bool>(val >= rhs.val); +#else + return __half2float(val) >= __half2float(rhs.val); +#endif + } +}; + +// CUDA before 11.1 only has the half <-> float conversions marked host device +// So we will largely convert to float on the host for conversion +// But still call the correct functions on the device +#if (CUDA_VERSION < 11100) + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(half_t val) { return val; } + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val) { return half_t(__float2half(val)); } + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(bool val) { return 
cast_to_half(static_cast<float>(val)); } + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(double val) { + // double2half was only introduced in CUDA 11 too + return half_t(__float2half(static_cast<float>(val))); +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val) { +#ifdef __CUDA_ARCH__ + return half_t(__short2half_rn(val)); +#else + return half_t(__float2half(static_cast<float>(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val) { +#ifdef __CUDA_ARCH__ + return half_t(__ushort2half_rn(val)); +#else + return half_t(__float2half(static_cast<float>(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val) { +#ifdef __CUDA_ARCH__ + return half_t(__int2half_rn(val)); +#else + return half_t(__float2half(static_cast<float>(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val) { +#ifdef __CUDA_ARCH__ + return half_t(__uint2half_rn(val)); +#else + return half_t(__float2half(static_cast<float>(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val) { +#ifdef __CUDA_ARCH__ + return half_t(__ll2half_rn(val)); +#else + return half_t(__float2half(static_cast<float>(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val) { +#ifdef __CUDA_ARCH__ + return half_t(__ull2half_rn(val)); +#else + return half_t(__float2half(static_cast<float>(val))); +#endif +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val) { + return cast_to_half(static_cast<long long>(val)); +} + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val) { + return cast_to_half(static_cast<unsigned long long>(val)); +} + +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T> +cast_from_half(half_t val) { + return __half2float(half_t::impl_type(val)); +} + +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, bool>::value, T> +cast_from_half(half_t val) { + return 
static_cast<T>(cast_from_half<float>(val)); +} + +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T> +cast_from_half(half_t val) { + return static_cast<T>(__half2float(half_t::impl_type(val))); +} + +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T> +cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2short_rz(half_t::impl_type(val)); +#else + return static_cast<T>(__half2float(half_t::impl_type(val))); +#endif +} + +template <class T> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<std::is_same<T, unsigned short>::value, T> + cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2ushort_rz(half_t::impl_type(val)); +#else + return static_cast<T>(__half2float(half_t::impl_type(val))); +#endif +} +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T> +cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2int_rz(half_t::impl_type(val)); +#else + return static_cast<T>(__half2float(half_t::impl_type(val))); +#endif +} + +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned>::value, T> +cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2uint_rz(half_t::impl_type(val)); +#else + return static_cast<T>(__half2float(half_t::impl_type(val))); +#endif +} + +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T> +cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2ll_rz(half_t::impl_type(val)); +#else + return static_cast<T>(__half2float(half_t::impl_type(val))); +#endif +} + +template <class T> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<std::is_same<T, unsigned long long>::value, T> + cast_from_half(half_t val) { +#ifdef __CUDA_ARCH__ + return __half2ull_rz(half_t::impl_type(val)); +#else + return static_cast<T>(__half2float(half_t::impl_type(val))); +#endif +} + +template <class T> +KOKKOS_INLINE_FUNCTION 
std::enable_if_t<std::is_same<T, long>::value, T> +cast_from_half(half_t val) { + return static_cast<T>(cast_from_half<long long>(val)); +} + +template <class T> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<std::is_same<T, unsigned long>::value, T> + cast_from_half(half_t val) { + return static_cast<T>(cast_from_half<unsigned long long>(val)); +} + +#else // CUDA 11.1 versions follow + +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val) { return __float2half(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(double val) { return __double2half(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val) { return __short2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val) { return __ushort2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val) { return __int2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val) { return __uint2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val) { return __ll2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val) { return __ull2half_rn(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val) { + return cast_to_half(static_cast<long long>(val)); +} +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val) { + return cast_to_half(static_cast<unsigned long long>(val)); +} + +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, float>::value, T> +cast_from_half(half_t val) { + return __half2float(val); +} +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, double>::value, T> +cast_from_half(half_t val) { + return __half2double(val); +} +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, short>::value, T> +cast_from_half(half_t val) { + return __half2short_rz(val); +} +template <class T> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<std::is_same<T, unsigned short>::value, T> + cast_from_half(half_t val) { + return 
__half2ushort_rz(val); +} +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, int>::value, T> +cast_from_half(half_t val) { + return __half2int_rz(val); +} +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, unsigned int>::value, T> +cast_from_half(half_t val) { + return __half2uint_rz(val); +} +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long long>::value, T> +cast_from_half(half_t val) { + return __half2ll_rz(val); +} +template <class T> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<std::is_same<T, unsigned long long>::value, T> + cast_from_half(half_t val) { + return __half2ull_rz(val); +} +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_same<T, long>::value, T> +cast_from_half(half_t val) { + return static_cast<T>(cast_from_half<long long>(val)); +} +template <class T> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<std::is_same<T, unsigned long>::value, T> + cast_from_half(half_t val) { + return static_cast<T>(cast_from_half<unsigned long long>(val)); +} +#endif +} // namespace Experimental +} // namespace Kokkos +#endif // KOKKOS_IMPL_HALF_TYPE_DEFINED +#endif // KOKKOS_ENABLE_CUDA +#endif // Disables for half_t on cuda: + // Clang/8||KEPLER30||KEPLER32||KEPLER37||MAXWELL50||MAXWELL52 +#endif diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp new file mode 100644 index 0000000000000000000000000000000000000000..016cb6cdcbdd37740613724bb99efb9b4c32d7d4 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -0,0 +1,956 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/*--------------------------------------------------------------------------*/ +/* Kokkos interfaces */ + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA + +#include <Kokkos_Core.hpp> + +#include <Cuda/Kokkos_Cuda_Error.hpp> +#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> +#include <Cuda/Kokkos_Cuda_Instance.hpp> +#include <Cuda/Kokkos_Cuda_Locks.hpp> +#include <Cuda/Kokkos_Cuda_UniqueToken.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_Tools.hpp> + +/*--------------------------------------------------------------------------*/ +/* Standard 'C' libraries */ +#include <cstdlib> + +/* Standard 'C++' libraries */ +#include <vector> +#include <iostream> +#include <sstream> +#include <string> + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION +namespace Kokkos { +namespace Impl { + +bool CudaInternal::kokkos_impl_cuda_use_serial_execution_v = false; + +void CudaInternal::cuda_set_serial_execution(bool val) { + CudaInternal::kokkos_impl_cuda_use_serial_execution_v = val; +} +bool CudaInternal::cuda_use_serial_execution() { + return CudaInternal::kokkos_impl_cuda_use_serial_execution_v; +} + +} // namespace Impl +} // namespace Kokkos + +void kokkos_impl_cuda_set_serial_execution(bool val) { + Kokkos::Impl::CudaInternal::cuda_set_serial_execution(val); +} +bool kokkos_impl_cuda_use_serial_execution() { + return Kokkos::Impl::CudaInternal::cuda_use_serial_execution(); +} +#endif + +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE + +__device__ __constant__ unsigned long kokkos_impl_cuda_constant_memory_buffer + [Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long)]; + +#endif + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +namespace { + +__global__ void query_cuda_kernel_arch(int *d_arch) { +#if defined(__CUDA_ARCH__) + *d_arch 
= __CUDA_ARCH__; +#else + *d_arch = 0; +#endif +} + +/** Query what compute capability is actually launched to the device: */ +int cuda_kernel_arch() { + int arch = 0; + int *d_arch = nullptr; + + cudaMalloc((void **)&d_arch, sizeof(int)); + cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault); + + query_cuda_kernel_arch<<<1, 1>>>(d_arch); + + cudaMemcpy(&arch, d_arch, sizeof(int), cudaMemcpyDefault); + cudaFree(d_arch); + return arch; +} + +#ifdef KOKKOS_ENABLE_CUDA_UVM +bool cuda_launch_blocking() { + const char *env = getenv("CUDA_LAUNCH_BLOCKING"); + + if (env == nullptr) return false; + + return std::stoi(env); +} +#endif + +} // namespace + +void cuda_device_synchronize() { CUDA_SAFE_CALL(cudaDeviceSynchronize()); } + +void cuda_internal_error_throw(cudaError e, const char *name, const char *file, + const int line) { + std::ostringstream out; + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); + if (file) { + out << " " << file << ":" << line; + } + throw_runtime_exception(out.str()); +} + +//---------------------------------------------------------------------------- +// Some significant cuda device properties: +// +// cudaDeviceProp::name : Text label for device +// cudaDeviceProp::major : Device major number +// cudaDeviceProp::minor : Device minor number +// cudaDeviceProp::warpSize : number of threads per warp +// cudaDeviceProp::multiProcessorCount : number of multiprocessors +// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block +// cudaDeviceProp::totalConstMem : capacity of constant memory +// cudaDeviceProp::totalGlobalMem : capacity of global memory +// cudaDeviceProp::maxGridSize[3] : maximum grid size + +// +// Section 4.4.2.4 of the CUDA Toolkit Reference Manual +// +// struct cudaDeviceProp { +// char name[256]; +// size_t totalGlobalMem; +// size_t sharedMemPerBlock; +// int regsPerBlock; +// int warpSize; +// size_t memPitch; +// int maxThreadsPerBlock; +// int maxThreadsDim[3]; 
+// int maxGridSize[3]; +// size_t totalConstMem; +// int major; +// int minor; +// int clockRate; +// size_t textureAlignment; +// int deviceOverlap; +// int multiProcessorCount; +// int kernelExecTimeoutEnabled; +// int integrated; +// int canMapHostMemory; +// int computeMode; +// int concurrentKernels; +// int ECCEnabled; +// int pciBusID; +// int pciDeviceID; +// int tccDriver; +// int asyncEngineCount; +// int unifiedAddressing; +// int memoryClockRate; +// int memoryBusWidth; +// int l2CacheSize; +// int maxThreadsPerMultiProcessor; +// }; + +namespace { + +class CudaInternalDevices { + public: + enum { MAXIMUM_DEVICE_COUNT = 64 }; + struct cudaDeviceProp m_cudaProp[MAXIMUM_DEVICE_COUNT]; + int m_cudaDevCount; + + CudaInternalDevices(); + + static const CudaInternalDevices &singleton(); +}; + +CudaInternalDevices::CudaInternalDevices() { + // See 'cudaSetDeviceFlags' for host-device thread interaction + // Section 4.4.2.6 of the CUDA Toolkit Reference Manual + + CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount)); + + if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) { + Kokkos::abort( + "Sorry, you have more GPUs per node than we thought anybody would ever " + "have. 
Please report this to github.com/kokkos/kokkos."); + } + for (int i = 0; i < m_cudaDevCount; ++i) { + CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i)); + } +} + +const CudaInternalDevices &CudaInternalDevices::singleton() { + static CudaInternalDevices self; + return self; +} + +} // namespace + +unsigned long *CudaInternal::constantMemHostStaging = nullptr; +cudaEvent_t CudaInternal::constantMemReusable = nullptr; + +//---------------------------------------------------------------------------- + +void CudaInternal::print_configuration(std::ostream &s) const { + const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); + +#if defined(KOKKOS_ENABLE_CUDA) + s << "macro KOKKOS_ENABLE_CUDA : defined\n"; +#endif +#if defined(CUDA_VERSION) + s << "macro CUDA_VERSION = " << CUDA_VERSION << " = version " + << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n'; +#endif + + for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { + s << "Kokkos::Cuda[ " << i << " ] " << dev_info.m_cudaProp[i].name + << " capability " << dev_info.m_cudaProp[i].major << "." 
+ << dev_info.m_cudaProp[i].minor << ", Total Global Memory: " + << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) + << ", Shared Memory per Block: " + << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); + if (m_cudaDev == i) s << " : Selected"; + s << std::endl; + } +} + +//---------------------------------------------------------------------------- + +CudaInternal::~CudaInternal() { + if (m_stream || m_scratchSpace || m_scratchFlags || m_scratchUnified || + m_scratchConcurrentBitset) { + std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" + << std::endl; + } + + m_cudaDev = -1; + m_cudaArch = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxBlock = 0; + m_maxSharedWords = 0; + m_maxConcurrency = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchUnifiedCount = 0; + m_scratchUnifiedSupported = 0; + m_streamCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchUnified = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; + m_team_scratch_current_size = 0; + m_team_scratch_ptr = nullptr; +} + +int CudaInternal::verify_is_initialized(const char *const label) const { + if (m_cudaDev < 0) { + std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" + << std::endl; + } + return 0 <= m_cudaDev; +} + +CudaInternal &CudaInternal::singleton() { + static CudaInternal self; + return self; +} +void CudaInternal::fence() const { + CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream)); +} + +void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { + if (was_finalized) + Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); + was_initialized = true; + if (is_initialized()) return; + + enum { WordSize = sizeof(size_type) }; + +#ifndef KOKKOS_IMPL_TURN_OFF_CUDA_HOST_INIT_CHECK + if (!HostSpace::execution_space::impl_is_initialized()) { + const std::string msg( + "Cuda::initialize ERROR : HostSpace::execution_space is not 
" + "initialized"); + throw_runtime_exception(msg); + } +#endif + + const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); + + const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags; + + const bool ok_id = + 0 <= cuda_device_id && cuda_device_id < dev_info.m_cudaDevCount; + + // Need device capability 3.0 or better + + const bool ok_dev = + ok_id && (3 <= dev_info.m_cudaProp[cuda_device_id].major && + 0 <= dev_info.m_cudaProp[cuda_device_id].minor); + + if (ok_init && ok_dev) { + const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; + + m_cudaDev = cuda_device_id; + m_deviceProp = cudaProp; + + CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); + Kokkos::Impl::cuda_device_synchronize(); + + // Query what compute capability architecture a kernel executes: + m_cudaArch = cuda_kernel_arch(); + + if (m_cudaArch == 0) { + std::stringstream ss; + ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; + std::string msg = ss.str(); + Kokkos::abort(msg.c_str()); + } + + int compiled_major = m_cudaArch / 100; + int compiled_minor = (m_cudaArch % 100) / 10; + + if (compiled_major != cudaProp.major || compiled_minor > cudaProp.minor) { + std::stringstream ss; + ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for " + "compute capability " + << compiled_major << "." << compiled_minor + << " on device with compute capability " << cudaProp.major << "." + << cudaProp.minor << " is not supported by CUDA!\n"; + std::string msg = ss.str(); + Kokkos::abort(msg.c_str()); + } + if (Kokkos::show_warnings() && (compiled_major != cudaProp.major || + compiled_minor != cudaProp.minor)) { + std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled " + "for compute capability " + << compiled_major << "." << compiled_minor + << " on device with compute capability " << cudaProp.major + << "." << cudaProp.minor + << " , this will likely reduce potential performance." 
+ << std::endl; + } + + // number of multiprocessors + + m_multiProcCount = cudaProp.multiProcessorCount; + + //---------------------------------- + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + + m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize; + + if (Impl::CudaTraits::WarpSize < m_maxWarpCount) { + m_maxWarpCount = Impl::CudaTraits::WarpSize; + } + + m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize; + + //---------------------------------- + // Maximum number of blocks: + + m_maxBlock = cudaProp.maxGridSize[0]; + + m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor; + m_maxShmemPerBlock = cudaProp.sharedMemPerBlock; + m_regsPerSM = cudaProp.regsPerMultiprocessor; + m_maxBlocksPerSM = + m_cudaArch < 500 + ? 16 + : (m_cudaArch < 750 ? 32 : (m_cudaArch == 750 ? 16 : 32)); + m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor; + m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock; + + //---------------------------------- + + m_scratchUnifiedSupported = cudaProp.unifiedAddressing; + + if (Kokkos::show_warnings() && !m_scratchUnifiedSupported) { + std::cerr << "Kokkos::Cuda device " << cudaProp.name << " capability " + << cudaProp.major << "." << cudaProp.minor + << " does not support unified virtual address space" + << std::endl; + } + + //---------------------------------- + // Multiblock reduction uses scratch flags for counters + // and scratch space for partial reduction values. + // Allocate some initial space. This will grow as needed. + + { + const unsigned reduce_block_count = + m_maxWarpCount * Impl::CudaTraits::WarpSize; + + (void)scratch_unified(16 * sizeof(size_type)); + (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); + (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); + } + //---------------------------------- + // Concurrent bitset for obtaining unique tokens from within + // an executing kernel. 
+ { + m_maxConcurrency = m_maxThreadsPerSM * cudaProp.multiProcessorCount; + + const int32_t buffer_bound = + Kokkos::Impl::concurrent_bitset::buffer_bound(m_maxConcurrency); + + // Allocate and initialize uint32_t[ buffer_bound ] + + using Record = + Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; + + Record *const r = + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchBitset", + sizeof(uint32_t) * buffer_bound); + + Record::increment(r); + + m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data()); + + CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0, + sizeof(uint32_t) * buffer_bound)); + } + //---------------------------------- + + } else { + std::ostringstream msg; + msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED"; + + if (!ok_init) { + msg << " : Already initialized"; + } + if (!ok_id) { + msg << " : Device identifier out of range " + << "[0.." << dev_info.m_cudaDevCount << "]"; + } else if (!ok_dev) { + msg << " : Device "; + msg << dev_info.m_cudaProp[cuda_device_id].major; + msg << "."; + msg << dev_info.m_cudaProp[cuda_device_id].minor; + msg << " has insufficient capability, required 3.0 or better"; + } + Kokkos::Impl::throw_runtime_exception(msg.str()); + } + +#ifdef KOKKOS_ENABLE_CUDA_UVM + if (Kokkos::show_warnings() && !cuda_launch_blocking()) { + std::cerr << R"warning( +Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default + without setting CUDA_LAUNCH_BLOCKING=1. 
+ The code must call Cuda().fence() after each kernel + or will likely crash when accessing data on the host.)warning" + << std::endl; + } + + const char *env_force_device_alloc = + getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC"); + bool force_device_alloc; + if (env_force_device_alloc == nullptr) + force_device_alloc = false; + else + force_device_alloc = std::stoi(env_force_device_alloc) != 0; + + const char *env_visible_devices = getenv("CUDA_VISIBLE_DEVICES"); + bool visible_devices_one = true; + if (env_visible_devices == nullptr) visible_devices_one = false; + + if (Kokkos::show_warnings() && + (!visible_devices_one && !force_device_alloc)) { + std::cerr << R"warning( +Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default + without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or + setting CUDA_VISIBLE_DEVICES. + This could on multi GPU systems lead to severe performance + penalties.)warning" + << std::endl; + } +#endif + +#ifdef KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API + cudaThreadSetCacheConfig(cudaFuncCachePreferShared); +#else + cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); +#endif + + // Init the array used for arbitrarily sized atomics + if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays(); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (stream == nullptr) { + CUDA_SAFE_CALL(cudaMallocHost((void **)&constantMemHostStaging, + CudaTraits::ConstantMemoryUsage)); + + CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable)); + } + + m_stream = stream; + m_team_scratch_current_size = 0; + m_team_scratch_ptr = nullptr; +} + +//---------------------------------------------------------------------------- + +using ScratchGrain = Cuda::size_type[Impl::CudaTraits::WarpSize]; +enum { sizeScratchGrain = sizeof(ScratchGrain) }; + +Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const { + if 
(verify_is_initialized("scratch_flags") && + m_scratchFlagsCount * sizeScratchGrain < size) { + m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain; + + using Record = + Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; + + if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + + Record *const r = + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", + (sizeof(ScratchGrain) * m_scratchFlagsCount)); + + Record::increment(r); + + m_scratchFlags = reinterpret_cast<size_type *>(r->data()); + + CUDA_SAFE_CALL( + cudaMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain)); + } + + return m_scratchFlags; +} + +Cuda::size_type *CudaInternal::scratch_space(const Cuda::size_type size) const { + if (verify_is_initialized("scratch_space") && + m_scratchSpaceCount * sizeScratchGrain < size) { + m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain; + + using Record = + Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; + + if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + + Record *const r = + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", + (sizeof(ScratchGrain) * m_scratchSpaceCount)); + + Record::increment(r); + + m_scratchSpace = reinterpret_cast<size_type *>(r->data()); + } + + return m_scratchSpace; +} + +Cuda::size_type *CudaInternal::scratch_unified( + const Cuda::size_type size) const { + if (verify_is_initialized("scratch_unified") && m_scratchUnifiedSupported && + m_scratchUnifiedCount * sizeScratchGrain < size) { + m_scratchUnifiedCount = (size + sizeScratchGrain - 1) / sizeScratchGrain; + + using Record = + Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>; + + if (m_scratchUnified) + Record::decrement(Record::get_record(m_scratchUnified)); + + Record *const r = Record::allocate( + Kokkos::CudaHostPinnedSpace(), "Kokkos::InternalScratchUnified", + (sizeof(ScratchGrain) * 
m_scratchUnifiedCount)); + + Record::increment(r); + + m_scratchUnified = reinterpret_cast<size_type *>(r->data()); + } + + return m_scratchUnified; +} + +Cuda::size_type *CudaInternal::scratch_functor( + const Cuda::size_type size) const { + if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { + m_scratchFunctorSize = size; + + using Record = + Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; + + if (m_scratchFunctor) + Record::decrement(Record::get_record(m_scratchFunctor)); + + Record *const r = + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor", + m_scratchFunctorSize); + + Record::increment(r); + + m_scratchFunctor = reinterpret_cast<size_type *>(r->data()); + } + + return m_scratchFunctor; +} + +void *CudaInternal::resize_team_scratch_space(std::int64_t bytes, + bool force_shrink) { + if (m_team_scratch_current_size == 0) { + m_team_scratch_current_size = bytes; + m_team_scratch_ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>( + "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size); + } + if ((bytes > m_team_scratch_current_size) || + ((bytes < m_team_scratch_current_size) && (force_shrink))) { + m_team_scratch_current_size = bytes; + m_team_scratch_ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>( + m_team_scratch_ptr, m_team_scratch_current_size); + } + return m_team_scratch_ptr; +} + +//---------------------------------------------------------------------------- + +void CudaInternal::finalize() { + // skip if finalize() has already been called + if (was_finalized) return; + + was_finalized = true; + if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { + // Only finalize this if we're the singleton + if (this == &singleton()) { + Impl::finalize_host_cuda_lock_arrays(); + } + + using RecordCuda = Kokkos::Impl::SharedAllocationRecord<CudaSpace>; + using RecordHost = + Kokkos::Impl::SharedAllocationRecord<CudaHostPinnedSpace>; + + 
RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags)); + RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace)); + RecordHost::decrement(RecordHost::get_record(m_scratchUnified)); + RecordCuda::decrement(RecordCuda::get_record(m_scratchConcurrentBitset)); + if (m_scratchFunctorSize > 0) + RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor)); + + if (m_team_scratch_current_size > 0) + Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr); + + m_cudaDev = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxBlock = 0; + m_maxSharedWords = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchUnifiedCount = 0; + m_streamCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchUnified = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; + m_team_scratch_current_size = 0; + m_team_scratch_ptr = nullptr; + } + + // only destroy these if we're finalizing the singleton + if (this == &singleton()) { + cudaFreeHost(constantMemHostStaging); + cudaEventDestroy(constantMemReusable); + auto &deep_copy_space = + Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if (deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); + cudaStreamDestroy(cuda_get_deep_copy_stream()); + } +} + +//---------------------------------------------------------------------------- + +Cuda::size_type cuda_internal_multiprocessor_count() { + return CudaInternal::singleton().m_multiProcCount; +} + +CudaSpace::size_type cuda_internal_maximum_concurrent_block_count() { +#if defined(KOKKOS_ARCH_KEPLER) + // Compute capability 3.0 through 3.7 + enum : int { max_resident_blocks_per_multiprocessor = 16 }; +#else + // Compute capability 5.0 through 6.2 + enum : int { max_resident_blocks_per_multiprocessor = 32 }; +#endif + return CudaInternal::singleton().m_multiProcCount * + max_resident_blocks_per_multiprocessor; +}; + +Cuda::size_type cuda_internal_maximum_warp_count() { + return 
CudaInternal::singleton().m_maxWarpCount; +} + +Cuda::size_type cuda_internal_maximum_grid_count() { + return CudaInternal::singleton().m_maxBlock; +} + +Cuda::size_type cuda_internal_maximum_shared_words() { + return CudaInternal::singleton().m_maxSharedWords; +} + +Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance, + const Cuda::size_type size) { + return instance.impl_internal_space_instance()->scratch_space(size); +} + +Cuda::size_type *cuda_internal_scratch_flags(const Cuda &instance, + const Cuda::size_type size) { + return instance.impl_internal_space_instance()->scratch_flags(size); +} + +Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance, + const Cuda::size_type size) { + return instance.impl_internal_space_instance()->scratch_unified(size); +} + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +Cuda::size_type Cuda::detect_device_count() { + return Impl::CudaInternalDevices::singleton().m_cudaDevCount; +} + +int Cuda::concurrency() { + return Impl::CudaInternal::singleton().m_maxConcurrency; +} + +int Cuda::impl_is_initialized() { + return Impl::CudaInternal::singleton().is_initialized(); +} + +void Cuda::impl_initialize(const Cuda::SelectDevice config, + size_t /*num_instances*/) { + Impl::CudaInternal::singleton().initialize(config.cuda_device_id, nullptr); +} + +std::vector<unsigned> Cuda::detect_device_arch() { + const Impl::CudaInternalDevices &s = Impl::CudaInternalDevices::singleton(); + + std::vector<unsigned> output(s.m_cudaDevCount); + + for (int i = 0; i < s.m_cudaDevCount; ++i) { + output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor; + } + + return output; +} + +Cuda::size_type Cuda::device_arch() { + const int dev_id = Impl::CudaInternal::singleton().m_cudaDev; + + int dev_arch = 0; + + if (0 <= dev_id) { + const struct cudaDeviceProp &cudaProp = + 
Impl::CudaInternalDevices::singleton().m_cudaProp[dev_id]; + + dev_arch = cudaProp.major * 100 + cudaProp.minor; + } + + return dev_arch; +} + +void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } + +Cuda::Cuda() + : m_space_instance(&Impl::CudaInternal::singleton(), + [](Impl::CudaInternal *) {}) { + Impl::CudaInternal::singleton().verify_is_initialized( + "Cuda instance constructor"); +} + +Cuda::Cuda(cudaStream_t stream) + : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { + ptr->finalize(); + delete ptr; + }) { + Impl::CudaInternal::singleton().verify_is_initialized( + "Cuda instance constructor"); + m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev, + stream); +} + +void Cuda::print_configuration(std::ostream &s, const bool) { + Impl::CudaInternal::singleton().print_configuration(s); +} + +void Cuda::impl_static_fence() { Kokkos::Impl::cuda_device_synchronize(); } + +void Cuda::fence() const { m_space_instance->fence(); } + +const char *Cuda::name() { return "Cuda"; } + +cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream; } +int Cuda::cuda_device() const { return m_space_instance->m_cudaDev; } +const cudaDeviceProp &Cuda::cuda_device_prop() const { + return m_space_instance->m_deviceProp; +} + +namespace Impl { + +int get_gpu(const InitArguments &args); + +int g_cuda_space_factory_initialized = + initialize_space_factory<CudaSpaceInitializer>("150_Cuda"); + +void CudaSpaceInitializer::initialize(const InitArguments &args) { + int use_gpu = get_gpu(args); + if (std::is_same<Kokkos::Cuda, Kokkos::DefaultExecutionSpace>::value || + 0 < use_gpu) { + if (use_gpu > -1) { + Kokkos::Cuda::impl_initialize(Kokkos::Cuda::SelectDevice(use_gpu)); + } else { + Kokkos::Cuda::impl_initialize(); + } + } +} + +void CudaSpaceInitializer::finalize(bool all_spaces) { + if ((std::is_same<Kokkos::Cuda, Kokkos::DefaultExecutionSpace>::value || + all_spaces) && + 
Kokkos::Cuda::impl_is_initialized()) { + Kokkos::Cuda::impl_finalize(); + } +} + +void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); } + +void CudaSpaceInitializer::print_configuration(std::ostream &msg, + const bool detail) { + msg << "Device Execution Space:\n"; + msg << " KOKKOS_ENABLE_CUDA: yes\n"; + + msg << "Cuda Atomics:\n"; + msg << " KOKKOS_ENABLE_CUDA_ATOMICS: "; +#ifdef KOKKOS_ENABLE_CUDA_ATOMICS + msg << "yes\n"; +#else + msg << "no\n"; +#endif + + msg << "Cuda Options:\n"; + msg << " KOKKOS_ENABLE_CUDA_LAMBDA: "; +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA + msg << "yes\n"; +#else + msg << "no\n"; +#endif + msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; +#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC + msg << "yes\n"; +#else + msg << "no\n"; +#endif + msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE + msg << "yes\n"; +#else + msg << "no\n"; +#endif + msg << " KOKKOS_ENABLE_CUDA_UVM: "; +#ifdef KOKKOS_ENABLE_CUDA_UVM + msg << "yes\n"; +#else + msg << "no\n"; +#endif + msg << " KOKKOS_ENABLE_CUSPARSE: "; +#ifdef KOKKOS_ENABLE_CUSPARSE + msg << "yes\n"; +#else + msg << "no\n"; +#endif + msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + msg << "yes\n"; +#else + msg << "no\n"; +#endif + + msg << "\nCuda Runtime Configuration:" << std::endl; + Cuda::print_configuration(msg, detail); +} +} // namespace Impl + +} // namespace Kokkos + +namespace Kokkos { +namespace Experimental { + +UniqueToken<Kokkos::Cuda, Kokkos::Experimental::UniqueTokenScope::Global>:: + UniqueToken(Kokkos::Cuda const &) + : m_buffer( + Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset), + m_count(Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency) {} + +} // namespace Experimental +} // namespace Kokkos + +#else + +void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {} + +#endif // KOKKOS_ENABLE_CUDA diff --git 
a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp new file mode 100644 index 0000000000000000000000000000000000000000..aaec2c29260a5ad2b82e2daa653a58372253cd4d --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -0,0 +1,194 @@ +#ifndef KOKKOS_CUDA_INSTANCE_HPP_ +#define KOKKOS_CUDA_INSTANCE_HPP_ + +#include <vector> +#include <impl/Kokkos_Tools.hpp> +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// These functions fulfill the purpose of allowing to work around +// a suspected system software issue, or to check for race conditions. +// They are not currently a fully officially supported capability. +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION +extern "C" void kokkos_impl_cuda_set_serial_execution(bool); +extern "C" bool kokkos_impl_cuda_use_serial_execution(); +#endif + +namespace Kokkos { +namespace Impl { + +struct CudaTraits { + static constexpr CudaSpace::size_type WarpSize = 32 /* 0x0020 */; + static constexpr CudaSpace::size_type WarpIndexMask = + 0x001f; /* Mask for warpindex */ + static constexpr CudaSpace::size_type WarpIndexShift = + 5; /* WarpSize == 1 << WarpShift */ + + static constexpr CudaSpace::size_type ConstantMemoryUsage = + 0x008000; /* 32k bytes */ + static constexpr CudaSpace::size_type ConstantMemoryCache = + 0x002000; /* 8k bytes */ + static constexpr CudaSpace::size_type KernelArgumentLimit = + 0x001000; /* 4k bytes */ + static constexpr CudaSpace::size_type MaxHierarchicalParallelism = + 1024; /* team_size * vector_length */ + using ConstantGlobalBufferType = + unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; + + static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */; + + KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( + CudaSpace::size_type i) { + return (i + WarpIndexMask) >> 
WarpIndexShift; + } + + KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align( + CudaSpace::size_type i) { + constexpr CudaSpace::size_type Mask = ~WarpIndexMask; + return (i + WarpIndexMask) & Mask; + } +}; + +//---------------------------------------------------------------------------- + +CudaSpace::size_type cuda_internal_multiprocessor_count(); +CudaSpace::size_type cuda_internal_maximum_warp_count(); +CudaSpace::size_type cuda_internal_maximum_grid_count(); +CudaSpace::size_type cuda_internal_maximum_shared_words(); + +CudaSpace::size_type cuda_internal_maximum_concurrent_block_count(); + +CudaSpace::size_type* cuda_internal_scratch_flags( + const Cuda&, const CudaSpace::size_type size); +CudaSpace::size_type* cuda_internal_scratch_space( + const Cuda&, const CudaSpace::size_type size); +CudaSpace::size_type* cuda_internal_scratch_unified( + const Cuda&, const CudaSpace::size_type size); + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +namespace Kokkos { +namespace Impl { + +class CudaInternal { + private: + CudaInternal(const CudaInternal&); + CudaInternal& operator=(const CudaInternal&); +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + static bool kokkos_impl_cuda_use_serial_execution_v; +#endif + + public: + using size_type = Cuda::size_type; + + int m_cudaDev; + + // Device Properties + int m_cudaArch; + unsigned m_multiProcCount; + unsigned m_maxWarpCount; + unsigned m_maxBlock; + unsigned m_maxSharedWords; + uint32_t m_maxConcurrency; + int m_shmemPerSM; + int m_maxShmemPerBlock; + int m_regsPerSM; + int m_maxBlocksPerSM; + int m_maxThreadsPerSM; + int m_maxThreadsPerBlock; + + cudaDeviceProp m_deviceProp; + + // Scratch Spaces for Reductions + mutable size_type m_scratchSpaceCount; + mutable size_type m_scratchFlagsCount; + mutable size_type m_scratchUnifiedCount; + mutable size_type m_scratchFunctorSize; + + size_type m_scratchUnifiedSupported; + size_type 
m_streamCount; + mutable size_type* m_scratchSpace; + mutable size_type* m_scratchFlags; + mutable size_type* m_scratchUnified; + mutable size_type* m_scratchFunctor; + uint32_t* m_scratchConcurrentBitset; + cudaStream_t m_stream; + + // Team Scratch Level 1 Space + mutable int64_t m_team_scratch_current_size; + mutable void* m_team_scratch_ptr; + + bool was_initialized = false; + bool was_finalized = false; + + // FIXME_CUDA: these want to be per-device, not per-stream... use of 'static' + // here will break once there are multiple devices though + static unsigned long* constantMemHostStaging; + static cudaEvent_t constantMemReusable; + + static CudaInternal& singleton(); + + int verify_is_initialized(const char* const label) const; + + int is_initialized() const { + return nullptr != m_scratchSpace && nullptr != m_scratchFlags; + } + + void initialize(int cuda_device_id, cudaStream_t stream = nullptr); + void finalize(); + + void print_configuration(std::ostream&) const; + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + static bool cuda_use_serial_execution(); + static void cuda_set_serial_execution(bool); +#endif + + void fence() const; + + ~CudaInternal(); + + CudaInternal() + : m_cudaDev(-1), + m_cudaArch(-1), + m_multiProcCount(0), + m_maxWarpCount(0), + m_maxBlock(0), + m_maxSharedWords(0), + m_maxConcurrency(0), + m_shmemPerSM(0), + m_maxShmemPerBlock(0), + m_regsPerSM(0), + m_maxBlocksPerSM(0), + m_maxThreadsPerSM(0), + m_maxThreadsPerBlock(0), + m_scratchSpaceCount(0), + m_scratchFlagsCount(0), + m_scratchUnifiedCount(0), + m_scratchFunctorSize(0), + m_scratchUnifiedSupported(0), + m_streamCount(0), + m_scratchSpace(nullptr), + m_scratchFlags(nullptr), + m_scratchUnified(nullptr), + m_scratchFunctor(nullptr), + m_scratchConcurrentBitset(nullptr), + m_stream(nullptr), + m_team_scratch_current_size(0), + m_team_scratch_ptr(nullptr) {} + + // Resizing of reduction related scratch spaces + size_type* scratch_space(const size_type size) const; + size_type* 
scratch_flags(const size_type size) const; + size_type* scratch_unified(const size_type size) const; + size_type* scratch_functor(const size_type size) const; + + // Resizing of team level 1 scratch + void* resize_team_scratch_space(std::int64_t bytes, + bool force_shrink = false); +}; + +} // Namespace Impl +} // Namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d892a893b330772ec5e4306ed20a44f8aa2369f1 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -0,0 +1,718 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDAEXEC_HPP +#define KOKKOS_CUDAEXEC_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA + +#include <mutex> +#include <string> +#include <cstdint> +#include <cmath> +#include <Kokkos_Parallel.hpp> +#include <impl/Kokkos_Error.hpp> +#include <Cuda/Kokkos_Cuda_abort.hpp> +#include <Cuda/Kokkos_Cuda_Error.hpp> +#include <Cuda/Kokkos_Cuda_Locks.hpp> +#include <Cuda/Kokkos_Cuda_Instance.hpp> +#include <impl/Kokkos_GraphImpl_fwd.hpp> +#include <Cuda/Kokkos_Cuda_GraphNodeKernel.hpp> +#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +/** \brief Access to constant memory on the device */ +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE + +__device__ __constant__ extern unsigned long + kokkos_impl_cuda_constant_memory_buffer[]; + +#else + +__device__ __constant__ unsigned long kokkos_impl_cuda_constant_memory_buffer + [Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long)]; + +#endif + +template <typename T> +inline __device__ T* kokkos_impl_cuda_shared_memory() { + extern __shared__ Kokkos::CudaSpace::size_type sh[]; + return (T*)sh; +} + +namespace Kokkos { +namespace Impl { + 
+//---------------------------------------------------------------------------- +// See section B.17 of Cuda C Programming Guide Version 3.2 +// for discussion of +// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) +// function qualifier which could be used to improve performance. +//---------------------------------------------------------------------------- +// Maximize L1 cache and minimize shared memory: +// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 ); +// For 2.0 capability: 48 KB L1 and 16 KB shared +//---------------------------------------------------------------------------- + +template <class DriverType> +__global__ static void cuda_parallel_launch_constant_memory() { + const DriverType& driver = + *((const DriverType*)kokkos_impl_cuda_constant_memory_buffer); + + driver(); +} + +template <class DriverType, unsigned int maxTperB, unsigned int minBperSM> +__global__ __launch_bounds__( + maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() { + const DriverType& driver = + *((const DriverType*)kokkos_impl_cuda_constant_memory_buffer); + + driver(); +} + +template <class DriverType> +__global__ static void cuda_parallel_launch_local_memory( + const DriverType driver) { + driver(); +} + +template <class DriverType, unsigned int maxTperB, unsigned int minBperSM> +__global__ __launch_bounds__( + maxTperB, + minBperSM) static void cuda_parallel_launch_local_memory(const DriverType + driver) { + driver(); +} + +template <class DriverType> +__global__ static void cuda_parallel_launch_global_memory( + const DriverType* driver) { + driver->operator()(); +} + +template <class DriverType, unsigned int maxTperB, unsigned int minBperSM> +__global__ __launch_bounds__( + maxTperB, + minBperSM) static void cuda_parallel_launch_global_memory(const DriverType* + driver) { + driver->operator()(); +} + +//============================================================================== +// <editor-fold desc="Some helper functions 
for launch code readability"> {{{1 + +inline bool is_empty_launch(dim3 const& grid, dim3 const& block) { + return (grid.x == 0) || ((block.x * block.y * block.z) == 0); +} + +inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { + if (cuda_instance->m_maxShmemPerBlock < shmem) { + Kokkos::Impl::throw_runtime_exception( + std::string("CudaParallelLaunch (or graph node creation) FAILED: shared" + " memory request is too large")); + } +} + +// This function needs to be template on DriverType and LaunchBounds +// so that the static bool is unique for each type combo +// KernelFuncPtr does not necessarily contain that type information. +template <class DriverType, class LaunchBounds, class KernelFuncPtr> +inline void configure_shmem_preference(KernelFuncPtr const& func, + bool prefer_shmem) { +#ifndef KOKKOS_ARCH_KEPLER + // On Kepler the L1 has no benefit since it doesn't cache reads + auto set_cache_config = [&] { + CUDA_SAFE_CALL(cudaFuncSetCacheConfig( + func, + (prefer_shmem ? 
cudaFuncCachePreferShared : cudaFuncCachePreferL1))); + return prefer_shmem; + }; + static bool cache_config_preference_cached = set_cache_config(); + if (cache_config_preference_cached != prefer_shmem) { + cache_config_preference_cached = set_cache_config(); + } +#else + // Use the parameters so we don't get a warning + (void)func; + (void)prefer_shmem; +#endif +} + +template <class Policy> +std::enable_if_t<Policy::experimental_contains_desired_occupancy> +modify_launch_configuration_if_desired_occupancy_is_specified( + Policy const& policy, cudaDeviceProp const& properties, + cudaFuncAttributes const& attributes, dim3 const& block, int& shmem, + bool& prefer_shmem) { + int const block_size = block.x * block.y * block.z; + int const desired_occupancy = policy.impl_get_desired_occupancy().value(); + + size_t const shmem_per_sm_prefer_l1 = get_shmem_per_sm_prefer_l1(properties); + size_t const static_shmem = attributes.sharedSizeBytes; + + // round to nearest integer and avoid division by zero + int active_blocks = std::max( + 1, static_cast<int>(std::round( + static_cast<double>(properties.maxThreadsPerMultiProcessor) / + block_size * desired_occupancy / 100))); + int const dynamic_shmem = + shmem_per_sm_prefer_l1 / active_blocks - static_shmem; + + if (dynamic_shmem > shmem) { + shmem = dynamic_shmem; + prefer_shmem = false; + } +} + +template <class Policy> +std::enable_if_t<!Policy::experimental_contains_desired_occupancy> +modify_launch_configuration_if_desired_occupancy_is_specified( + Policy const&, cudaDeviceProp const&, cudaFuncAttributes const&, + dim3 const& /*block*/, int& /*shmem*/, bool& /*prefer_shmem*/) {} + +// </editor-fold> end Some helper functions for launch code readability }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="DeduceCudaLaunchMechanism"> {{{2 + +// Use local memory up to 
ConstantMemoryUseThreshold +// Use global memory above ConstantMemoryUsage +// In between use ConstantMemory + +template <class DriverType> +struct DeduceCudaLaunchMechanism { + constexpr static const Kokkos::Experimental::WorkItemProperty:: + HintLightWeight_t light_weight = + Kokkos::Experimental::WorkItemProperty::HintLightWeight; + constexpr static const Kokkos::Experimental::WorkItemProperty:: + HintHeavyWeight_t heavy_weight = + Kokkos::Experimental::WorkItemProperty::HintHeavyWeight; + constexpr static const typename DriverType::Policy::work_item_property + property = typename DriverType::Policy::work_item_property(); + + static constexpr const Experimental::CudaLaunchMechanism + valid_launch_mechanism = + // BuildValidMask + (sizeof(DriverType) < CudaTraits::KernelArgumentLimit + ? Experimental::CudaLaunchMechanism::LocalMemory + : Experimental::CudaLaunchMechanism::Default) | + (sizeof(DriverType) < CudaTraits::ConstantMemoryUsage + ? Experimental::CudaLaunchMechanism::ConstantMemory + : Experimental::CudaLaunchMechanism::Default) | + Experimental::CudaLaunchMechanism::GlobalMemory; + + static constexpr const Experimental::CudaLaunchMechanism + requested_launch_mechanism = + (((property & light_weight) == light_weight) + ? Experimental::CudaLaunchMechanism::LocalMemory + : Experimental::CudaLaunchMechanism::ConstantMemory) | + Experimental::CudaLaunchMechanism::GlobalMemory; + + static constexpr const Experimental::CudaLaunchMechanism + default_launch_mechanism = + // BuildValidMask + (sizeof(DriverType) < CudaTraits::ConstantMemoryUseThreshold) + ? Experimental::CudaLaunchMechanism::LocalMemory + : ((sizeof(DriverType) < CudaTraits::ConstantMemoryUsage) + ? 
Experimental::CudaLaunchMechanism::ConstantMemory + : Experimental::CudaLaunchMechanism::GlobalMemory); + + // None LightWeight HeavyWeight + // F<UseT LCG LCG L L LCG LG L L LCG CG L C + // UseT<F<KAL LCG LCG C C LCG LG C L LCG CG C C + // Kal<F<CMU CG LCG C C CG LG C G CG CG C C + // CMU<F G LCG G G G LG G G G CG G G + static constexpr const Experimental::CudaLaunchMechanism launch_mechanism = + ((property & light_weight) == light_weight) + ? (sizeof(DriverType) < CudaTraits::KernelArgumentLimit + ? Experimental::CudaLaunchMechanism::LocalMemory + : Experimental::CudaLaunchMechanism::GlobalMemory) + : (((property & heavy_weight) == heavy_weight) + ? (sizeof(DriverType) < CudaTraits::ConstantMemoryUsage + ? Experimental::CudaLaunchMechanism::ConstantMemory + : Experimental::CudaLaunchMechanism::GlobalMemory) + : (default_launch_mechanism)); +}; + +// </editor-fold> end DeduceCudaLaunchMechanism }}}2 +//============================================================================== + +//============================================================================== +// <editor-fold desc="CudaParallelLaunchKernelInvoker"> {{{1 + +// Base classes that summarize the differences between the different launch +// mechanisms + +template <class DriverType, class LaunchBounds, + Experimental::CudaLaunchMechanism LaunchMechanism> +struct CudaParallelLaunchKernelFunc; + +template <class DriverType, class LaunchBounds, + Experimental::CudaLaunchMechanism LaunchMechanism> +struct CudaParallelLaunchKernelInvoker; + +//------------------------------------------------------------------------------ +// <editor-fold desc="Local memory"> {{{2 + +template <class DriverType, unsigned int MaxThreadsPerBlock, + unsigned int MinBlocksPerSM> +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>, + Experimental::CudaLaunchMechanism::LocalMemory> { + static std::decay_t<decltype(cuda_parallel_launch_local_memory< + DriverType, 
MaxThreadsPerBlock, MinBlocksPerSM>)> + get_kernel_func() { + return cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock, + MinBlocksPerSM>; + } +}; + +template <class DriverType> +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<0, 0>, + Experimental::CudaLaunchMechanism::LocalMemory> { + static std::decay_t<decltype(cuda_parallel_launch_local_memory<DriverType>)> + get_kernel_func() { + return cuda_parallel_launch_local_memory<DriverType>; + } +}; + +//------------------------------------------------------------------------------ + +template <class DriverType, class LaunchBounds> +struct CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory> + : CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::LocalMemory> { + using base_t = CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::LocalMemory>; + static_assert(sizeof(DriverType) < CudaTraits::KernelArgumentLimit, + "Kokkos Error: Requested CudaLaunchLocalMemory with a Functor " + "larger than 4096 bytes."); + + static void invoke_kernel(DriverType const& driver, dim3 const& grid, + dim3 const& block, int shmem, + CudaInternal const* cuda_instance) { + (base_t:: + get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>( + driver); + } + +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, + CudaInternal const* cuda_instance, bool prefer_shmem) { + //---------------------------------------- + auto const& graph = Impl::get_cuda_graph_from_kernel(driver); + KOKKOS_EXPECTS(bool(graph)); + auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver); + // Expect node not yet initialized + KOKKOS_EXPECTS(!bool(graph_node)); + + if (!Impl::is_empty_launch(grid, block)) { + Impl::check_shmem_request(cuda_instance, shmem); + 
Impl::configure_shmem_preference<DriverType, LaunchBounds>( + base_t::get_kernel_func(), prefer_shmem); + + void const* args[] = {&driver}; + + cudaKernelNodeParams params = {}; + + params.blockDim = block; + params.gridDim = grid; + params.sharedMemBytes = shmem; + params.func = (void*)base_t::get_kernel_func(); + params.kernelParams = (void**)args; + params.extra = nullptr; + + CUDA_SAFE_CALL(cudaGraphAddKernelNode( + &graph_node, graph, /* dependencies = */ nullptr, + /* numDependencies = */ 0, ¶ms)); + } else { + // We still need an empty node for the dependency structure + CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + } + KOKKOS_ENSURES(bool(graph_node)) + } +#endif +}; + +// </editor-fold> end local memory }}}2 +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// <editor-fold desc="Global Memory"> {{{2 + +template <class DriverType, unsigned int MaxThreadsPerBlock, + unsigned int MinBlocksPerSM> +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>, + Experimental::CudaLaunchMechanism::GlobalMemory> { + static void* get_kernel_func() { + return cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock, + MinBlocksPerSM>; + } +}; + +template <class DriverType> +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<0, 0>, + Experimental::CudaLaunchMechanism::GlobalMemory> { + static std::decay_t<decltype(cuda_parallel_launch_global_memory<DriverType>)> + get_kernel_func() { + return cuda_parallel_launch_global_memory<DriverType>; + } +}; + +//------------------------------------------------------------------------------ + +template <class DriverType, class LaunchBounds> +struct CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::GlobalMemory> + : 
CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::GlobalMemory> { + using base_t = CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::GlobalMemory>; + + static void invoke_kernel(DriverType const& driver, dim3 const& grid, + dim3 const& block, int shmem, + CudaInternal const* cuda_instance) { + DriverType* driver_ptr = reinterpret_cast<DriverType*>( + cuda_instance->scratch_functor(sizeof(DriverType))); + + cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault, + cuda_instance->m_stream); + (base_t:: + get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>( + driver_ptr); + } + +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, + CudaInternal const* cuda_instance, bool prefer_shmem) { + //---------------------------------------- + auto const& graph = Impl::get_cuda_graph_from_kernel(driver); + KOKKOS_EXPECTS(bool(graph)); + auto& graph_node = Impl::get_cuda_graph_node_from_kernel(driver); + // Expect node not yet initialized + KOKKOS_EXPECTS(!bool(graph_node)); + + if (!Impl::is_empty_launch(grid, block)) { + Impl::check_shmem_request(cuda_instance, shmem); + Impl::configure_shmem_preference<DriverType, LaunchBounds>( + base_t::get_kernel_func(), prefer_shmem); + + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + + // Unlike in the non-graph case, we can get away with doing an async copy + // here because the `DriverType` instance is held in the GraphNodeImpl + // which is guaranteed to be alive until the graph instance itself is + // destroyed, where there should be a fence ensuring that the allocation + // associated with this kernel on the device side isn't deleted. 
+ cudaMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), + cudaMemcpyDefault, cuda_instance->m_stream); + + void const* args[] = {&driver_ptr}; + + cudaKernelNodeParams params = {}; + + params.blockDim = block; + params.gridDim = grid; + params.sharedMemBytes = shmem; + params.func = (void*)base_t::get_kernel_func(); + params.kernelParams = (void**)args; + params.extra = nullptr; + + CUDA_SAFE_CALL(cudaGraphAddKernelNode( + &graph_node, graph, /* dependencies = */ nullptr, + /* numDependencies = */ 0, ¶ms)); + } else { + // We still need an empty node for the dependency structure + CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); + } + KOKKOS_ENSURES(bool(graph_node)) + } +#endif +}; + +// </editor-fold> end Global Memory }}}2 +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// <editor-fold desc="Constant Memory"> {{{2 + +template <class DriverType, unsigned int MaxThreadsPerBlock, + unsigned int MinBlocksPerSM> +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>, + Experimental::CudaLaunchMechanism::ConstantMemory> { + static std::decay_t<decltype(cuda_parallel_launch_constant_memory< + DriverType, MaxThreadsPerBlock, MinBlocksPerSM>)> + get_kernel_func() { + return cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock, + MinBlocksPerSM>; + } +}; + +template <class DriverType> +struct CudaParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<0, 0>, + Experimental::CudaLaunchMechanism::ConstantMemory> { + static std::decay_t< + decltype(cuda_parallel_launch_constant_memory<DriverType>)> + get_kernel_func() { + return cuda_parallel_launch_constant_memory<DriverType>; + } +}; + +//------------------------------------------------------------------------------ + +template <class DriverType, class 
LaunchBounds> +struct CudaParallelLaunchKernelInvoker< + DriverType, LaunchBounds, Experimental::CudaLaunchMechanism::ConstantMemory> + : CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::ConstantMemory> { + using base_t = CudaParallelLaunchKernelFunc< + DriverType, LaunchBounds, + Experimental::CudaLaunchMechanism::ConstantMemory>; + static_assert(sizeof(DriverType) < CudaTraits::ConstantMemoryUsage, + "Kokkos Error: Requested CudaLaunchConstantMemory with a " + "Functor larger than 32kB."); + + static void invoke_kernel(DriverType const& driver, dim3 const& grid, + dim3 const& block, int shmem, + CudaInternal const* cuda_instance) { + // Wait until the previous kernel that uses the constant buffer is done + CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable)); + + // Copy functor (synchronously) to staging buffer in pinned host memory + unsigned long* staging = cuda_instance->constantMemHostStaging; + memcpy(staging, &driver, sizeof(DriverType)); + + // Copy functor asynchronously from there to constant memory on the device + cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging, + sizeof(DriverType), 0, cudaMemcpyHostToDevice, + cudaStream_t(cuda_instance->m_stream)); + + // Invoke the driver function on the device + (base_t:: + get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>(); + + // Record an event that says when the constant buffer can be reused + CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable, + cudaStream_t(cuda_instance->m_stream))); + } + +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + inline static void create_parallel_launch_graph_node( + DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, + CudaInternal const* cuda_instance, bool prefer_shmem) { + // Just use global memory; coordinating through events to share constant + // memory with the non-graph interface is not really reasonable since + // events don't work with 
Graphs directly, and this would anyway require
+      // a much more complicated structure that finds previous nodes in the
+      // dependency structure of the graph and creates an implicit dependence
+      // based on the need for constant memory (which we would then have to
+      // somehow go and prove was not creating a dependency cycle, and I don't
+      // even know if there's an efficient way to do that, let alone in the
+      // structure we currently have).
+      using global_launch_impl_t = CudaParallelLaunchKernelInvoker<
+          DriverType, LaunchBounds,
+          Experimental::CudaLaunchMechanism::GlobalMemory>;
+      global_launch_impl_t::create_parallel_launch_graph_node(
+          driver, grid, block, shmem, cuda_instance, prefer_shmem);
+    }
+#endif
+};
+
+// </editor-fold> end Constant Memory }}}2
+//------------------------------------------------------------------------------
+
+// </editor-fold> end CudaParallelLaunchKernelInvoker }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="CudaParallelLaunchImpl"> {{{1
+
+template <class DriverType, class LaunchBounds,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunchImpl;
+
+template <class DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunchImpl<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    LaunchMechanism>
+    : CudaParallelLaunchKernelInvoker<
+          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+          LaunchMechanism> {
+  using base_t = CudaParallelLaunchKernelInvoker<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      LaunchMechanism>;
+
+  // Configure shared-memory/cache preferences and launch `driver` on the
+  // device. No-op for an empty launch (zero-sized grid or block).
+  inline static void launch_kernel(const DriverType& driver, const dim3& grid,
+                                   const dim3& block, int shmem,
+                                   const CudaInternal* cuda_instance,
+                                   bool prefer_shmem) {
+    if (!Impl::is_empty_launch(grid, block)) {
+      // Prevent multiple threads from simultaneously setting the cache
+      // configuration preference and launching the same kernel
+      static std::mutex mutex;
+      std::lock_guard<std::mutex> lock(mutex);
+
+      Impl::check_shmem_request(cuda_instance, shmem);
+
+      // If a desired occupancy is specified, we compute how much shared memory
+      // to ask for to achieve that occupancy, assuming that the cache
+      // configuration is `cudaFuncCachePreferL1`. If the amount of dynamic
+      // shared memory computed is actually smaller than `shmem` we overwrite
+      // `shmem` and set `prefer_shmem` to `false`.
+      modify_launch_configuration_if_desired_occupancy_is_specified(
+          driver.get_policy(), cuda_instance->m_deviceProp,
+          get_cuda_func_attributes(), block, shmem, prefer_shmem);
+
+      Impl::configure_shmem_preference<
+          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>(
+          base_t::get_kernel_func(), prefer_shmem);
+
+      // Non-RDC builds must copy the host lock arrays into this translation
+      // unit's device-side instance before any kernel can use them.
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+
+      // Invoke the driver function on the device
+      base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
+
+#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
+      CUDA_SAFE_CALL(cudaGetLastError());
+      cuda_instance->fence();
+#endif
+    }
+  }
+
+  static cudaFuncAttributes get_cuda_func_attributes() {
+    // A race condition inside cudaFuncGetAttributes, when the same address is
+    // given, requires using a local variable as input instead of a static.
+    // Rely on static-variable initialization to make sure only one thread
+    // executes the code and the result is visible.
+    auto wrap_get_attributes = []() -> cudaFuncAttributes {
+      cudaFuncAttributes attr_tmp;
+      CUDA_SAFE_CALL(
+          cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
+      return attr_tmp;
+    };
+    static cudaFuncAttributes attr = wrap_get_attributes();
+    return attr;
+  }
+};
+
+// </editor-fold> end CudaParallelLaunchImpl }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="CudaParallelLaunch"> {{{1
+
+// DoGraph defaults to the policy's is_graph_kernel flag; when
+// KOKKOS_CUDA_ENABLE_GRAPHS is not defined the trailing `&& false` forces
+// selection of the non-graph specialization below.
+template <class DriverType, class LaunchBounds = Kokkos::LaunchBounds<>,
+          Experimental::CudaLaunchMechanism LaunchMechanism =
+              DeduceCudaLaunchMechanism<DriverType>::launch_mechanism,
+          bool DoGraph = DriverType::Policy::is_graph_kernel::value
+#ifndef KOKKOS_CUDA_ENABLE_GRAPHS
+                         && false
+#endif
+          >
+struct CudaParallelLaunch;
+
+// General launch mechanism
+template <class DriverType, class LaunchBounds,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
+                          /* DoGraph = */ false>
+    : CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
+  using base_t =
+      CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
+  template <class... Args>
+  CudaParallelLaunch(Args&&... args) {
+    base_t::launch_kernel((Args &&) args...);
+  }
+};
+
+#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
+// Launch mechanism for creating graph nodes
+template <class DriverType, class LaunchBounds,
+          Experimental::CudaLaunchMechanism LaunchMechanism>
+struct CudaParallelLaunch<DriverType, LaunchBounds, LaunchMechanism,
+                          /* DoGraph = */ true>
+    : CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism> {
+  using base_t =
+      CudaParallelLaunchImpl<DriverType, LaunchBounds, LaunchMechanism>;
+  template <class... Args>
+  CudaParallelLaunch(Args&&... args) {
+    base_t::create_parallel_launch_graph_node((Args &&) args...);
+  }
+};
+#endif
+
+// </editor-fold> end CudaParallelLaunch }}}1
+//==============================================================================
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* defined( KOKKOS_ENABLE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff31649544033b773519152ca25a22494fdd2f5f
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+// With relocatable device code the single program-wide device-side instance
+// of the lock arrays is defined here (declared extern in the header).
+namespace Kokkos {
+namespace Impl {
+__device__ __constant__ CudaLockArrays g_device_cuda_lock_arrays = {nullptr,
+                                                                    nullptr, 0};
+}
+}  // namespace Kokkos
+#endif
+
+namespace Kokkos {
+
+namespace {
+
+// Zero-initializes the per-hash atomic lock slots, one thread per slot;
+// launched with enough threads to cover CUDA_SPACE_ATOMIC_MASK + 1 entries.
+__global__ void init_lock_array_kernel_atomic() {
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < CUDA_SPACE_ATOMIC_MASK + 1) {
+    Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
+  }
+}
+
+// Zero-initializes the first N entries of the scratch lock array.
+__global__ void init_lock_array_kernel_threadid(int N) {
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < (unsigned)N) {
+    Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0;
+  }
+}
+
+}  // namespace
+
+namespace Impl {
+
+CudaLockArrays g_host_cuda_lock_arrays = {nullptr, nullptr, 0};
+
+// Idempotent: a non-null atomic pointer means initialization already ran.
+// Allocates both device arrays, publishes the pointers to the device-side
+// global via KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE, then zero-fills them.
+void initialize_host_cuda_lock_arrays() {
+  if (g_host_cuda_lock_arrays.atomic != nullptr) return;
+  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
+                            sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
+  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
+                            sizeof(int) * (Cuda::concurrency())));
+  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  g_host_cuda_lock_arrays.n = Cuda::concurrency();
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
+                                  256>>>();
+  init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency() + 255) / 256,
+                                    256>>>(Kokkos::Cuda::concurrency());
+  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+}
+
+// Idempotent: a null atomic pointer means the arrays were already released.
+void finalize_host_cuda_lock_arrays() {
+  if (g_host_cuda_lock_arrays.atomic == nullptr) return;
+  // NOTE(review): cudaFree results are not checked here, unlike the
+  // CUDA_SAFE_CALL'd allocations above — presumably deliberate best-effort
+  // cleanup during finalization; confirm.
+  cudaFree(g_host_cuda_lock_arrays.atomic);
+  g_host_cuda_lock_arrays.atomic = nullptr;
+  cudaFree(g_host_cuda_lock_arrays.scratch);
+  g_host_cuda_lock_arrays.scratch = nullptr;
+  g_host_cuda_lock_arrays.n      = 0;
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+}  // namespace Impl
+
+}  // namespace Kokkos
+
+#else
+
+void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
+
+#endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7640b8084d16a210408deb94a35f8962dfc92c99
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -0,0 +1,176 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1.
Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_LOCKS_HPP
+#define KOKKOS_CUDA_LOCKS_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <cstdint>
+
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+// Pointers to the two device-resident lock arrays plus the scratch count.
+struct CudaLockArrays {
+  std::int32_t* atomic;
+  std::int32_t* scratch;
+  std::int32_t n;
+};
+
+/// \brief This global variable in Host space is the central definition
+/// of these arrays.
+extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays;
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+/// valid, initialized arrays.
+///
+/// This call is idempotent.
+void initialize_host_cuda_lock_arrays();
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+/// all null pointers, and all array memory has been freed.
+///
+/// This call is idempotent.
+void finalize_host_cuda_lock_arrays();
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+/// \brief This global variable in CUDA space is what kernels use
+/// to get access to the lock arrays.
+///
+/// When relocatable device code is enabled, there can be one single
+/// instance of this global variable for the entire executable,
+/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
+/// here must then be extern).
+/// This one instance will be initialized by initialize_host_cuda_lock_arrays
+/// and need not be modified afterwards.
+///
+/// When relocatable device code is disabled, an instance of this variable
+/// will be created in every translation unit that sees this header file
+/// (we make this clear by marking it static, meaning no other translation
+/// unit can link to it).
+/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
+/// instances in other translation units, we must update this CUDA global
+/// variable based on the Host global variable prior to running any kernels
+/// that will use it.
+/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+__device__
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+    __constant__ extern
+#endif
+    Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays;
+
+// 17 set bits: the lock hash table has 0x1FFFF + 1 == 131072 slots.
+#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline bool lock_address_cuda_space(void* ptr) {
+  // Hash: drop the low 2 address bits, then mask into the table.
+  size_t offset = size_t(ptr);
+  offset        = offset >> 2;
+  offset        = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (
+      0 ==
+      atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1));
+}
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+__device__ inline void unlock_address_cuda_space(void* ptr) {
+  size_t offset = size_t(ptr);
+  offset        = offset >> 2;
+  offset        = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0);
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+// Make lock_array_copied an explicit translation-unit-scope entity
+namespace Kokkos {
+namespace Impl {
+namespace {
+static int lock_array_copied = 0;
+inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
+}  // namespace
+}  // namespace Impl
+}  // namespace Kokkos
+/* Dan Ibanez: it is critical that this code be a macro, so that it will
+   capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
+   putting this in an inline function will NOT do the right thing! */
+#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                      \
+  {                                                                   \
+    if (::Kokkos::Impl::lock_array_copied == 0) {                     \
+      CUDA_SAFE_CALL(                                                 \
+          cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
+                             &Kokkos::Impl::g_host_cuda_lock_arrays,  \
+                             sizeof(Kokkos::Impl::CudaLockArrays)));  \
+    }                                                                 \
+    lock_array_copied = 1;                                            \
+  }
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+#endif
+
+#endif /* defined( KOKKOS_ENABLE_CUDA ) */
+
+#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..12b7f70a97495fca628580dda12b115cb5c25a12
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp
@@ -0,0 +1,37 @@
+#ifndef KOKKOS_CUDA_MDRANGEPOLICY_HPP_
+#define KOKKOS_CUDA_MDRANGEPOLICY_HPP_
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+namespace Kokkos {
+
+// Cuda iterates MDRange tiles left-to-right in both outer and inner
+// directions.
+template <>
+struct default_outer_direction<Kokkos::Cuda> {
+  using type                     = Iterate;
+  static constexpr Iterate value = Iterate::Left;
+};
+
+template <>
+struct default_inner_direction<Kokkos::Cuda> {
+  using type                     = Iterate;
+  static constexpr Iterate value = Iterate::Left;
+};
+
+namespace Impl {
+
+// Settings for MDRangePolicy
+template <>
+inline TileSizeProperties get_tile_size_properties<Kokkos::Cuda>(
+    const Kokkos::Cuda& space) {
+  TileSizeProperties properties;
+  properties.max_threads =
+      space.impl_internal_space_instance()->m_maxThreadsPerSM;
+  properties.default_largest_tile_size = 16;
+  properties.default_tile_size         = 2;
+  properties.max_total_tile_size       = 512;
+  return properties;
+}
+
+}  // Namespace Impl
+}  // Namespace Kokkos
+#endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2834e6f3de012b718ae06ebb6f87d7d24e3e5756
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -0,0 +1,2877 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_PARALLEL_HPP
+#define KOKKOS_CUDA_PARALLEL_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_CUDA)
+
+#include <algorithm>
+#include <string>
+#include <cstdio>
+#include <cstdint>
+
+#include <utility>
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <Cuda/Kokkos_Cuda_Team.hpp>
+#include <Kokkos_Vectorization.hpp>
+#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+
+#include <impl/Kokkos_Tools.hpp>
+#include <typeinfo>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+#include <impl/KokkosExp_IterateTileGPU.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+extern bool show_warnings() noexcept;
+
+namespace Impl {
+
+// Cuda specialization of TeamPolicy: maps teams onto CUDA blocks, the vector
+// dimension onto threadIdx.x, and exposes team-size / scratch-size queries
+// backed by the CUDA occupancy helpers.
+template <class... Properties>
+class TeamPolicyInternal<Kokkos::Cuda, Properties...>
+    : public PolicyTraits<Properties...> {
+ public:
+  //! Tag this class as a kokkos execution policy
+  using execution_policy = TeamPolicyInternal;
+
+  using traits = PolicyTraits<Properties...>;
+
+  template <class ExecSpace, class... OtherProperties>
+  friend class TeamPolicyInternal;
+
+ private:
+  enum { MAX_WARP = 8 };
+
+  typename traits::execution_space m_space;
+  int m_league_size;
+  int m_team_size;      // -1 means "auto" (see m_tune_team)
+  int m_vector_length;
+  int m_team_scratch_size[2];    // per-team scratch, levels 0 and 1
+  int m_thread_scratch_size[2];  // per-thread scratch, levels 0 and 1
+  int m_chunk_size;
+  bool m_tune_team;    // true when the team size should be auto-tuned
+  bool m_tune_vector;  // true when the vector length should be auto-tuned
+
+ public:
+  //! Execution space of this execution policy
+  using execution_space = Kokkos::Cuda;
+
+  // Converting copy constructor between policies with different property
+  // packs; copies every member field.
+  template <class... OtherProperties>
+  TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p) {
+    m_league_size            = p.m_league_size;
+    m_team_size              = p.m_team_size;
+    m_vector_length          = p.m_vector_length;
+    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
+    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size             = p.m_chunk_size;
+    m_space                  = p.m_space;
+    m_tune_team              = p.m_tune_team;
+    m_tune_vector            = p.m_tune_vector;
+  }
+
+  //----------------------------------------
+
+  template <class FunctorType>
+  int team_size_max(const FunctorType& f, const ParallelForTag&) const {
+    using closure_type =
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
+            get_cuda_func_attributes();
+    int block_size =
+        Kokkos::Impl::cuda_get_max_block_size<FunctorType,
+                                              typename traits::launch_bounds>(
+            space().impl_internal_space_instance(), attr, f,
+            (size_t)impl_vector_length(),
+            (size_t)team_scratch_size(0) + 2 * sizeof(double),
+            (size_t)thread_scratch_size(0) + sizeof(double));
+    // A block holds team_size * vector_length threads.
+    return block_size / impl_vector_length();
+  }
+
+  template <class FunctorType>
+  inline int team_size_max(const FunctorType& f,
+                           const ParallelReduceTag&) const {
+    using functor_analysis_type =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                              TeamPolicyInternal, FunctorType>;
+    using reducer_type = typename Impl::ParallelReduceReturnValue<
+        void, typename functor_analysis_type::value_type,
+        FunctorType>::reducer_type;
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             reducer_type>;
+    return internal_team_size_max<closure_type>(f);
+  }
+
+  template <class FunctorType, class ReducerType>
+  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
+                           const ParallelReduceTag&) const {
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             ReducerType>;
+    return internal_team_size_max<closure_type>(f);
+  }
+
+  template <class FunctorType>
+  int team_size_recommended(const FunctorType& f, const ParallelForTag&) const {
+    using closure_type =
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
+            get_cuda_func_attributes();
+    const int block_size =
+        Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
+                                              typename traits::launch_bounds>(
+            space().impl_internal_space_instance(), attr, f,
+            (size_t)impl_vector_length(),
+            (size_t)team_scratch_size(0) + 2 * sizeof(double),
+            (size_t)thread_scratch_size(0) + sizeof(double));
+    return block_size / impl_vector_length();
+  }
+
+  template <class FunctorType>
+  inline int team_size_recommended(const FunctorType& f,
+                                   const ParallelReduceTag&) const {
+    using functor_analysis_type =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                              TeamPolicyInternal, FunctorType>;
+    using reducer_type = typename Impl::ParallelReduceReturnValue<
+        void, typename functor_analysis_type::value_type,
+        FunctorType>::reducer_type;
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             reducer_type>;
+    return internal_team_size_recommended<closure_type>(f);
+  }
+
+  template <class FunctorType, class ReducerType>
+  int team_size_recommended(const FunctorType& f, const ReducerType&,
+                            const ParallelReduceTag&) const {
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             ReducerType>;
+    return internal_team_size_recommended<closure_type>(f);
+  }
+
+  inline static int vector_length_max() { return Impl::CudaTraits::WarpSize; }
+
+  // Clamps the request to vector_length_max() and rounds a non-power-of-two
+  // request down to the next lower power of two (the loop caps at 2^5).
+  inline static int verify_requested_vector_length(
+      int requested_vector_length) {
+    int test_vector_length =
+        std::min(requested_vector_length, vector_length_max());
+
+    // Allow only power-of-two vector_length
+    if (!(is_integral_power_of_two(test_vector_length))) {
+      int test_pow2 = 1;
+      for (int i = 0; i < 5; i++) {
+        test_pow2 = test_pow2 << 1;
+        if (test_pow2 > test_vector_length) {
+          break;
+        }
+      }
+      test_vector_length = test_pow2 >> 1;
+    }
+
+    return test_vector_length;
+  }
+
+  inline static int scratch_size_max(int level) {
+    return (
+        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
+                                  // for team_member.reduce etc.
+            20 * 1024 *
+                1024);  // arbitrarily setting this to 20MB, for a Volta V100
+                        // that would give us about 3.2GB for 2 teams per SM
+  }
+
+  //----------------------------------------
+
+  KOKKOS_DEPRECATED inline int vector_length() const {
+    return impl_vector_length();
+  }
+  inline int impl_vector_length() const { return m_vector_length; }
+  inline int team_size() const { return m_team_size; }
+  inline int league_size() const { return m_league_size; }
+  inline bool impl_auto_team_size() const { return m_tune_team; }
+  inline bool impl_auto_vector_length() const { return m_tune_vector; }
+  inline void impl_set_team_size(size_t team_size) { m_team_size = team_size; }
+  inline void impl_set_vector_length(size_t vector_length) {
+    m_vector_length = vector_length;
+  }
+  // Total scratch at `level` for a team of `team_size_` threads; a negative
+  // team size means "use the policy's current team size".
+  inline int scratch_size(int level, int team_size_ = -1) const {
+    if (team_size_ < 0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] +
+           team_size_ * m_thread_scratch_size[level];
+  }
+  inline int team_scratch_size(int level) const {
+    return m_team_scratch_size[level];
+  }
+  inline int thread_scratch_size(int level) const {
+    return m_thread_scratch_size[level];
+  }
+
+  const typename traits::execution_space& space() const { return m_space; }
+
+  TeamPolicyInternal()
+      : m_space(typename traits::execution_space()),
+        m_league_size(0),
+        m_team_size(-1),
+        m_vector_length(0),
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(Impl::CudaTraits::WarpSize),
+        m_tune_team(false),
+        m_tune_vector(false) {}
+
+  /** \brief Specify league size, specify team size, specify vector length */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     int team_size_request, int vector_length_request = 1)
+      : m_space(space_),
+        m_league_size(league_size_),
+        m_team_size(team_size_request),
+        m_vector_length(
+            (vector_length_request > 0)
+                ? verify_requested_vector_length(vector_length_request)
+                : verify_requested_vector_length(1)),
+        m_team_scratch_size{0, 0},
+        m_thread_scratch_size{0, 0},
+        m_chunk_size(Impl::CudaTraits::WarpSize),
+        m_tune_team(bool(team_size_request <= 0)),
+        m_tune_vector(bool(vector_length_request <= 0)) {
+    // Make sure league size is permissible
+    // NOTE(review): `>=` rejects a league_size exactly equal to the maximum
+    // grid count — confirm this boundary is intended rather than `>`.
+    if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
+      Impl::throw_runtime_exception(
+          "Requested too large league_size for TeamPolicy on Cuda execution "
+          "space.");
+
+    // Make sure total block size is permissible
+    if (m_team_size * m_vector_length >
+        int(Impl::CudaTraits::MaxHierarchicalParallelism)) {
+      Impl::throw_runtime_exception(
+          std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. "
+                      "Team size x vector length must be smaller than 1024."));
+    }
+  }
+
+  /** \brief Specify league size, request team size, specify vector length */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */
+                     ,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
+
+  /** \brief Specify league size, request team size and vector length */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     const Kokkos::AUTO_t& /* team_size_request */,
+                     const Kokkos::AUTO_t& /* vector_length_request */
+                     )
+      : TeamPolicyInternal(space_, league_size_, -1, -1) {}
+
+  /** \brief Specify league size, specify team size, request vector length */
+  TeamPolicyInternal(const execution_space space_, int league_size_,
+                     int team_size_request, const Kokkos::AUTO_t&)
+      : TeamPolicyInternal(space_, league_size_, team_size_request, -1) {}
+
+  TeamPolicyInternal(int league_size_, int team_size_request,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request) {}
+
+  TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request,
+                     int vector_length_request = 1)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request)
+
+  {}
+
+  /** \brief Specify league size, request team size */
+  TeamPolicyInternal(int league_size_, const Kokkos::AUTO_t& team_size_request,
+                     const Kokkos::AUTO_t& vector_length_request)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request) {}
+
+  /** \brief Specify league size, request team size */
+  TeamPolicyInternal(int league_size_, int team_size_request,
+                     const Kokkos::AUTO_t& vector_length_request)
+      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
+                           team_size_request, vector_length_request) {}
+
+  inline int chunk_size() const { return m_chunk_size; }
+
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal& set_chunk_size(
+      typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch
+   * hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(const int& level,
+                                              const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  }
+
+  /** \brief set per thread scratch size for a specific level of the scratch
+   * hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(
+      const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  /** \brief set per thread and per team scratch size for a specific level of
+   * the scratch hierarchy */
+  inline TeamPolicyInternal& set_scratch_size(
+      const int& level, const PerTeamValue& per_team,
+      const PerThreadValue& per_thread) {
+    m_team_scratch_size[level]   = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  }
+
+  using member_type = Kokkos::Impl::CudaTeamMember;
+
+ protected:
+  // Shared implementation for team_size_max / team_size_recommended:
+  // `block_size_callable` is one of the cuda_get_{max,opt}_block_size helpers.
+  template <class ClosureType, class FunctorType, class BlockSizeCallable>
+  int internal_team_size_common(const FunctorType& f,
+                                BlockSizeCallable&& block_size_callable) const {
+    using closure_type = ClosureType;
+    using functor_value_traits =
+        Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;
+
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
+            get_cuda_func_attributes();
+    const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
+        space().impl_internal_space_instance(), attr, f,
+        (size_t)impl_vector_length(),
+        (size_t)team_scratch_size(0) + 2 * sizeof(double),
+        (size_t)thread_scratch_size(0) + sizeof(double) +
+            ((functor_value_traits::StaticValueSize != 0)
+                 ? 0
+                 : functor_value_traits::value_size(f)));
+    KOKKOS_ASSERT(block_size > 0);
+
+    // Currently we require Power-of-2 team size for reductions.
+    int p2 = 1;
+    while (p2 <= block_size) p2 *= 2;
+    p2 /= 2;
+    return p2 / impl_vector_length();
+  }
+
+  template <class ClosureType, class FunctorType>
+  int internal_team_size_max(const FunctorType& f) const {
+    return internal_team_size_common<ClosureType>(
+        f,
+        Kokkos::Impl::cuda_get_max_block_size<FunctorType,
+                                              typename traits::launch_bounds>);
+  }
+
+  template <class ClosureType, class FunctorType>
+  int internal_team_size_recommended(const FunctorType& f) const {
+    return internal_team_size_common<ClosureType>(
+        f,
+        Kokkos::Impl::cuda_get_opt_block_size<FunctorType,
+                                              typename traits::launch_bounds>);
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template <class FunctorType, class...
Traits>
+class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
+ public:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+ private:
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+
+  ParallelFor()                   = delete;
+  ParallelFor& operator=(const ParallelFor&) = delete;
+
+  // SFINAE pair: call the functor without the tag when WorkTag is void,
+  // with a default-constructed tag otherwise.
+  template <class TagType>
+  inline __device__
+      typename std::enable_if<std::is_same<TagType, void>::value>::type
+      exec_range(const Member i) const {
+    m_functor(i);
+  }
+
+  template <class TagType>
+  inline __device__
+      typename std::enable_if<!std::is_same<TagType, void>::value>::type
+      exec_range(const Member i) const {
+    m_functor(TagType(), i);
+  }
+
+ public:
+  using functor_type = FunctorType;
+
+  Policy const& get_policy() const { return m_policy; }
+
+  // Device-side entry point. Each thread strides through the range by
+  // (blockDim.y * gridDim.x); the update expression clamps to work_end so
+  // iwork never steps past the end of the range.
+  inline __device__ void operator()() const {
+    const Member work_stride = blockDim.y * gridDim.x;
+    const Member work_end    = m_policy.end();
+
+    for (Member iwork =
+             m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x;
+         iwork < work_end;
+         iwork = iwork < work_end - work_stride ? iwork + work_stride
+                                                : work_end) {
+      this->template exec_range<WorkTag>(iwork);
+    }
+  }
+
+  // Host-side launch: query an optimal block size for this kernel/functor,
+  // shape the block as 1 x block_size x 1, cap the grid at the device's
+  // maximum grid count, then launch via CudaParallelLaunch.
+  inline void execute() const {
+    const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
+
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelFor,
+                           LaunchBounds>::get_cuda_func_attributes();
+    const int block_size =
+        Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
+            m_policy.space().impl_internal_space_instance(), attr, m_functor, 1,
+            0, 0);
+    KOKKOS_ASSERT(block_size > 0);
+    dim3 block(1, block_size, 1);
+    dim3 grid(
+        std::min(
+            typename Policy::index_type((nwork + block.y - 1) / block.y),
+            typename Policy::index_type(cuda_internal_maximum_grid_count())),
+        1, 1);
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+    if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
+      block = dim3(1, 1, 1);
+      grid  = dim3(1, 1, 1);
+    }
+#endif
+
+    CudaParallelLaunch<ParallelFor, LaunchBounds>(
+        *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
+        false);
+  }
+
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor), m_policy(arg_policy) {}
+};
+
+// MDRangePolicy impl
+template <class FunctorType, class...
Traits>
+// ParallelFor specialization for Kokkos::MDRangePolicy on Cuda.  Tile
+// dimensions map onto blockDim.{x,y,z} (adjacent dimensions are fused for
+// rank > 3) and DeviceIterateTile walks the tiles on the device.
+class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
+ public:
+  using Policy = Kokkos::MDRangePolicy<Traits...>;
+  using functor_type = FunctorType;
+
+ private:
+  using RP = Policy;
+  using array_index_type = typename Policy::array_index_type;
+  using index_type = typename Policy::index_type;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  const FunctorType m_functor;
+  const Policy m_rp;
+
+ public:
+  // Upper bound on the product of tile dimensions, limited by the register
+  // budget per SM and MaxHierarchicalParallelism.
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& pol, const Functor&) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelFor,
+                           LaunchBounds>::get_cuda_func_attributes();
+    auto const& prop = pol.space().cuda_device_prop();
+    // Limits due to registers/SM, MDRange doesn't have
+    // shared memory constraints
+    int const regs_per_sm = prop.regsPerMultiprocessor;
+    int const regs_per_thread = attr.numRegs;
+    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
+    return std::min(
+        max_threads_per_sm,
+        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+  }
+  Policy const& get_policy() const { return m_rp; }
+  inline __device__ void operator()() const {
+    Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType,
+                                    typename Policy::work_tag>(m_rp, m_functor)
+        .exec_range();
+  }
+
+  // Host-side launch: one case per supported rank (2-6); every grid extent
+  // is clamped to m_maxBlock.
+  inline void execute() const {
+    using namespace std;
+
+    if (m_rp.m_num_tiles == 0) return;
+    const array_index_type maxblocks = static_cast<array_index_type>(
+        m_rp.space().impl_internal_space_instance()->m_maxBlock);
+    if (RP::rank == 2) {
+      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
+      KOKKOS_ASSERT(block.x > 0);
+      KOKKOS_ASSERT(block.y > 0);
+      const dim3 grid(
+          min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
+              maxblocks),
+          min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
+              maxblocks),
+          1);
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else if (RP::rank == 3) {
+      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
+      KOKKOS_ASSERT(block.x > 0);
+      KOKKOS_ASSERT(block.y > 0);
+      KOKKOS_ASSERT(block.z > 0);
+      const dim3 grid(
+          min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
+              maxblocks),
+          min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
+              maxblocks),
+          min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
+              maxblocks));
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else if (RP::rank == 4) {
+      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
+      // threadIdx.z
+      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2],
+                       m_rp.m_tile[3]);
+      KOKKOS_ASSERT(block.y > 0);
+      KOKKOS_ASSERT(block.z > 0);
+      const dim3 grid(
+          min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
+              static_cast<index_type>(maxblocks)),
+          min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y,
+              maxblocks),
+          min((m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
+              maxblocks));
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else if (RP::rank == 5) {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to
+      // threadIdx.z
+      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
+                       m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]);
+      KOKKOS_ASSERT(block.z > 0);
+      const dim3 grid(
+          min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
+              static_cast<index_type>(maxblocks)),
+          min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
+              static_cast<index_type>(maxblocks)),
+          min((m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
+              maxblocks));
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else if (RP::rank == 6) {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to
+      // threadIdx.z
+      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
+                       m_rp.m_tile[2] * m_rp.m_tile[3],
+                       m_rp.m_tile[4] * m_rp.m_tile[5]);
+      const dim3 grid(
+          min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
+              static_cast<index_type>(maxblocks)),
+          min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
+              static_cast<index_type>(maxblocks)),
+          min(static_cast<index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5]),
+              static_cast<index_type>(maxblocks)));
+      CudaParallelLaunch<ParallelFor, LaunchBounds>(
+          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
+          false);
+    } else {
+      Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
+    }
+
+  }  // end execute
+
+  // inline
+  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
+      : m_functor(arg_functor), m_rp(arg_policy) {}
+};
+
+template <class FunctorType, class... 
Properties>
+// ParallelFor specialization for Kokkos::TeamPolicy on Cuda.  The launch
+// shape is (vector_length, team_size, 1) and blocks cycle through the
+// league via gridDim.x.
+class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                  Kokkos::Cuda> {
+ public:
+  using Policy = TeamPolicy<Properties...>;
+
+ private:
+  using Member = typename Policy::member_type;
+  using WorkTag = typename Policy::work_tag;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+ public:
+  using functor_type = FunctorType;
+  using size_type = Cuda::size_type;
+
+ private:
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
+  // blockDim.z == 1 shared memory utilization:
+  //
+  //  [ team reduce space ]
+  //  [ team shared space ]
+  //
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const size_type m_league_size;
+  int m_team_size;
+  const size_type m_vector_size;
+  int m_shmem_begin;
+  int m_shmem_size;
+  void* m_scratch_ptr[2];
+  int m_scratch_size[2];
+
+  // Untagged dispatch: functor(member).
+  template <class TagType>
+  __device__ inline
+      typename std::enable_if<std::is_same<TagType, void>::value>::type
+      exec_team(const Member& member) const {
+    m_functor(member);
+  }
+
+  // Tagged dispatch: functor(TagType(), member).
+  template <class TagType>
+  __device__ inline
+      typename std::enable_if<!std::is_same<TagType, void>::value>::type
+      exec_team(const Member& member) const {
+    m_functor(TagType(), member);
+  }
+
+ public:
+  Policy const& get_policy() const { return m_policy; }
+
+  __device__ inline void operator()() const {
+    // Iterate this block through the league
+    int64_t threadid = 0;
+    if (m_scratch_size[1] > 0) {
+      // Level-1 scratch is shared among in-flight blocks: thread (0,0)
+      // claims a slot in the global lock array via atomicCAS (wrapping
+      // around on collision) before the team uses its scratch chunk.
+      __shared__ int64_t base_thread_id;
+      if (threadIdx.x == 0 && threadIdx.y == 0) {
+        threadid = (blockIdx.x * blockDim.z + threadIdx.z) %
+                   (Kokkos::Impl::g_device_cuda_lock_arrays.n /
+                    (blockDim.x * blockDim.y));
+        threadid *= blockDim.x * blockDim.y;
+        int done = 0;
+        while (!done) {
+          done =
+              (0 ==
+               atomicCAS(
+                   &Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],
+                   0, 1));
+          if (!done) {
+            threadid += blockDim.x * blockDim.y;
+            if (int64_t(threadid + blockDim.x * blockDim.y) >=
+                int64_t(Kokkos::Impl::g_device_cuda_lock_arrays.n))
+              threadid = 0;
+          }
+        }
+        base_thread_id = threadid;
+      }
+      __syncthreads();
+      threadid = base_thread_id;
+    }
+
+    const int int_league_size = (int)m_league_size;
+    for (int league_rank = blockIdx.x; league_rank < int_league_size;
+         league_rank += gridDim.x) {
+      this->template exec_team<WorkTag>(typename Policy::member_type(
+          kokkos_impl_cuda_shared_memory<void>(), m_shmem_begin, m_shmem_size,
+          (void*)(((char*)m_scratch_ptr[1]) +
+                  ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
+                      m_scratch_size[1]),
+          m_scratch_size[1], league_rank, m_league_size));
+    }
+    if (m_scratch_size[1] > 0) {
+      // Release the scratch slot so other blocks can claim it.
+      __syncthreads();
+      if (threadIdx.x == 0 && threadIdx.y == 0)
+        Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid] = 0;
+    }
+  }
+
+  inline void execute() const {
+    const int64_t shmem_size_total = m_shmem_begin + m_shmem_size;
+    dim3 grid(int(m_league_size), 1, 1);
+    const dim3 block(int(m_vector_size), int(m_team_size), 1);
+
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+    if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) {
+      grid = dim3(1, 1, 1);
+    }
+#endif
+
+    CudaParallelLaunch<ParallelFor, LaunchBounds>(
+        *this, grid, block, shmem_size_total,
+        m_policy.space().impl_internal_space_instance(),
+        true);  // copy to device and execute
+  }
+
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_league_size(arg_policy.league_size()),
+        m_team_size(arg_policy.team_size()),
+        m_vector_size(arg_policy.impl_vector_length()) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelFor,
+                           LaunchBounds>::get_cuda_func_attributes();
+    // A negative team size means AUTO: derive it from the opt block size
+    // helper divided by the vector length.
+    m_team_size =
+        m_team_size >= 0
+            ? m_team_size
+            : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
+                  m_policy.space().impl_internal_space_instance(), attr,
+                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
+                  m_policy.thread_scratch_size(0)) /
+                  m_vector_size;
+
+    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
+    m_shmem_size =
+        (m_policy.scratch_size(0, m_team_size) +
+         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
+    m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
+    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend
+    // upon team size.
+    m_scratch_ptr[0] = nullptr;
+    m_scratch_ptr[1] =
+        m_team_size <= 0
+            ? nullptr
+            : m_policy.space()
+                  .impl_internal_space_instance()
+                  ->resize_team_scratch_space(
+                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
+                      static_cast<ptrdiff_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size)));
+
+    const int shmem_size_total = m_shmem_begin + m_shmem_size;
+    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
+        shmem_size_total) {
+      printf(
+          "%i %i\n",
+          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
+          shmem_size_total);
+      Kokkos::Impl::throw_runtime_exception(std::string(
+          "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
+    }
+
+    if (int(m_team_size) >
+        int(Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+                m_policy.space().impl_internal_space_instance(), attr,
+                arg_functor, arg_policy.impl_vector_length(),
+                arg_policy.team_scratch_size(0),
+                arg_policy.thread_scratch_size(0)) /
+            arg_policy.impl_vector_length())) {
+      Kokkos::Impl::throw_runtime_exception(std::string(
+          "Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
+    }
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// ParallelReduce specialization for Kokkos::RangePolicy on Cuda using the
+// shared-memory inter-block reduce/scan path (UseShflReduction is false).
+template <class FunctorType, class ReducerType, class... Traits>
+class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
+                     Kokkos::Cuda> {
+ public:
+  using Policy = Kokkos::RangePolicy<Traits...>;
+
+ private:
+  using WorkRange = typename Policy::WorkRange;
+  using WorkTag = typename Policy::work_tag;
+  using Member = typename Policy::member_type;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  // If no explicit reducer was given, the functor itself carries the
+  // reduction (init/join/final) and the WorkTag is forwarded.
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                                  WorkTag, void>::type;
+
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
+  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+
+ public:
+  using pointer_type = typename ValueTraits::pointer_type;
+  using value_type = typename ValueTraits::value_type;
+  using reference_type = typename ValueTraits::reference_type;
+  using functor_type = FunctorType;
+  using size_type = Kokkos::Cuda::size_type;
+  using index_type = typename Policy::index_type;
+  using reducer_type = ReducerType;
+
+  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
+  // blockDim.z == 1
+
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+  const bool m_result_ptr_host_accessible;
+  size_type* m_scratch_space;
+  size_type* m_scratch_flags;
+  size_type* m_unified_space;
+
+  // Shall we use the shfl based reduction or not (only use it for static sized
+  // types of more than 128bit)
+  enum {
+    UseShflReduction = false
+  };  //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize)
+      //};
+  // Some crutch to do function overloading
+ private:
+  using DummyShflReductionType = double;
+  using DummySHMEMReductionType = int;
+
+ public:
+  Policy const& get_policy() const { return m_policy; }
+
+  // Make the exec_range calls call to Reduce::DeviceIterateTile
+  template <class TagType>
+  __device__ inline
+      typename std::enable_if<std::is_same<TagType, void>::value>::type
+      exec_range(const Member& i, reference_type update) const {
+    m_functor(i, update);
+  }
+
+  template <class TagType>
+  __device__ inline
+      typename std::enable_if<!std::is_same<TagType, void>::value>::type
+      exec_range(const Member& i, reference_type update) const {
+    m_functor(TagType(), i, update);
+  }
+
+  __device__ inline void operator()() const {
+    /* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType,
+       DummySHMEMReductionType>::select(1,1.0) );
+     }
+
+     __device__ inline
+     void run(const DummySHMEMReductionType& ) const
+     {*/
+    // Per-thread reduction value lives in shared memory, word_count
+    // size_type words per thread.
+    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
+                                                   sizeof(size_type)>
+        word_count(ValueTraits::value_size(
+                       ReducerConditional::select(m_functor, m_reducer)) /
+                   sizeof(size_type));
+
+    {
+      reference_type value =
+          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
+                          kokkos_impl_cuda_shared_memory<size_type>() +
+                              threadIdx.y * word_count.value);
+
+      // Number of blocks is bounded so that the reduction can be limited to two
+      // passes. Each thread block is given an approximately equal amount of
+      // work to perform. Accumulate the values for this block. The accumulation
+      // ordering does not match the final pass, but is arithmetically
+      // equivalent.
+
+      const WorkRange range(m_policy, blockIdx.x, gridDim.x);
+
+      for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end();
+           iwork < iwork_end; iwork += blockDim.y) {
+        this->template exec_range<WorkTag>(iwork, value);
+      }
+    }
+
+    // Doing code duplication here to fix issue #3428
+    // Suspect optimizer bug??
+    // Reduce with final value at blockDim.y - 1 location.
+    // Shortcut for length zero reduction
+    if (m_policy.begin() == m_policy.end()) {
+      // This is the final block with the final result at the final threads'
+      // location
+
+      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
+                                (blockDim.y - 1) * word_count.value;
+      size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<size_type*>(m_result_ptr)
+              : (m_unified_space ? m_unified_space : m_scratch_space);
+
+      if (threadIdx.y == 0) {
+        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
+            ReducerConditional::select(m_functor, m_reducer), shared);
+      }
+
+      if (CudaTraits::WarpSize < word_count.value) {
+        __syncthreads();
+      }
+
+      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
+        global[i] = shared[i];
+      }
+      // return ;
+    }
+
+    if (m_policy.begin() != m_policy.end()) {
+      {
+        if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd,
+                                                WorkTagFwd>(
+                ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
+                gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
+                m_scratch_space, m_scratch_flags)) {
+          // This is the final block with the final result at the final threads'
+          // location
+
+          size_type* const shared =
+              kokkos_impl_cuda_shared_memory<size_type>() +
+              (blockDim.y - 1) * word_count.value;
+          size_type* const global =
+              m_result_ptr_device_accessible
+                  ? reinterpret_cast<size_type*>(m_result_ptr)
+                  : (m_unified_space ? m_unified_space : m_scratch_space);
+
+          if (threadIdx.y == 0) {
+            Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
+                ReducerConditional::select(m_functor, m_reducer), shared);
+          }
+
+          if (CudaTraits::WarpSize < word_count.value) {
+            __syncthreads();
+          }
+
+          for (unsigned i = threadIdx.y; i < word_count.value;
+               i += blockDim.y) {
+            global[i] = shared[i];
+          }
+        }
+      }
+    }
+  }
+  /*  __device__ inline
+     void run(const DummyShflReductionType&) const
+     {
+       value_type value;
+       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
+     &value);
+       // Number of blocks is bounded so that the reduction can be limited to
+     two passes.
+       // Each thread block is given an approximately equal amount of work to
+     perform.
+       // Accumulate the values for this block.
+       // The accumulation ordering does not match the final pass, but is
+     arithmatically equivalent.
+
+       const WorkRange range( m_policy , blockIdx.x , gridDim.x );
+
+       for ( Member iwork = range.begin() + threadIdx.y , iwork_end =
+     range.end() ; iwork < iwork_end ; iwork += blockDim.y ) { this-> template
+     exec_range< WorkTag >( iwork , value );
+       }
+
+       pointer_type const result = (pointer_type) (m_unified_space ?
+     m_unified_space : m_scratch_space) ;
+
+       int max_active_thread = range.end()-range.begin() < blockDim.y ?
+     range.end() - range.begin():blockDim.y;
+
+       max_active_thread = (max_active_thread ==
+     0)?blockDim.y:max_active_thread;
+
+       value_type init;
+       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
+     &init);
+       if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd>
+     (value,init,ValueJoin(ReducerConditional::select(m_functor ,
+     m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
+         const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+         if(id==0) {
+           Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final(
+     ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+           *result = value;
+         }
+       }
+     }*/
+
+  // Determine block size constrained by shared memory:
+  inline unsigned local_block_size(const FunctorType& f) {
+    // Start at 8 warps and halve until both the shared-memory budget and
+    // the occupancy-derived max block size are satisfied.
+    unsigned n = CudaTraits::WarpSize * 8;
+    int shmem_size =
+        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
+            f, n);
+    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type,
+                           LaunchBounds>::get_cuda_func_attributes();
+    while (
+        (n &&
+         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
+          shmem_size)) ||
+        (n >
+         static_cast<unsigned>(
+             Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
+                 shmem_size, 0)))) {
+      n >>= 1;
+      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                             WorkTag>(f, n);
+    }
+    return n;
+  }
+
+  inline void execute() {
+    const index_type nwork = m_policy.end() - m_policy.begin();
+    // A device launch is still required for zero work when init/final must
+    // run on device or the result is not host accessible.
+    const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
+                                 ReduceFunctorHasFinal<FunctorType>::value ||
+                                 !m_result_ptr_host_accessible ||
+#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
+                                 Policy::is_graph_kernel::value ||
+#endif
+                                 !std::is_same<ReducerType, InvalidType>::value;
+    if ((nwork > 0) || need_device_set) {
+      const int block_size = local_block_size(m_functor);
+
+      KOKKOS_ASSERT(block_size > 0);
+
+      m_scratch_space = cuda_internal_scratch_space(
+          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
+                                m_functor, m_reducer)) *
+                                block_size /* block_size == max block_count */);
+      m_scratch_flags =
+          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
+      m_unified_space = cuda_internal_scratch_unified(
+          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
+                                m_functor, m_reducer)));
+
+      // REQUIRED ( 1 , N , 1 )
+      dim3 block(1, block_size, 1);
+      // Required grid.x <= block.y
+      dim3 grid(std::min(int(block.y), int((nwork + block.y - 1) / block.y)), 1,
+                1);
+
+      // TODO @graph We need to effectively insert this in to the graph
+      const int shmem =
+          UseShflReduction
+              ? 0
+              : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                          WorkTag>(m_functor,
+                                                                   block.y);
+
+      if ((nwork == 0)
+#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
+          || Kokkos::Impl::CudaInternal::cuda_use_serial_execution()
+#endif
+      ) {
+        block = dim3(1, 1, 1);
+        grid = dim3(1, 1, 1);
+      }
+
+      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
+          *this, grid, block, shmem,
+          m_policy.space().impl_internal_space_instance(),
+          false);  // copy to device and execute
+
+      if (!m_result_ptr_device_accessible) {
+        m_policy.space().fence();
+
+        if (m_result_ptr) {
+          if (m_unified_space) {
+            const int count = ValueTraits::value_count(
+                ReducerConditional::select(m_functor, m_reducer));
+            for (int i = 0; i < count; ++i) {
+              m_result_ptr[i] = pointer_type(m_unified_space)[i];
+            }
+          } else {
+            const int size = ValueTraits::value_size(
+                ReducerConditional::select(m_functor, m_reducer));
+            DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
+          }
+        }
+      }
+    } else {
+      if (m_result_ptr) {
+        // TODO @graph We need to effectively insert this in to the graph
+        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
+                        m_result_ptr);
+      }
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const ViewType& arg_result,
+                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
+                                         void*>::type = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_host_accessible(
+            MemorySpaceAccess<Kokkos::HostSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {}
+
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const ReducerType& reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_result_ptr_host_accessible(
+            MemorySpaceAccess<Kokkos::HostSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {}
+};
+
+// MDRangePolicy impl
+template <class FunctorType, class ReducerType, class... 
Traits>
+// ParallelReduce specialization for Kokkos::MDRangePolicy on Cuda.  Tiles
+// are iterated via Reduce::DeviceIterateTile and combined with the
+// shared-memory inter-block reduce/scan (UseShflReduction is false).
+class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
+                     Kokkos::Cuda> {
+ public:
+  using Policy = Kokkos::MDRangePolicy<Traits...>;
+
+ private:
+  using array_index_type = typename Policy::array_index_type;
+  using index_type = typename Policy::index_type;
+
+  using WorkTag = typename Policy::work_tag;
+  using Member = typename Policy::member_type;
+  using LaunchBounds = typename Policy::launch_bounds;
+
+  // If no explicit reducer was given, the functor itself carries the
+  // reduction (init/join/final) and the WorkTag is forwarded.
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
+      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                                  WorkTag, void>::type;
+
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
+  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
+
+ public:
+  using pointer_type = typename ValueTraits::pointer_type;
+  using value_type = typename ValueTraits::value_type;
+  using reference_type = typename ValueTraits::reference_type;
+  using functor_type = FunctorType;
+  using size_type = Cuda::size_type;
+  using reducer_type = ReducerType;
+
+  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
+  // blockDim.z == 1
+
+  const FunctorType m_functor;
+  const Policy m_policy;  // used for workrange and nwork
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
+  size_type* m_scratch_space;
+  size_type* m_scratch_flags;
+  size_type* m_unified_space;
+
+  using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
+      Policy::rank, Policy, FunctorType, typename Policy::work_tag,
+      reference_type>;
+
+  // Shall we use the shfl based reduction or not (only use it for static sized
+  // types of more than 128bit
+  static constexpr bool UseShflReduction = false;
+  //((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize)
+  // Some crutch to do function overloading
+ private:
+  using DummyShflReductionType = double;
+  using DummySHMEMReductionType = int;
+
+ public:
+  // Upper bound on the product of tile dimensions, limited by the register
+  // budget per SM and MaxHierarchicalParallelism.
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy& pol, const Functor&) {
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<ParallelReduce,
+                           LaunchBounds>::get_cuda_func_attributes();
+    auto const& prop = pol.space().cuda_device_prop();
+    // Limits due do registers/SM
+    int const regs_per_sm = prop.regsPerMultiprocessor;
+    int const regs_per_thread = attr.numRegs;
+    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
+    return std::min(
+        max_threads_per_sm,
+        static_cast<int>(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism));
+  }
+  Policy const& get_policy() const { return m_policy; }
+  inline __device__ void exec_range(reference_type update) const {
+    Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType,
+                                            typename Policy::work_tag,
+                                            reference_type>(m_policy, m_functor,
+                                                            update)
+        .exec_range();
+  }
+
+  inline __device__ void operator()() const {
+    /* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType,
+       DummySHMEMReductionType>::select(1,1.0) );
+     }
+
+     __device__ inline
+     void run(const DummySHMEMReductionType& ) const
+     {*/
+    // Per-thread reduction value lives in shared memory, word_count
+    // size_type words per thread.
+    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
+                                                   sizeof(size_type)>
+        word_count(ValueTraits::value_size(
+                       ReducerConditional::select(m_functor, m_reducer)) /
+                   sizeof(size_type));
+
+    {
+      reference_type value =
+          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
+                          kokkos_impl_cuda_shared_memory<size_type>() +
+                              threadIdx.y * word_count.value);
+
+      // Number of blocks is bounded so that the reduction can be limited to two
+      // passes. Each thread block is given an approximately equal amount of
+      // work to perform. Accumulate the values for this block. The accumulation
+      // ordering does not match the final pass, but is arithmetically
+      // equivalent.
+
+      this->exec_range(value);
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    // Problem: non power-of-two blockDim
+    if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd, WorkTagFwd>(
+            ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
+            gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
+            m_scratch_space, m_scratch_flags)) {
+      // This is the final block with the final result at the final threads'
+      // location
+      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
+                                (blockDim.y - 1) * word_count.value;
+      size_type* const global =
+          m_result_ptr_device_accessible
+              ? reinterpret_cast<size_type*>(m_result_ptr)
+              : (m_unified_space ? m_unified_space : m_scratch_space);
+
+      if (threadIdx.y == 0) {
+        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
+            ReducerConditional::select(m_functor, m_reducer), shared);
+      }
+
+      if (CudaTraits::WarpSize < word_count.value) {
+        __syncthreads();
+      }
+
+      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
+        global[i] = shared[i];
+      }
+    }
+  }
+
+  /*  __device__ inline
+     void run(const DummyShflReductionType&) const
+     {
+
+       value_type value;
+       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
+     &value);
+       // Number of blocks is bounded so that the reduction can be limited to
+     two passes.
+       // Each thread block is given an approximately equal amount of work to
+     perform.
+       // Accumulate the values for this block.
+       // The accumulation ordering does not match the final pass, but is
+     arithmatically equivalent.
+
+       const Member work_part =
+       ( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion
+     of tiles handled by each block
+
+       this-> exec_range( value );
+
+       pointer_type const result = (pointer_type) (m_unified_space ?
+     m_unified_space : m_scratch_space) ;
+
+       int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
+       max_active_thread = (max_active_thread ==
+     0)?blockDim.y:max_active_thread;
+
+       value_type init;
+       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) ,
+     &init);
+       if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd>
+     (value,init,ValueJoin(ReducerConditional::select(m_functor ,
+     m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
+         const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+         if(id==0) {
+           Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final(
+     ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+           *result = value;
+         }
+       }
+     }
+     */
+  // Determine block size constrained by shared memory:
+  inline unsigned local_block_size(const FunctorType& f) {
+    // Start at 8 warps and halve until both the shared-memory budget and
+    // the occupancy-derived max block size are satisfied.
+    unsigned n = CudaTraits::WarpSize * 8;
+    int shmem_size =
+        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
+            f, n);
+    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type,
+                           LaunchBounds>::get_cuda_func_attributes();
+    while (
+        (n &&
+         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
+          shmem_size)) ||
+        (n >
+         static_cast<unsigned>(
+             Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
+                 shmem_size, 0)))) {
+      n >>= 1;
+      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                             WorkTag>(f, n);
+    }
+    return n;
+  }
+
+  inline void execute() {
+    const int nwork = m_policy.m_num_tiles;
+    if (nwork) {
+      // Round the tile-dim product up to the next power of two, then take
+      // the larger of that and the shared-memory-constrained suggestion.
+      int block_size = m_policy.m_prod_tile_dims;
+      // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
+      // Nearest power of two
+      int exponent_pow_two = std::ceil(std::log2(block_size));
+      block_size = std::pow(2, exponent_pow_two);
+      int suggested_blocksize = local_block_size(m_functor);
+
+      block_size = (block_size > suggested_blocksize)
+                       ? block_size
+                       : suggested_blocksize;  // Note: block_size must be less
+                                               // than or equal to 512
+
+      m_scratch_space = cuda_internal_scratch_space(
+          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
+                                m_functor, m_reducer)) *
+                                block_size /* block_size == max block_count */);
+      m_scratch_flags =
+          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
+      m_unified_space = cuda_internal_scratch_unified(
+          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
+                                m_functor, m_reducer)));
+
+      // REQUIRED ( 1 , N , 1 )
+      const dim3 block(1, block_size, 1);
+      // Required grid.x <= block.y
+      const dim3 grid(std::min(int(block.y), int(nwork)), 1, 1);
+
+      // TODO @graph We need to effectively insert this in to the graph
+      const int shmem =
+          UseShflReduction
+              ? 0
+              : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                          WorkTag>(m_functor,
+                                                                   block.y);
+
+      CudaParallelLaunch<ParallelReduce, LaunchBounds>(
+          *this, grid, block, shmem,
+          m_policy.space().impl_internal_space_instance(),
+          false);  // copy to device and execute
+
+      if (!m_result_ptr_device_accessible) {
+        m_policy.space().fence();
+
+        if (m_result_ptr) {
+          if (m_unified_space) {
+            const int count = ValueTraits::value_count(
+                ReducerConditional::select(m_functor, m_reducer));
+            for (int i = 0; i < count; ++i) {
+              m_result_ptr[i] = pointer_type(m_unified_space)[i];
+            }
+          } else {
+            const int size = ValueTraits::value_size(
+                ReducerConditional::select(m_functor, m_reducer));
+            DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size);
+          }
+        }
+      }
+    } else {
+      if (m_result_ptr) {
+        // TODO @graph We need to effectively insert this in to the graph
+        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
+                        m_result_ptr);
+      }
+    }
+  }
+
+  template <class ViewType>
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const ViewType& arg_result,
+                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
+                                         void*>::type = nullptr)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(InvalidType()),
+        m_result_ptr(arg_result.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {}
+
+  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
+                 const ReducerType& reducer)
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_reducer(reducer),
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::CudaSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {}
+};
+
+//----------------------------------------------------------------------------
+
+template <class FunctorType, class ReducerType, class... 
Properties> +class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>, + ReducerType, Kokkos::Cuda> { + public: + using Policy = TeamPolicy<Properties...>; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>::type; + + using ValueTraits = + Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + using value_type = typename ValueTraits::value_type; + + public: + using functor_type = FunctorType; + using size_type = Cuda::size_type; + using reducer_type = ReducerType; + + enum : bool { + UseShflReduction = (true && (ValueTraits::StaticValueSize != 0)) + }; + + private: + using DummyShflReductionType = double; + using DummySHMEMReductionType = int; + + // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == + // blockDim.z == 1 shared memory utilization: + // + // [ global reduce space ] + // [ team reduce space ] + // [ team shared space ] + // + + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type* m_unified_space; + size_type m_team_begin; + size_type m_shmem_begin; + size_type m_shmem_size; + void* m_scratch_ptr[2]; + 
int m_scratch_size[2]; + const size_type m_league_size; + int m_team_size; + const size_type m_vector_size; + + template <class TagType> + __device__ inline + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_team(const Member& member, reference_type update) const { + m_functor(member, update); + } + + template <class TagType> + __device__ inline + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_team(const Member& member, reference_type update) const { + m_functor(TagType(), member, update); + } + + public: + Policy const& get_policy() const { return m_policy; } + + __device__ inline void operator()() const { + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + __shared__ int64_t base_thread_id; + if (threadIdx.x == 0 && threadIdx.y == 0) { + threadid = (blockIdx.x * blockDim.z + threadIdx.z) % + (Kokkos::Impl::g_device_cuda_lock_arrays.n / + (blockDim.x * blockDim.y)); + threadid *= blockDim.x * blockDim.y; + int done = 0; + while (!done) { + done = + (0 == + atomicCAS( + &Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid], + 0, 1)); + if (!done) { + threadid += blockDim.x * blockDim.y; + if (int64_t(threadid + blockDim.x * blockDim.y) >= + int64_t(Kokkos::Impl::g_device_cuda_lock_arrays.n)) + threadid = 0; + } + } + base_thread_id = threadid; + } + __syncthreads(); + threadid = base_thread_id; + } + + run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, + DummySHMEMReductionType>::select(1, 1.0), + threadid); + if (m_scratch_size[1] > 0) { + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) + Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid] = 0; + } + } + + __device__ inline void run(const DummySHMEMReductionType&, + const int& threadid) const { + const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> + word_count(ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)) / + sizeof(size_type)); + + reference_type 
value = + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + kokkos_impl_cuda_shared_memory<size_type>() + + threadIdx.y * word_count.value); + + // Iterate this block through the league + const int int_league_size = (int)m_league_size; + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team<WorkTag>( + Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin, + m_shmem_begin, m_shmem_size, + (void*)(((char*)m_scratch_ptr[1]) + + ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size), + value); + } + + // Reduce with final value at blockDim.y - 1 location. + // Doing code duplication here to fix issue #3428 + // Suspect optimizer bug?? + if (m_league_size == 0) { + // This is the final block with the final result at the final threads' + // location + + size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast<size_type*>(m_result_ptr) + : (m_unified_space ? 
m_unified_space : m_scratch_space); + + if (threadIdx.y == 0) { + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), shared); + } + + if (CudaTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + + if (m_league_size != 0) { + if (cuda_single_inter_block_reduce_scan<false, FunctorType, WorkTag>( + ReducerConditional::select(m_functor, m_reducer), blockIdx.x, + gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(), + m_scratch_space, m_scratch_flags)) { + // This is the final block with the final result at the final threads' + // location + + size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast<size_type*>(m_result_ptr) + : (m_unified_space ? m_unified_space : m_scratch_space); + + if (threadIdx.y == 0) { + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), shared); + } + + if (CudaTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + } + + __device__ inline void run(const DummyShflReductionType&, + const int& threadid) const { + value_type value; + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &value); + + // Iterate this block through the league + const int int_league_size = (int)m_league_size; + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team<WorkTag>( + Member(kokkos_impl_cuda_shared_memory<char>() + m_team_begin, + m_shmem_begin, m_shmem_size, + (void*)(((char*)m_scratch_ptr[1]) + + ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + 
m_scratch_size[1], league_rank, m_league_size), + value); + } + + pointer_type const result = + m_result_ptr_device_accessible + ? m_result_ptr + : (pointer_type)(m_unified_space ? m_unified_space + : m_scratch_space); + + value_type init; + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &init); + + if (int_league_size == 0) { + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), (void*)&value); + *result = value; + } else if ( + Impl::cuda_inter_block_reduction<FunctorType, ValueJoin, WorkTag>( + value, init, + ValueJoin(ReducerConditional::select(m_functor, m_reducer)), + m_scratch_space, result, m_scratch_flags, blockDim.y) + // This breaks a test + // Kokkos::Impl::CudaReductionsFunctor<FunctorType,WorkTag,false,true>::scalar_inter_block_reduction(ReducerConditional::select(m_functor + // , m_reducer) , blockIdx.x , gridDim.x , + // kokkos_impl_cuda_shared_memory<size_type>() , + // m_scratch_space , m_scratch_flags) + ) { + const unsigned id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), (void*)&value); + *result = value; + } + } + } + + inline void execute() { + const int nwork = m_league_size * m_team_size; + const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value || + ReduceFunctorHasFinal<FunctorType>::value || + !m_result_ptr_host_accessible || +#ifdef KOKKOS_CUDA_ENABLE_GRAPHS + Policy::is_graph_kernel::value || +#endif + !std::is_same<ReducerType, InvalidType>::value; + if ((nwork > 0) || need_device_set) { + const int block_count = + UseShflReduction ? 
std::min(m_league_size, size_type(1024 * 32)) + : std::min(int(m_league_size), m_team_size); + + m_scratch_space = cuda_internal_scratch_space( + m_policy.space(), ValueTraits::value_size(ReducerConditional::select( + m_functor, m_reducer)) * + block_count); + m_scratch_flags = + cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); + m_unified_space = cuda_internal_scratch_unified( + m_policy.space(), ValueTraits::value_size(ReducerConditional::select( + m_functor, m_reducer))); + + dim3 block(m_vector_size, m_team_size, 1); + dim3 grid(block_count, 1, 1); + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + if ((nwork == 0) +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + || Kokkos::Impl::CudaInternal::cuda_use_serial_execution() +#endif + ) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + + CudaParallelLaunch<ParallelReduce, LaunchBounds>( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().fence(); + + if (m_result_ptr) { + if (m_unified_space) { + const int count = ValueTraits::value_count( + ReducerConditional::select(m_functor, m_reducer)); + for (int i = 0; i < count; ++i) { + m_result_ptr[i] = pointer_type(m_unified_space)[i]; + } + } else { + const int size = ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)); + DeepCopy<HostSpace, CudaSpace>(m_result_ptr, m_scratch_space, size); + } + } + } + } else { + if (m_result_ptr) { + // TODO @graph We need to effectively insert this in to the graph + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + } + } + } + + template <class ViewType> + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ViewType& arg_result, + typename std::enable_if<Kokkos::is_view<ViewType>::value, + void*>::type = nullptr) + : m_functor(arg_functor), + 
m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess<Kokkos::CudaSpace, + typename ViewType::memory_space>::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess<Kokkos::HostSpace, + typename ViewType::memory_space>::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + m_unified_space(nullptr), + m_team_begin(0), + m_shmem_begin(0), + m_shmem_size(0), + m_scratch_ptr{nullptr, nullptr}, + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + cudaFuncAttributes attr = + CudaParallelLaunch<ParallelReduce, + LaunchBounds>::get_cuda_func_attributes(); + m_team_size = + m_team_size >= 0 + ? m_team_size + : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>( + m_policy.space().impl_internal_space_instance(), attr, + m_functor, m_vector_size, m_policy.team_scratch_size(0), + m_policy.thread_scratch_size(0)) / + m_vector_size; + + m_team_begin = + UseShflReduction + ? 0 + : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, + WorkTag>(arg_functor, + m_team_size); + m_shmem_begin = sizeof(double) * (m_team_size + 2); + m_shmem_size = + m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_ptr[1] = + m_team_size <= 0 + ? 
nullptr + : m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast<std::int64_t>(m_scratch_size[1]) * + (static_cast<std::int64_t>( + Cuda::concurrency() / + (m_team_size * m_vector_size)))); + + // The global parallel_reduce does not support vector_length other than 1 at + // the moment + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " + "greater than 1 is not currently supported for CUDA for dynamic " + "sized reduction types."); + + if ((m_team_size < 32) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " + "than 32 is not currently supported with CUDA for dynamic sized " + "reduction types."); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && + !UseShflReduction) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); + } + + if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < + shmem_size_total) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much " + "L0 scratch memory")); + } + + if (int(m_team_size) > + arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too " + "large team size.")); + } + } + + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ReducerType& reducer) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_result_ptr_device_accessible( + 
MemorySpaceAccess<Kokkos::CudaSpace, + typename ReducerType::result_view_type:: + memory_space>::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess<Kokkos::HostSpace, + typename ReducerType::result_view_type:: + memory_space>::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + m_unified_space(nullptr), + m_team_begin(0), + m_shmem_begin(0), + m_shmem_size(0), + m_scratch_ptr{nullptr, nullptr}, + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + cudaFuncAttributes attr = + CudaParallelLaunch<ParallelReduce, + LaunchBounds>::get_cuda_func_attributes(); + m_team_size = + m_team_size >= 0 + ? m_team_size + : Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>( + m_policy.space().impl_internal_space_instance(), attr, + m_functor, m_vector_size, m_policy.team_scratch_size(0), + m_policy.thread_scratch_size(0)) / + m_vector_size; + + m_team_begin = + UseShflReduction + ? 0 + : cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, + WorkTag>(arg_functor, + m_team_size); + m_shmem_begin = sizeof(double) * (m_team_size + 2); + m_shmem_size = + m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_ptr[1] = + m_team_size <= 0 + ? 
nullptr + : m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast<ptrdiff_t>(m_scratch_size[1]) * + static_cast<ptrdiff_t>(Cuda::concurrency() / + (m_team_size * m_vector_size))); + + // The global parallel_reduce does not support vector_length other than 1 at + // the moment + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " + "greater than 1 is not currently supported for CUDA for dynamic " + "sized reduction types."); + + if ((m_team_size < 32) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " + "than 32 is not currently supported with CUDA for dynamic sized " + "reduction types."); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) && + !UseShflReduction) || + m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < + shmem_size_total) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); + } + if (int(m_team_size) > + arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too " + "large team size.")); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... 
Traits> +class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { + public: + using Policy = Kokkos::RangePolicy<Traits...>; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using LaunchBounds = typename Policy::launch_bounds; + + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + public: + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + using functor_type = FunctorType; + using size_type = Cuda::size_type; + + private: + // Algorithmic constraints: + // (a) blockDim.y is a power of two + // (b) blockDim.y == blockDim.z == 1 + // (c) gridDim.x <= blockDim.y * blockDim.y + // (d) gridDim.y == gridDim.z == 1 + + const FunctorType m_functor; + const Policy m_policy; + size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type m_final; +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + bool m_run_serial; +#endif + + template <class TagType> + __device__ inline + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const Member& i, reference_type update, + const bool final_result) const { + m_functor(i, update, final_result); + } + + template <class TagType> + __device__ inline + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const Member& i, reference_type update, + const bool final_result) const { + m_functor(TagType(), i, update, final_result); + } + + //---------------------------------------- + + __device__ inline void initial() const { + const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> + word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); + + size_type* const 
shared_value = + kokkos_impl_cuda_shared_memory<size_type>() + + word_count.value * threadIdx.y; + + ValueInit::init(m_functor, shared_value); + + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of work + // to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmatically equivalent. + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range<WorkTag>( + iwork, ValueOps::reference(shared_value), false); + } + + // Reduce and scan, writing out scan of blocks' totals and block-groups' + // totals. Blocks' scan values are written to 'blockIdx.x' location. + // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < + // gridDim.x + cuda_single_inter_block_reduce_scan<true, FunctorType, WorkTag>( + m_functor, blockIdx.x, gridDim.x, + kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, + m_scratch_flags); + } + + //---------------------------------------- + + __device__ inline void final() const { + const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> + word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); + + // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , + // value[2] , ... } + size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); + size_type* const shared_prefix = + shared_data + word_count.value * threadIdx.y; + size_type* const shared_accum = + shared_data + word_count.value * (blockDim.y + 1); + + // Starting value for this thread block is the previous block's total. 
+ if (blockIdx.x) { + size_type* const block_total = + m_scratch_space + word_count.value * (blockIdx.x - 1); + for (unsigned i = threadIdx.y; i < word_count.value; ++i) { + shared_accum[i] = block_total[i]; + } + } else if (0 == threadIdx.y) { + ValueInit::init(m_functor, shared_accum); + } + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (typename Policy::member_type iwork_base = range.begin(); + iwork_base < range.end(); iwork_base += blockDim.y) { +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK; +#endif + const typename Policy::member_type iwork = iwork_base + threadIdx.y; + + __syncthreads(); // Don't overwrite previous iteration values until they + // are used + + ValueInit::init(m_functor, shared_prefix + word_count.value); + + // Copy previous block's accumulation total into thread[0] prefix and + // inclusive scan value of this block + for (unsigned i = threadIdx.y; i < word_count.value; ++i) { + shared_data[i + word_count.value] = shared_data[i] = shared_accum[i]; + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); +#else + KOKKOS_IMPL_CUDA_SYNCWARP; +#endif + if (CudaTraits::WarpSize < word_count.value) { + __syncthreads(); + } // Protect against large scan values. 
+ + // Call functor to accumulate inclusive scan value for this work item + if (iwork < range.end()) { + this->template exec_range<WorkTag>( + iwork, ValueOps::reference(shared_prefix + word_count.value), + false); + } + + // Scan block values into locations shared_data[1..blockDim.y] + cuda_intra_block_reduce_scan<true, FunctorType, WorkTag>( + m_functor, + typename ValueTraits::pointer_type(shared_data + word_count.value)); + + { + size_type* const block_total = + shared_data + word_count.value * blockDim.y; + for (unsigned i = threadIdx.y; i < word_count.value; ++i) { + shared_accum[i] = block_total[i]; + } + } + + // Call functor with exclusive scan value + if (iwork < range.end()) { + this->template exec_range<WorkTag>( + iwork, ValueOps::reference(shared_prefix), true); + } + } + } + + public: + Policy const& get_policy() const { return m_policy; } + + //---------------------------------------- + + __device__ inline void operator()() const { +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + if (m_run_serial) { + typename ValueTraits::value_type value; + ValueInit::init(m_functor, (void*)&value); + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (typename Policy::member_type iwork_base = range.begin(); + iwork_base < range.end(); iwork_base++) { + this->template exec_range<WorkTag>(iwork_base, value, true); + } + } else { +#endif + if (!m_final) { + initial(); + } else { + final(); + } +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + } +#endif + } + + // Determine block size constrained by shared memory: + inline unsigned local_block_size(const FunctorType& f) { + // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512 + // (16 warps) gridDim.x <= blockDim.y * blockDim.y + // + // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit + // testing + + unsigned n = CudaTraits::WarpSize * 4; + while (n && + unsigned(m_policy.space() + .impl_internal_space_instance() + ->m_maxShmemPerBlock) < + 
cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, + WorkTag>(f, n)) { + n >>= 1; + } + return n; + } + + inline void execute() { + const int nwork = m_policy.end() - m_policy.begin(); + if (nwork) { + enum { GridMaxComputeCapability_2x = 0x0ffff }; + + const int block_size = local_block_size(m_functor); + KOKKOS_ASSERT(block_size > 0); + + const int grid_max = + (block_size * block_size) < GridMaxComputeCapability_2x + ? (block_size * block_size) + : GridMaxComputeCapability_2x; + + // At most 'max_grid' blocks: + const int max_grid = + std::min(int(grid_max), int((nwork + block_size - 1) / block_size)); + + // How much work per block: + const int work_per_block = (nwork + max_grid - 1) / max_grid; + + // How many block are really needed for this much work: + const int grid_x = (nwork + work_per_block - 1) / work_per_block; + + m_scratch_space = cuda_internal_scratch_space( + m_policy.space(), ValueTraits::value_size(m_functor) * grid_x); + m_scratch_flags = + cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); + + dim3 grid(grid_x, 1, 1); + dim3 block(1, block_size, 1); // REQUIRED DIMENSIONS ( 1 , N , 1 ) + const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2); + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + if (m_run_serial) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } else { +#endif + m_final = false; + CudaParallelLaunch<ParallelScan, LaunchBounds>( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + } +#endif + m_final = true; + CudaParallelLaunch<ParallelScan, LaunchBounds>( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + } + } + + ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + 
m_final(false) +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + , + m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) +#endif + { + } +}; + +//---------------------------------------------------------------------------- +template <class FunctorType, class ReturnType, class... Traits> +class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, + ReturnType, Kokkos::Cuda> { + public: + using Policy = Kokkos::RangePolicy<Traits...>; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using LaunchBounds = typename Policy::launch_bounds; + + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + public: + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + using functor_type = FunctorType; + using size_type = Cuda::size_type; + + private: + // Algorithmic constraints: + // (a) blockDim.y is a power of two + // (b) blockDim.y == blockDim.z == 1 + // (c) gridDim.x <= blockDim.y * blockDim.y + // (d) gridDim.y == gridDim.z == 1 + + const FunctorType m_functor; + const Policy m_policy; + size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type m_final; + ReturnType& m_returnvalue; +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + bool m_run_serial; +#endif + + template <class TagType> + __device__ inline + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const Member& i, reference_type update, + const bool final_result) const { + m_functor(i, update, final_result); + } + + template <class TagType> + __device__ inline + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const Member& i, reference_type update, + const bool 
final_result) const { + m_functor(TagType(), i, update, final_result); + } + + //---------------------------------------- + + __device__ inline void initial() const { + const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> + word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); + + size_type* const shared_value = + kokkos_impl_cuda_shared_memory<size_type>() + + word_count.value * threadIdx.y; + + ValueInit::init(m_functor, shared_value); + + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of work + // to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmatically equivalent. + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range<WorkTag>( + iwork, ValueOps::reference(shared_value), false); + } + + // Reduce and scan, writing out scan of blocks' totals and block-groups' + // totals. Blocks' scan values are written to 'blockIdx.x' location. + // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < + // gridDim.x + cuda_single_inter_block_reduce_scan<true, FunctorType, WorkTag>( + m_functor, blockIdx.x, gridDim.x, + kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, + m_scratch_flags); + } + + //---------------------------------------- + + __device__ inline void final() const { + const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> + word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); + + // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , + // value[2] , ... 
} + size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); + size_type* const shared_prefix = + shared_data + word_count.value * threadIdx.y; + size_type* const shared_accum = + shared_data + word_count.value * (blockDim.y + 1); + + // Starting value for this thread block is the previous block's total. + if (blockIdx.x) { + size_type* const block_total = + m_scratch_space + word_count.value * (blockIdx.x - 1); + for (unsigned i = threadIdx.y; i < word_count.value; ++i) { + shared_accum[i] = block_total[i]; + } + } else if (0 == threadIdx.y) { + ValueInit::init(m_functor, shared_accum); + } + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (typename Policy::member_type iwork_base = range.begin(); + iwork_base < range.end(); iwork_base += blockDim.y) { +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK; +#endif + + const typename Policy::member_type iwork = iwork_base + threadIdx.y; + + __syncthreads(); // Don't overwrite previous iteration values until they + // are used + + ValueInit::init(m_functor, shared_prefix + word_count.value); + + // Copy previous block's accumulation total into thread[0] prefix and + // inclusive scan value of this block + for (unsigned i = threadIdx.y; i < word_count.value; ++i) { + shared_data[i + word_count.value] = shared_data[i] = shared_accum[i]; + } + +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); +#else + KOKKOS_IMPL_CUDA_SYNCWARP; +#endif + if (CudaTraits::WarpSize < word_count.value) { + __syncthreads(); + } // Protect against large scan values. 
+ + // Call functor to accumulate inclusive scan value for this work item + if (iwork < range.end()) { + this->template exec_range<WorkTag>( + iwork, ValueOps::reference(shared_prefix + word_count.value), + false); + } + + // Scan block values into locations shared_data[1..blockDim.y] + cuda_intra_block_reduce_scan<true, FunctorType, WorkTag>( + m_functor, + typename ValueTraits::pointer_type(shared_data + word_count.value)); + + { + size_type* const block_total = + shared_data + word_count.value * blockDim.y; + for (unsigned i = threadIdx.y; i < word_count.value; ++i) { + shared_accum[i] = block_total[i]; + } + } + + // Call functor with exclusive scan value + if (iwork < range.end()) { + this->template exec_range<WorkTag>( + iwork, ValueOps::reference(shared_prefix), true); + } + } + } + + public: + Policy const& get_policy() const { return m_policy; } + + //---------------------------------------- + + __device__ inline void operator()() const { +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + if (m_run_serial) { + typename ValueTraits::value_type value; + ValueInit::init(m_functor, (void*)&value); + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (typename Policy::member_type iwork_base = range.begin(); + iwork_base < range.end(); iwork_base++) { + this->template exec_range<WorkTag>(iwork_base, value, true); + } + *((typename ValueTraits::value_type*)m_scratch_space) = value; + } else { +#endif + if (!m_final) { + initial(); + } else { + final(); + } +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + } +#endif + } + + // Determine block size constrained by shared memory: + inline unsigned local_block_size(const FunctorType& f) { + // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512 + // (16 warps) gridDim.x <= blockDim.y * blockDim.y + // + // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit + // testing + + unsigned n = CudaTraits::WarpSize * 4; + while (n && + unsigned(m_policy.space() + 
.impl_internal_space_instance() + ->m_maxShmemPerBlock) < + cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, + WorkTag>(f, n)) { + n >>= 1; + } + return n; + } + + inline void execute() { + const int nwork = m_policy.end() - m_policy.begin(); + if (nwork) { + enum { GridMaxComputeCapability_2x = 0x0ffff }; + + const int block_size = local_block_size(m_functor); + KOKKOS_ASSERT(block_size > 0); + + const int grid_max = + (block_size * block_size) < GridMaxComputeCapability_2x + ? (block_size * block_size) + : GridMaxComputeCapability_2x; + + // At most 'max_grid' blocks: + const int max_grid = + std::min(int(grid_max), int((nwork + block_size - 1) / block_size)); + + // How much work per block: + const int work_per_block = (nwork + max_grid - 1) / max_grid; + + // How many block are really needed for this much work: + const int grid_x = (nwork + work_per_block - 1) / work_per_block; + + m_scratch_space = cuda_internal_scratch_space( + m_policy.space(), ValueTraits::value_size(m_functor) * grid_x); + m_scratch_flags = + cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); + + dim3 grid(grid_x, 1, 1); + dim3 block(1, block_size, 1); // REQUIRED DIMENSIONS ( 1 , N , 1 ) + const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2); + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + if (m_run_serial) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } else { +#endif + + m_final = false; + CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + } +#endif + m_final = true; + CudaParallelLaunch<ParallelScanWithTotal, LaunchBounds>( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + + const int size = ValueTraits::value_size(m_functor); +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + if 
(m_run_serial) + DeepCopy<HostSpace, CudaSpace>(&m_returnvalue, m_scratch_space, size); + else +#endif + DeepCopy<HostSpace, CudaSpace>( + &m_returnvalue, m_scratch_space + (grid_x - 1) * size / sizeof(int), + size); + } + } + + ParallelScanWithTotal(const FunctorType& arg_functor, + const Policy& arg_policy, ReturnType& arg_returnvalue) + : m_functor(arg_functor), + m_policy(arg_policy), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + m_final(false), + m_returnvalue(arg_returnvalue) +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + , + m_run_serial(Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) +#endif + { + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { +template <class FunctorType, class ExecPolicy, class ValueType, + class Tag = typename ExecPolicy::work_tag> +struct CudaFunctorAdapter { + const FunctorType f; + using value_type = ValueType; + CudaFunctorAdapter(const FunctorType& f_) : f(f_) {} + + __device__ inline void operator()(typename ExecPolicy::work_tag, + const typename ExecPolicy::member_type& i, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals third argument + // type of FunctorType::operator() + f(typename ExecPolicy::work_tag(), i, val); + } + + __device__ inline void operator()(typename ExecPolicy::work_tag, + const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals third argument + // type of FunctorType::operator() + f(typename ExecPolicy::work_tag(), i, j, val); + } + + __device__ inline void operator()(typename ExecPolicy::work_tag, + const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + const typename ExecPolicy::member_type& k, + 
ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals third argument + // type of FunctorType::operator() + f(typename ExecPolicy::work_tag(), i, j, k, val); + } + + __device__ inline void operator()(typename ExecPolicy::work_tag, + const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + const typename ExecPolicy::member_type& k, + const typename ExecPolicy::member_type& l, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals third argument + // type of FunctorType::operator() + f(typename ExecPolicy::work_tag(), i, j, k, l, val); + } + + __device__ inline void operator()(typename ExecPolicy::work_tag, + const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + const typename ExecPolicy::member_type& k, + const typename ExecPolicy::member_type& l, + const typename ExecPolicy::member_type& m, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals third argument + // type of FunctorType::operator() + f(typename ExecPolicy::work_tag(), i, j, k, l, m, val); + } + + __device__ inline void operator()(typename ExecPolicy::work_tag, + const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + const typename ExecPolicy::member_type& k, + const typename ExecPolicy::member_type& l, + const typename ExecPolicy::member_type& m, + const typename ExecPolicy::member_type& n, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals third argument + // type of FunctorType::operator() + f(typename ExecPolicy::work_tag(), i, j, k, l, m, n, val); + } +}; + +template <class FunctorType, class ExecPolicy, class ValueType> +struct CudaFunctorAdapter<FunctorType, ExecPolicy, ValueType, void> { + const FunctorType f; + using value_type = ValueType; + CudaFunctorAdapter(const FunctorType& f_) : f(f_) {} + + __device__ inline void operator()(const typename 
ExecPolicy::member_type& i, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, val); + } + + __device__ inline void operator()(const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, j, val); + } + + __device__ inline void operator()(const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + const typename ExecPolicy::member_type& k, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, j, k, val); + } + + __device__ inline void operator()(const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + const typename ExecPolicy::member_type& k, + const typename ExecPolicy::member_type& l, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, j, k, l, val); + } + + __device__ inline void operator()(const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + const typename ExecPolicy::member_type& k, + const typename ExecPolicy::member_type& l, + const typename ExecPolicy::member_type& m, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, j, k, l, m, val); + } + + __device__ inline void operator()(const typename ExecPolicy::member_type& i, + const typename ExecPolicy::member_type& j, + const typename ExecPolicy::member_type& k, + const typename ExecPolicy::member_type& l, + const typename ExecPolicy::member_type& m, + const typename ExecPolicy::member_type& n, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType 
equals second argument + // type of FunctorType::operator() + f(i, j, k, l, m, n, val); + } + + __device__ inline void operator()(typename ExecPolicy::member_type& i, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, val); + } + + __device__ inline void operator()(typename ExecPolicy::member_type& i, + typename ExecPolicy::member_type& j, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, j, val); + } + + __device__ inline void operator()(typename ExecPolicy::member_type& i, + typename ExecPolicy::member_type& j, + typename ExecPolicy::member_type& k, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, j, k, val); + } + + __device__ inline void operator()(typename ExecPolicy::member_type& i, + typename ExecPolicy::member_type& j, + typename ExecPolicy::member_type& k, + typename ExecPolicy::member_type& l, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, j, k, l, val); + } + + __device__ inline void operator()(typename ExecPolicy::member_type& i, + typename ExecPolicy::member_type& j, + typename ExecPolicy::member_type& k, + typename ExecPolicy::member_type& l, + typename ExecPolicy::member_type& m, + ValueType& val) const { + // Insert Static Assert with decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, j, k, l, m, val); + } + + __device__ inline void operator()(typename ExecPolicy::member_type& i, + typename ExecPolicy::member_type& j, + typename ExecPolicy::member_type& k, + typename ExecPolicy::member_type& l, + typename ExecPolicy::member_type& m, + typename ExecPolicy::member_type& n, + ValueType& val) const { + // Insert Static Assert with 
decltype on ValueType equals second argument + // type of FunctorType::operator() + f(i, j, k, l, m, n, val); + } +}; + +template <class FunctorType, class ResultType, class Tag, + bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value> +struct FunctorReferenceType { + using reference_type = ResultType&; +}; + +template <class FunctorType, class ResultType, class Tag> +struct FunctorReferenceType<FunctorType, ResultType, Tag, true> { + using reference_type = + typename Kokkos::Impl::FunctorValueTraits<FunctorType, + Tag>::reference_type; +}; + +template <class FunctorTypeIn, class ExecPolicy, class ValueType> +struct ParallelReduceFunctorType<FunctorTypeIn, ExecPolicy, ValueType, Cuda> { + enum { + FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value + }; + using functor_type = typename Kokkos::Impl::if_c< + FunctorHasValueType, FunctorTypeIn, + Impl::CudaFunctorAdapter<FunctorTypeIn, ExecPolicy, ValueType>>::type; + static functor_type functor(const FunctorTypeIn& functor_in) { + return Impl::if_c<FunctorHasValueType, FunctorTypeIn, functor_type>::select( + functor_in, functor_type(functor_in)); + } +}; + +} // namespace Impl + +} // namespace Kokkos + +#endif /* defined(KOKKOS_ENABLE_CUDA) */ +#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fc9fc3770bead16eff4a0b5b6fea8b0a2039200f --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -0,0 +1,987 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_REDUCESCAN_HPP +#define KOKKOS_CUDA_REDUCESCAN_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_CUDA) + +#include <utility> + +#include <Kokkos_Parallel.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> +#include <impl/Kokkos_Error.hpp> +#include <Cuda/Kokkos_Cuda_Vectorization.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +/* + * Algorithmic constraints: + * (a) threads with same threadIdx.y have same value + * (b) blockDim.x == power of two + * (c) blockDim.z == 1 + */ + +template <class ValueType, class JoinOp> +__device__ inline + typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type + cuda_intra_warp_reduction(ValueType& result, const JoinOp& join, + const uint32_t max_active_thread = blockDim.y) { + unsigned int shift = 1; + + // Reduce over values from threads with different threadIdx.y + while (blockDim.x * shift < 32) { + const ValueType tmp = shfl_down(result, blockDim.x * shift, 32u); + // Only join if upper thread is active (this allows non power of two for + // blockDim.y + if (threadIdx.y + shift < max_active_thread) join(result, tmp); + shift *= 2; + } + + result = shfl(result, 0, 32); +} + +template <class ValueType, class JoinOp> +__device__ inline + typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type + cuda_inter_warp_reduction(ValueType& value, const JoinOp& join, + const int max_active_thread = blockDim.y) { +#define STEP_WIDTH 4 + // Depending on the ValueType _shared__ memory must be aligned up to 8byte + // boundaries The reason not to use ValueType directly is that for types with + // constructors it could lead to 
race conditions + alignas(alignof(ValueType) > alignof(double) ? alignof(ValueType) + : alignof(double)) + __shared__ double sh_result[(sizeof(ValueType) + 7) / 8 * STEP_WIDTH]; + ValueType* result = (ValueType*)&sh_result; + const int step = 32 / blockDim.x; + int shift = STEP_WIDTH; + const int id = threadIdx.y % step == 0 ? threadIdx.y / step : 65000; + if (id < STEP_WIDTH) { + result[id] = value; + } + __syncthreads(); + while (shift <= max_active_thread / step) { + if (shift <= id && shift + STEP_WIDTH > id && threadIdx.x == 0) { + join(result[id % STEP_WIDTH], value); + } + __syncthreads(); + shift += STEP_WIDTH; + } + + value = result[0]; + for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++) + join(value, result[i]); +} + +template <class ValueType, class JoinOp> +__device__ inline + typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type + cuda_intra_block_reduction(ValueType& value, const JoinOp& join, + const int max_active_thread = blockDim.y) { + cuda_intra_warp_reduction(value, join, max_active_thread); + cuda_inter_warp_reduction(value, join, max_active_thread); +} + +template <class FunctorType, class JoinOp, class ArgTag = void> +__device__ bool cuda_inter_block_reduction( + typename FunctorValueTraits<FunctorType, ArgTag>::reference_type value, + typename FunctorValueTraits<FunctorType, ArgTag>::reference_type neutral, + const JoinOp& join, Cuda::size_type* const m_scratch_space, + typename FunctorValueTraits<FunctorType, + ArgTag>::pointer_type const /*result*/, + Cuda::size_type* const m_scratch_flags, + const int max_active_thread = blockDim.y) { +#ifdef __CUDA_ARCH__ + using pointer_type = + typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type; + using value_type = + typename FunctorValueTraits<FunctorType, ArgTag>::value_type; + + // Do the intra-block reduction with shfl operations and static shared memory + cuda_intra_block_reduction(value, join, max_active_thread); + + const int id = threadIdx.y * 
blockDim.x + threadIdx.x; + + // One thread in the block writes block result to global scratch_memory + if (id == 0) { + pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x; + *global = value; + } + + // One warp of last block performs inter block reduction through loading the + // block values from global scratch_memory + bool last_block = false; + __threadfence(); + __syncthreads(); + if (id < 32) { + Cuda::size_type count; + + // Figure out whether this is the last block + if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1); + count = Kokkos::shfl(count, 0, 32); + + // Last block does the inter block reduction + if (count == gridDim.x - 1) { + // set flag back to zero + if (id == 0) *m_scratch_flags = 0; + last_block = true; + value = neutral; + + pointer_type const volatile global = (pointer_type)m_scratch_space; + + // Reduce all global values with splitting work over threads in one warp + const int step_size = + blockDim.x * blockDim.y < 32 ? blockDim.x * blockDim.y : 32; + for (int i = id; i < (int)gridDim.x; i += step_size) { + value_type tmp = global[i]; + join(value, tmp); + } + + // Perform shfl reductions within the warp only join if contribution is + // valid (allows gridDim.x non power of two and <32) + if (int(blockDim.x * blockDim.y) > 1) { + value_type tmp = Kokkos::shfl_down(value, 1, 32); + if (id + 1 < int(gridDim.x)) join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; + int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + int active = KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + if (int(blockDim.x * blockDim.y) > 2) { + value_type tmp = Kokkos::shfl_down(value, 2, 32); + if (id + 2 < int(gridDim.x)) join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + active += KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + if (int(blockDim.x * blockDim.y) > 4) { + value_type tmp = 
Kokkos::shfl_down(value, 4, 32); + if (id + 4 < int(gridDim.x)) join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + active += KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + if (int(blockDim.x * blockDim.y) > 8) { + value_type tmp = Kokkos::shfl_down(value, 8, 32); + if (id + 8 < int(gridDim.x)) join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + active += KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + if (int(blockDim.x * blockDim.y) > 16) { + value_type tmp = Kokkos::shfl_down(value, 16, 32); + if (id + 16 < int(gridDim.x)) join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + active += KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + } + } + // The last block has in its thread=0 the global reduction value through + // "value" + return last_block; +#else + (void)value; + (void)neutral; + (void)join; + (void)m_scratch_space; + (void)m_scratch_flags; + (void)max_active_thread; + return true; +#endif +} + +template <class ReducerType> +__device__ inline + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type + cuda_intra_warp_reduction(const ReducerType& reducer, + typename ReducerType::value_type& result, + const uint32_t max_active_thread = blockDim.y) { + using ValueType = typename ReducerType::value_type; + + unsigned int shift = 1; + + // Reduce over values from threads with different threadIdx.y + while (blockDim.x * shift < 32) { + const ValueType tmp = shfl_down(result, blockDim.x * shift, 32u); + // Only join if upper thread is active (this allows non power of two for + // blockDim.y + if (threadIdx.y + shift < max_active_thread) reducer.join(result, tmp); + shift *= 2; + } + + result = shfl(result, 0, 32); + reducer.reference() = result; +} + +template <class ReducerType> +__device__ inline + typename 
std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type + cuda_inter_warp_reduction(const ReducerType& reducer, + typename ReducerType::value_type value, + const int max_active_thread = blockDim.y) { + using ValueType = typename ReducerType::value_type; + +#define STEP_WIDTH 4 + // Depending on the ValueType _shared__ memory must be aligned up to 8byte + // boundaries The reason not to use ValueType directly is that for types with + // constructors it could lead to race conditions + alignas(alignof(ValueType) > alignof(double) ? alignof(ValueType) + : alignof(double)) + __shared__ double sh_result[(sizeof(ValueType) + 7) / 8 * STEP_WIDTH]; + ValueType* result = (ValueType*)&sh_result; + const int step = 32 / blockDim.x; + int shift = STEP_WIDTH; + const int id = threadIdx.y % step == 0 ? threadIdx.y / step : 65000; + if (id < STEP_WIDTH) { + result[id] = value; + } + __syncthreads(); + while (shift <= max_active_thread / step) { + if (shift <= id && shift + STEP_WIDTH > id && threadIdx.x == 0) { + reducer.join(result[id % STEP_WIDTH], value); + } + __syncthreads(); + shift += STEP_WIDTH; + } + + value = result[0]; + for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++) + reducer.join(value, result[i]); + + reducer.reference() = value; +} + +template <class ReducerType> +__device__ inline + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type + cuda_intra_block_reduction(const ReducerType& reducer, + typename ReducerType::value_type value, + const int max_active_thread = blockDim.y) { + cuda_intra_warp_reduction(reducer, value, max_active_thread); + cuda_inter_warp_reduction(reducer, value, max_active_thread); +} + +template <class ReducerType> +__device__ inline + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type + cuda_intra_block_reduction(const ReducerType& reducer, + const int max_active_thread = blockDim.y) { + cuda_intra_block_reduction(reducer, reducer.reference(), max_active_thread); +} + +template 
<class ReducerType> +__device__ inline + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value, bool>::type + cuda_inter_block_reduction(const ReducerType& reducer, + Cuda::size_type* const m_scratch_space, + Cuda::size_type* const m_scratch_flags, + const int max_active_thread = blockDim.y) { +#ifdef __CUDA_ARCH__ + using pointer_type = typename ReducerType::value_type*; + using value_type = typename ReducerType::value_type; + + // Do the intra-block reduction with shfl operations and static shared memory + cuda_intra_block_reduction(reducer, max_active_thread); + + value_type value = reducer.reference(); + + const int id = threadIdx.y * blockDim.x + threadIdx.x; + + // One thread in the block writes block result to global scratch_memory + if (id == 0) { + pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x; + *global = value; + } + + // One warp of last block performs inter block reduction through loading the + // block values from global scratch_memory + bool last_block = false; + + __threadfence(); + __syncthreads(); + if (id < 32) { + Cuda::size_type count; + + // Figure out whether this is the last block + if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1); + count = Kokkos::shfl(count, 0, 32); + + // Last block does the inter block reduction + if (count == gridDim.x - 1) { + // set flag back to zero + if (id == 0) *m_scratch_flags = 0; + last_block = true; + reducer.init(value); + + pointer_type const volatile global = (pointer_type)m_scratch_space; + + // Reduce all global values with splitting work over threads in one warp + const int step_size = + blockDim.x * blockDim.y < 32 ? 
blockDim.x * blockDim.y : 32; + for (int i = id; i < (int)gridDim.x; i += step_size) { + value_type tmp = global[i]; + reducer.join(value, tmp); + } + + // Perform shfl reductions within the warp only join if contribution is + // valid (allows gridDim.x non power of two and <32) + if (int(blockDim.x * blockDim.y) > 1) { + value_type tmp = Kokkos::shfl_down(value, 1, 32); + if (id + 1 < int(gridDim.x)) reducer.join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; + int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + int active = KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + if (int(blockDim.x * blockDim.y) > 2) { + value_type tmp = Kokkos::shfl_down(value, 2, 32); + if (id + 2 < int(gridDim.x)) reducer.join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + active += KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + if (int(blockDim.x * blockDim.y) > 4) { + value_type tmp = Kokkos::shfl_down(value, 4, 32); + if (id + 4 < int(gridDim.x)) reducer.join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + active += KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + if (int(blockDim.x * blockDim.y) > 8) { + value_type tmp = Kokkos::shfl_down(value, 8, 32); + if (id + 8 < int(gridDim.x)) reducer.join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + active += KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + if (int(blockDim.x * blockDim.y) > 16) { + value_type tmp = Kokkos::shfl_down(value, 16, 32); + if (id + 16 < int(gridDim.x)) reducer.join(value, tmp); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + active += KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + } + } + + // The last block has in its thread=0 the global reduction value through + // "value" + return last_block; +#else + 
(void)reducer;
+  (void)m_scratch_space;
+  (void)m_scratch_flags;
+  (void)max_active_thread;
+  return true;
+#endif
+}
+
+// Primary template; specialized below on <DoScan, UseShfl>.  Only the
+// reduction-only (DoScan == false) specializations are defined here: one
+// communicating through warp shuffles (UseShfl == true) and one through a
+// shared-memory scratch buffer (UseShfl == false).
+template <class FunctorType, class ArgTag, bool DoScan, bool UseShfl>
+struct CudaReductionsFunctor;
+
+// Reduction-only implementation built on Kokkos::shfl_down: values stay in
+// registers and are combined lane-to-lane within a warp.
+template <class FunctorType, class ArgTag>
+struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
+  using ValueTraits  = FunctorValueTraits<FunctorType, ArgTag>;
+  using ValueJoin    = FunctorValueJoin<FunctorType, ArgTag>;
+  using ValueInit    = FunctorValueInit<FunctorType, ArgTag>;
+  using ValueOps     = FunctorValueOps<FunctorType, ArgTag>;
+  using pointer_type = typename ValueTraits::pointer_type;
+  using Scalar       = typename ValueTraits::value_type;
+
+  // Reduce 'value' across the 'width' consecutive lanes this thread belongs
+  // to; the combined result is broadcast from lane 0 of the group into
+  // 'result' on every participating lane (via in_place_shfl).
+  __device__ static inline void scalar_intra_warp_reduction(
+      const FunctorType& functor,
+      Scalar value,            // Contribution
+      const bool skip_vector,  // Skip threads if Kokkos vector lanes are not
+                               // part of the reduction
+      const int width,         // How much of the warp participates
+      Scalar& result) {
+    // Participation mask: the whole warp when width == 32, otherwise the
+    // width-sized lane group containing this thread.
+    unsigned mask =
+        width == 32
+            ? 0xffffffff
+            : ((1 << width) - 1)
+                  << ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width;
+    // Tree reduction; starting at delta == blockDim.x (when skip_vector)
+    // leaves Kokkos vector lanes out of the combine.
+    for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
+      Scalar tmp = Kokkos::shfl_down(value, delta, width, mask);
+      ValueJoin::join(functor, &value, &tmp);
+    }
+
+    Impl::in_place_shfl(result, value, 0, width, mask);
+  }
+
+  // Reduce one value per thread across the whole thread block; the block
+  // total is written to *my_global_team_buffer_element by warp 0.
+  __device__ static inline void scalar_intra_block_reduction(
+      const FunctorType& functor, Scalar value, const bool skip,
+      Scalar* my_global_team_buffer_element, const int shared_elements,
+      Scalar* shared_team_buffer_element) {
+    const int warp_id = (threadIdx.y * blockDim.x) / 32;
+    Scalar* const my_shared_team_buffer_element =
+        shared_team_buffer_element + warp_id % shared_elements;
+
+    // Warp Level Reduction, ignoring Kokkos vector entries
+    scalar_intra_warp_reduction(functor, value, skip, 32, value);
+
+    if (warp_id < shared_elements) {
+      *my_shared_team_buffer_element = value;
+    }
+    // Wait for every warp to be done before using one warp to do final cross
+    // warp reduction
+    __syncthreads();
+
+    // Fold the remaining warps' totals into the shared_elements slots, one
+    // wave of warps at a time (lane 0 of each warp performs the join).
+    const int num_warps = blockDim.x * blockDim.y / 32;
+    for (int w = shared_elements; w < num_warps; w += shared_elements) {
+      if (warp_id >= w && warp_id < w + shared_elements) {
+        if ((threadIdx.y * blockDim.x + threadIdx.x) % 32 == 0)
+          ValueJoin::join(functor, my_shared_team_buffer_element, &value);
+      }
+      __syncthreads();
+    }
+
+    // Warp 0 reduces the per-warp partials and publishes the block total.
+    if (warp_id == 0) {
+      ValueInit::init(functor, &value);
+      for (unsigned int i = threadIdx.y * blockDim.x + threadIdx.x;
+           i < blockDim.y * blockDim.x / 32; i += 32)
+        ValueJoin::join(functor, &value, &shared_team_buffer_element[i]);
+      scalar_intra_warp_reduction(functor, value, false, 32,
+                                  *my_global_team_buffer_element);
+    }
+  }
+
+  // Combine per-block totals across the grid.  Every block writes its total
+  // to global_data[blockIdx.x]; the last block to finish (tracked through
+  // the global_flags atomic counter) reduces all block totals and returns
+  // true, all other blocks return false.
+  __device__ static inline bool scalar_inter_block_reduction(
+      const FunctorType& functor, const Cuda::size_type /*block_id*/,
+      const Cuda::size_type block_count, Cuda::size_type* const shared_data,
+      Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+    Scalar* const global_team_buffer_element = ((Scalar*)global_data);
+    Scalar* const my_global_team_buffer_element =
+        global_team_buffer_element + blockIdx.x;
+    Scalar* shared_team_buffer_elements = ((Scalar*)shared_data);
+    Scalar value        = shared_team_buffer_elements[threadIdx.y];
+    int shared_elements = blockDim.x * blockDim.y / 32;
+    int global_elements = block_count;
+    __syncthreads();
+
+    scalar_intra_block_reduction(functor, value, true,
+                                 my_global_team_buffer_element, shared_elements,
+                                 shared_team_buffer_elements);
+    __threadfence();
+    __syncthreads();
+    unsigned int num_teams_done = 0;
+    // The cast in the atomic call is necessary to find matching call with
+    // MSVC/NVCC
+    if (threadIdx.x + threadIdx.y == 0) {
+      num_teams_done =
+          Kokkos::atomic_fetch_add(global_flags, static_cast<unsigned int>(1)) +
+          1;
+    }
+    bool is_last_block = false;
+    // True in exactly the block whose increment made the counter reach
+    // gridDim.x (only thread (0,0) holds a nonzero num_teams_done).
+    if (__syncthreads_or(num_teams_done == gridDim.x)) {
+      is_last_block = true;
+      *global_flags = 0;  // reset the counter for the next reduction
+      ValueInit::init(functor, &value);
+      for (int i = threadIdx.y * blockDim.x + threadIdx.x; i < global_elements;
+           i += blockDim.x * blockDim.y) {
+        ValueJoin::join(functor, &value, &global_team_buffer_element[i]);
+      }
+      scalar_intra_block_reduction(
+          functor, value, false, shared_team_buffer_elements + (blockDim.y - 1),
+          shared_elements, shared_team_buffer_elements);
+    }
+    return is_last_block;
+  }
+};
+
+// Reduction-only implementation that combines values in a shared-memory
+// buffer (one slot per thread) instead of using value shuffles; used when
+// the value type is too large to shuffle (see the dispatch in
+// cuda_single_inter_block_reduce_scan).
+template <class FunctorType, class ArgTag>
+struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
+  using ValueTraits  = FunctorValueTraits<FunctorType, ArgTag>;
+  using ValueJoin    = FunctorValueJoin<FunctorType, ArgTag>;
+  using ValueInit    = FunctorValueInit<FunctorType, ArgTag>;
+  using ValueOps     = FunctorValueOps<FunctorType, ArgTag>;
+  using pointer_type = typename ValueTraits::pointer_type;
+  using Scalar       = typename ValueTraits::value_type;
+
+  // Reduce in place through pointers into the shared buffer: each lane joins
+  // the slot 'delta' entries ahead, then every lane copies lane 0's slot
+  // ('value - lane_id') so all lanes observe the group result.
+  __device__ static inline void scalar_intra_warp_reduction(
+      const FunctorType& functor,
+      Scalar* value,           // Contribution
+      const bool skip_vector,  // Skip threads if Kokkos vector lanes are not
+                               // part of the reduction
+      const int width)  // How much of the warp participates
+  {
+#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
+    unsigned mask =
+        width == 32
+            ? 0xffffffff
+            : ((1 << width) - 1)
+                  << ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width;
+#endif
+    const int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32;
+    for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
+      if (lane_id + delta < 32) {
+        ValueJoin::join(functor, value, value + delta);
+      }
+#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
+      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
+#else
+      KOKKOS_IMPL_CUDA_SYNCWARP;
+#endif
+    }
+    *value = *(value - lane_id);  // broadcast lane 0's result to the warp
+  }
+
+  // Block-wide reduction through the shared buffer; warp 0 produces the
+  // final value and thread (0,0) stores it to *result.
+  __device__ static inline void scalar_intra_block_reduction(
+      const FunctorType& functor, Scalar value, const bool skip, Scalar* result,
+      const int /*shared_elements*/, Scalar* shared_team_buffer_element) {
+    const int warp_id = (threadIdx.y * blockDim.x) / 32;
+    Scalar* const my_shared_team_buffer_element =
+        shared_team_buffer_element + threadIdx.y * blockDim.x + threadIdx.x;
+    *my_shared_team_buffer_element = value;
+    // Warp Level Reduction, ignoring Kokkos vector entries
+    scalar_intra_warp_reduction(functor, my_shared_team_buffer_element, skip,
+                                32);
+    // Wait for every warp to be done before using one warp to do final cross
+    // warp reduction
+    __syncthreads();
+
+    if (warp_id == 0) {
+      // Gather each warp's result (stride-32 slots) into warp 0's lanes,
+      // then reduce across the (num_warps) valid lanes.
+      const unsigned int delta = (threadIdx.y * blockDim.x + threadIdx.x) * 32;
+      if (delta < blockDim.x * blockDim.y)
+        *my_shared_team_buffer_element = shared_team_buffer_element[delta];
+      KOKKOS_IMPL_CUDA_SYNCWARP;
+      scalar_intra_warp_reduction(functor, my_shared_team_buffer_element, false,
+                                  blockDim.x * blockDim.y / 32);
+      if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
+    }
+  }
+
+  // Same inter-block protocol as the shuffle-based specialization above:
+  // returns true only in the last block, which owns the final result.
+  __device__ static inline bool scalar_inter_block_reduction(
+      const FunctorType& functor, const Cuda::size_type /*block_id*/,
+      const Cuda::size_type block_count, Cuda::size_type* const shared_data,
+      Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+    Scalar* const global_team_buffer_element = ((Scalar*)global_data);
+    Scalar* const my_global_team_buffer_element =
+        global_team_buffer_element + blockIdx.x;
+    Scalar* shared_team_buffer_elements = ((Scalar*)shared_data);
+    Scalar value        = shared_team_buffer_elements[threadIdx.y];
+    int shared_elements = blockDim.x * blockDim.y / 32;
+    int global_elements = block_count;
+    __syncthreads();
+
+    scalar_intra_block_reduction(functor, value, true,
+                                 my_global_team_buffer_element, shared_elements,
+                                 shared_team_buffer_elements);
+    __threadfence();
+    __syncthreads();
+
+    unsigned int num_teams_done = 0;
+    // The cast in the atomic call is necessary to find matching call with
+    // MSVC/NVCC
+    if (threadIdx.x + threadIdx.y == 0) {
+      num_teams_done =
+          Kokkos::atomic_fetch_add(global_flags, static_cast<unsigned int>(1)) +
+          1;
+    }
+    bool is_last_block = false;
+    if (__syncthreads_or(num_teams_done == gridDim.x)) {
+      is_last_block = true;
+      *global_flags = 0;
+      ValueInit::init(functor, &value);
+      for (int i = threadIdx.y * blockDim.x + threadIdx.x; i < global_elements;
+           i += blockDim.x * blockDim.y) {
+        ValueJoin::join(functor, &value, &global_team_buffer_element[i]);
+      }
+      scalar_intra_block_reduction(
+          functor, value, false, shared_team_buffer_elements + (blockDim.y - 1),
+          shared_elements, shared_team_buffer_elements);
+    }
+    return is_last_block;
+  }
+};
+//----------------------------------------------------------------------------
+// See section B.17 of Cuda C Programming Guide Version 3.2
+// for discussion of
+// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize shared memory and minimize L1 cache:
+//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
+// For 2.0 capability: 48 KB shared and 16 KB L1
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/*
+ *  Algorithmic constraints:
+ *   (a) blockDim.y is a power of two
+ *   (b) blockDim.y <= 1024
+ *   (c) blockDim.x == blockDim.z == 1
+ */
+
+// Block-wide reduce (and, when DoScan, inclusive scan) over per-thread
+// values stored contiguously at 'base_data' (value_count words per thread).
+// The block total ends up in the *last* thread's slot; with DoScan each
+// thread's slot holds its inclusive prefix.
+template <bool DoScan, class FunctorType, class ArgTag>
+__device__ void cuda_intra_block_reduce_scan(
+    const FunctorType& functor,
+    const typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type
+        base_data) {
+  using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
+  using ValueJoin   = FunctorValueJoin<FunctorType, ArgTag>;
+
+  using pointer_type = typename ValueTraits::pointer_type;
+
+  const unsigned value_count   = ValueTraits::value_count(functor);
+  const unsigned BlockSizeMask = blockDim.y - 1;
+
+  // Must have power of two thread count
+
+  if (BlockSizeMask & blockDim.y) {
+    Kokkos::abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim");
+  }
+
+// Join the value 2^S slots below into TD when reversed-rank R is a multiple
+// of 2^(S+1); reversed ranks make the running total accumulate toward the
+// last thread's slot.
+#define BLOCK_REDUCE_STEP(R, TD, S)                          \
+  if (!(R & ((1 << (S + 1)) - 1))) {                         \
+    ValueJoin::join(functor, TD, (TD - (value_count << S))); \
+  }
+
+// Scan down-sweep: join the value 2^S slots below into TD when N == 2^S.
+#define BLOCK_SCAN_STEP(TD, N, S)                            \
+  if (N == (1 << S)) {                                       \
+    ValueJoin::join(functor, TD, (TD - (value_count << S))); \
+  }
+
+  // Reversed thread rank: the last thread (blockDim.y - 1) maps to 0.
+  const unsigned rtid_intra      = threadIdx.y ^ BlockSizeMask;
+  const pointer_type tdata_intra = base_data + value_count * threadIdx.y;
+
+  {  // Intra-warp reduction:
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 0)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 1)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 2)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 3)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 4)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+  }
+
+  __syncthreads();  // Wait for all warps to reduce
+
+  {  // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
+    const unsigned rtid_inter = (threadIdx.y ^ BlockSizeMask)
+                                << CudaTraits::WarpIndexShift;
+
+#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
+    // Mask of warp lanes that take the branch below; required so the masked
+    // syncwarp only waits on participating lanes.
+    unsigned inner_mask =
+        KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff, (rtid_inter < blockDim.y));
+#endif
+    if (rtid_inter < blockDim.y) {
+      const pointer_type tdata_inter =
+          base_data + value_count * (rtid_inter ^ BlockSizeMask);
+
+#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
+      if ((1 << 5) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
+      }
+      if ((1 << 6) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
+      }
+      if ((1 << 7) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
+      }
+      if ((1 << 8) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
+      }
+      if ((1 << 9) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
+      }
+#else
+      if ((1 << 5) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP;
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
+      }
+      if ((1 << 6) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP;
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
+      }
+      if ((1 << 7) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP;
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
+      }
+      if ((1 << 8) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP;
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
+      }
+      if ((1 << 9) < BlockSizeMask) {
+        KOKKOS_IMPL_CUDA_SYNCWARP;
+        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
+      }
+#endif
+
+      if (DoScan) {
+        // n = lowest set bit of rtid_inter among {32,64,128,256}: distance
+        // to the partial this thread must pick up in the down-sweep.
+        int n =
+            (rtid_inter & 32)
+                ? 32
+                : ((rtid_inter & 64)
+                       ? 64
+                       : ((rtid_inter & 128) ? 128
+                                             : ((rtid_inter & 256) ? 256 : 0)));
+
+        if (!(rtid_inter + n < blockDim.y)) n = 0;
+
+#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
+        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        BLOCK_SCAN_STEP(tdata_inter, n, 8)
+        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        BLOCK_SCAN_STEP(tdata_inter, n, 7)
+        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        BLOCK_SCAN_STEP(tdata_inter, n, 6)
+        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        BLOCK_SCAN_STEP(tdata_inter, n, 5)
+#else
+        KOKKOS_IMPL_CUDA_SYNCWARP;
+        BLOCK_SCAN_STEP(tdata_inter, n, 8)
+        KOKKOS_IMPL_CUDA_SYNCWARP;
+        BLOCK_SCAN_STEP(tdata_inter, n, 7)
+        KOKKOS_IMPL_CUDA_SYNCWARP;
+        BLOCK_SCAN_STEP(tdata_inter, n, 6)
+        KOKKOS_IMPL_CUDA_SYNCWARP;
+        BLOCK_SCAN_STEP(tdata_inter, n, 5)
+#endif
+      }
+    }
+  }
+
+  __syncthreads();  // Wait for inter-warp reduce-scan to complete
+
+  if (DoScan) {
+    // Intra-warp down-sweep mirroring the inter-warp one above.
+    int n =
+        (rtid_intra & 1)
+            ? 1
+            : ((rtid_intra & 2)
+                   ? 2
+                   : ((rtid_intra & 4)
+                          ? 4
+                          : ((rtid_intra & 8) ? 8
+                                              : ((rtid_intra & 16) ? 16 : 0))));
+
+    if (!(rtid_intra + n < blockDim.y)) n = 0;
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_SCAN_STEP(tdata_intra, n, 4) __threadfence_block();
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_SCAN_STEP(tdata_intra, n, 3) __threadfence_block();
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_SCAN_STEP(tdata_intra, n, 2) __threadfence_block();
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_SCAN_STEP(tdata_intra, n, 1) __threadfence_block();
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_SCAN_STEP(tdata_intra, n, 0) __threadfence_block();
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+  }
+
+#undef BLOCK_SCAN_STEP
+#undef BLOCK_REDUCE_STEP
+}
+
+//----------------------------------------------------------------------------
+/**\brief  Input value-per-thread starting at 'shared_data'.
+ *         Reduction value at last thread's location.
+ *
+ *  If 'DoScan' then write blocks' scan values and block-groups' scan values.
+ *
+ *  Global reduce result is in the last threads' 'shared_data' location.
+ */
+
+// General word-based (value_count-aware) inter-block reduce/scan; returns
+// true only in the last block to contribute, which then owns the result.
+template <bool DoScan, class FunctorType, class ArgTag>
+__device__ bool cuda_single_inter_block_reduce_scan2(
+    const FunctorType& functor, const Cuda::size_type block_id,
+    const Cuda::size_type block_count, Cuda::size_type* const shared_data,
+    Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+  using size_type   = Cuda::size_type;
+  using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
+  using ValueJoin   = FunctorValueJoin<FunctorType, ArgTag>;
+  using ValueInit   = FunctorValueInit<FunctorType, ArgTag>;
+  using ValueOps    = FunctorValueOps<FunctorType, ArgTag>;
+
+  using pointer_type = typename ValueTraits::pointer_type;
+
+  // '__ffs' = position of the least significant bit set to 1.
+  // 'blockDim.y' is guaranteed to be a power of two so this
+  // is the integral shift value that can replace an integral divide.
+  const unsigned BlockSizeShift = __ffs(blockDim.y) - 1;
+  const unsigned BlockSizeMask  = blockDim.y - 1;
+
+  // Must have power of two thread count
+  if (BlockSizeMask & blockDim.y) {
+    Kokkos::abort(
+        "Cuda::cuda_single_inter_block_reduce_scan requires power-of-two "
+        "blockDim");
+  }
+
+  // Number of size_type words per value (compile-time when statically sized).
+  const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
+                                                 sizeof(size_type)>
+      word_count(ValueTraits::value_size(functor) / sizeof(size_type));
+
+  // Reduce the accumulation for the entire block.
+  cuda_intra_block_reduce_scan<false, FunctorType, ArgTag>(
+      functor, pointer_type(shared_data));
+
+  {
+    // Write accumulation total to global scratch space.
+    // Accumulation total is the last thread's data.
+    size_type* const shared = shared_data + word_count.value * BlockSizeMask;
+    size_type* const global = global_data + word_count.value * block_id;
+
+    for (int i = int(threadIdx.y); i < int(word_count.value);
+         i += int(blockDim.y)) {
+      global[i] = shared[i];
+    }
+  }
+  __threadfence();
+
+  // Contributing blocks note that their contribution has been completed via an
+  // atomic-increment flag If this block is not the last block to contribute to
+  // this group then the block is done.
+  // (atomicInc wraps to 0 at block_count-1, so the flag is automatically
+  // reset for the next pass; only thread 0 contributes to the vote.)
+  const bool is_last_block = !__syncthreads_or(
+      threadIdx.y
+          ? 0
+          : (1 + atomicInc(global_flags, block_count - 1) < block_count));
+
+  if (is_last_block) {
+    // [b, e): this thread's share of the block totals to accumulate.
+    const size_type b =
+        (long(block_count) * long(threadIdx.y)) >> BlockSizeShift;
+    const size_type e =
+        (long(block_count) * long(threadIdx.y + 1)) >> BlockSizeShift;
+
+    {
+      void* const shared_ptr = shared_data + word_count.value * threadIdx.y;
+      /* reference_type shared_value = */ ValueInit::init(functor, shared_ptr);
+
+      for (size_type i = b; i < e; ++i) {
+        ValueJoin::join(functor, shared_ptr,
+                        global_data + word_count.value * i);
+      }
+    }
+
+    cuda_intra_block_reduce_scan<DoScan, FunctorType, ArgTag>(
+        functor, pointer_type(shared_data));
+
+    if (DoScan) {
+      // Exclusive-prefix slot: previous thread's inclusive value, or an
+      // identity-initialized extra slot for thread 0.
+      size_type* const shared_value =
+          shared_data +
+          word_count.value * (threadIdx.y ? threadIdx.y - 1 : blockDim.y);
+
+      if (!threadIdx.y) {
+        ValueInit::init(functor, shared_value);
+      }
+
+      // Join previous inclusive scan value to each member
+      for (size_type i = b; i < e; ++i) {
+        size_type* const global_value = global_data + word_count.value * i;
+        ValueJoin::join(functor, shared_value, global_value);
+        ValueOps ::copy(functor, global_value, shared_value);
+      }
+    }
+  }
+
+  return is_last_block;
+}
+
+// Dispatcher: pure reductions with a statically sized value use the faster
+// scalar CudaReductionsFunctor path (shuffle-based when the value fits in
+// 16 bytes, shared-memory based otherwise); scans fall back to the general
+// word-based implementation above.
+template <bool DoScan, class FunctorType, class ArgTag>
+__device__ bool cuda_single_inter_block_reduce_scan(
+    const FunctorType& functor, const Cuda::size_type block_id,
+    const Cuda::size_type block_count, Cuda::size_type* const shared_data,
+    Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+  using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
+  if (!DoScan && ValueTraits::StaticValueSize > 0)
+    return Kokkos::Impl::CudaReductionsFunctor<
+        FunctorType, ArgTag, false, (ValueTraits::StaticValueSize > 16)>::
+        scalar_inter_block_reduction(functor, block_id, block_count,
+                                     shared_data, global_data, global_flags);
+  else
+    return cuda_single_inter_block_reduce_scan2<DoScan, FunctorType, ArgTag>(
+        functor, block_id, block_count, shared_data, global_data, global_flags);
+}
+
+// Size in bytes required for inter block reduce or scan
+// (one value per thread plus two extra slots — presumably for the block
+// total and the exclusive-prefix slot used above; confirm against callers).
+template <bool DoScan, class FunctorType, class ArgTag>
+inline unsigned cuda_single_inter_block_reduce_scan_shmem(
+    const FunctorType& functor, const unsigned BlockSize) {
+  return (BlockSize + 2) *
+         Impl::FunctorValueTraits<FunctorType, ArgTag>::value_size(functor);
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined(KOKKOS_ENABLE_CUDA) */
+#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
new file
mode 100644 index 0000000000000000000000000000000000000000..777f57ced45b246af52cea73e796fbeae01cb57c --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp @@ -0,0 +1,72 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core.hpp> + +#include <impl/Kokkos_TaskQueue_impl.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template class TaskQueue< + Kokkos::Cuda, + Impl::default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> >; +template class TaskQueueMultiple< + Kokkos::Cuda, + Impl::default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda> >; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +#else +void KOKKOS_CORE_SRC_CUDA_KOKKOS_CUDA_TASK_PREVENT_LINK_ERROR() {} +#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG \ + ) */ diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2004edbeacdb4b5b309ea3bd6eb83b3abcfacea6 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -0,0 +1,1209 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_CUDA_TASK_HPP +#define KOKKOS_IMPL_CUDA_TASK_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include <Kokkos_Core_fwd.hpp> + +#include <impl/Kokkos_TaskBase.hpp> +#include <Cuda/Kokkos_Cuda_Error.hpp> // CUDA_SAFE_CALL +#include <impl/Kokkos_TaskTeamMember.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +template <typename TaskType> +__global__ void set_cuda_task_base_apply_function_pointer( + typename TaskType::function_type* ptr, + typename TaskType::destroy_type* dtor) { + *ptr = TaskType::apply; + *dtor = TaskType::destroy; +} + +template <typename Scheduler> +__global__ void cuda_task_queue_execute(Scheduler scheduler, + int32_t shmem_size) { + TaskQueueSpecialization<Scheduler>::driver(std::move(scheduler), shmem_size); +} + +} // namespace + +template <class, class> +class TaskExec; + +template <class QueueType> +class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> { + public: + using scheduler_type = SimpleTaskScheduler<Kokkos::Cuda, QueueType>; + using execution_space = Kokkos::Cuda; + using memory_space = Kokkos::CudaUVMSpace; + using member_type = TaskExec<Kokkos::Cuda, scheduler_type>; + + enum : long { max_league_size = 16 }; + enum : int { warps_per_block = 4 }; + + KOKKOS_INLINE_FUNCTION + static void iff_single_thread_recursive_execute(scheduler_type const&) {} + + static int get_max_team_count(execution_space const&) { + return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block; + } + + __device__ static void driver(scheduler_type scheduler, + int32_t shmem_per_warp) { + using 
queue_type = typename scheduler_type::task_queue_type; + using task_base_type = typename scheduler_type::task_base_type; + using runnable_task_base_type = + typename scheduler_type::runnable_task_base_type; + using scheduling_info_storage_type = SchedulingInfoStorage< + runnable_task_base_type, + typename scheduler_type::task_scheduling_info_type>; + + extern __shared__ int32_t shmem_all[]; + + int32_t* const warp_shmem = + shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t); + + task_base_type* const shared_memory_task_copy = (task_base_type*)warp_shmem; + + const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x; + + member_type single_exec(scheduler, warp_shmem, 1); + member_type team_exec(scheduler, warp_shmem, blockDim.y); + + auto& queue = scheduler.queue(); + auto& team_scheduler = team_exec.scheduler(); + + auto current_task = OptionalRef<task_base_type>(); + + // Loop until all queues are empty and no tasks in flight + while (!queue.is_done()) { + if (warp_lane == 0) { // should be (?) 
same as team_exec.team_rank() == 0 + // pop off a task + current_task = + queue.pop_ready_task(team_scheduler.team_scheduler_info()); + } + + // Broadcast task pointer: + + // Sync before the broadcast + KOKKOS_IMPL_CUDA_SYNCWARP; + + // pretend it's an int* for shuffle purposes + ((int*)¤t_task)[0] = + KOKKOS_IMPL_CUDA_SHFL(((int*)¤t_task)[0], 0, 32); + ((int*)¤t_task)[1] = + KOKKOS_IMPL_CUDA_SHFL(((int*)¤t_task)[1], 0, 32); + + if (current_task) { + KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag()); + + int32_t b = sizeof(scheduling_info_storage_type) / sizeof(int32_t); + static_assert( + sizeof(scheduling_info_storage_type) % sizeof(int32_t) == 0, + "bad task size"); + int32_t const e = current_task->get_allocation_size() / sizeof(int32_t); + KOKKOS_ASSERT(current_task->get_allocation_size() % sizeof(int32_t) == + 0); + + int32_t volatile* const task_mem = + (int32_t volatile*)current_task.get(); + + // do a coordinated copy of the task closure from global to shared + // memory: + for (int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) { + warp_shmem[i] = task_mem[i]; + } + + // Synchronize threads of the warp and insure memory + // writes are visible to all threads in the warp. + KOKKOS_IMPL_CUDA_SYNCWARP; + + if (shared_memory_task_copy->is_team_runnable()) { + // Thread Team Task + shared_memory_task_copy->as_runnable_task().run(team_exec); + } else if (threadIdx.y == 0) { + // TODO @tasking @optimization DSH Change this to warp_lane == 0 when + // we allow blockDim.x to be more than 1 Single Thread Task + shared_memory_task_copy->as_runnable_task().run(single_exec); + } + + // Synchronize threads of the warp and insure memory + // writes are visible to all threads in the warp. 
+ + KOKKOS_IMPL_CUDA_SYNCWARP; + + // if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize; + // b -= b % CudaTraits::WarpSize; + + // copy task closure from shared to global memory: + for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) { + task_mem[i] = warp_shmem[i]; + } + + // Synchronize threads of the warp and insure memory + // writes are visible to root thread of the warp for + // respawn or completion. + + KOKKOS_IMPL_CUDA_SYNCWARP; + + if (warp_lane == 0) { + // If respawn requested copy respawn data back to main memory + if (shared_memory_task_copy->as_runnable_task().get_respawn_flag()) { + if (shared_memory_task_copy->as_runnable_task().has_predecessor()) { + // It's not necessary to make this a volatile write because + // the next read of the predecessor is on this thread in complete, + // and the predecessor is cleared there (using a volatile write) + current_task->as_runnable_task().acquire_predecessor_from( + shared_memory_task_copy->as_runnable_task()); + } + + // It may not necessary to make this a volatile write, since the + // next read will be done by this thread in complete where the + // rescheduling occurs, but since the task could be stolen later + // before this is written again, we should do the volatile write + // here. (It might not be necessary though because I don't know + // where else the priority would be read after it is scheduled + // by this thread; for now, we leave it volatile, but we should + // benchmark the cost of this.) 
+ current_task.as_volatile()->set_priority( + shared_memory_task_copy->get_priority()); + + // It's not necessary to make this a volatile write, since the + // next read of it (if true) will be by this thread in `complete()`, + // which will unset the flag (using volatile) once it has handled + // the respawn + current_task->as_runnable_task().set_respawn_flag(); + } + + queue.complete((*std::move(current_task)).as_runnable_task(), + team_scheduler.team_scheduler_info()); + } + } + } + } + + static void execute(scheduler_type const& scheduler) { + const int shared_per_warp = 2048; + const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); + const int shared_total = shared_per_warp * warps_per_block; + const cudaStream_t stream = nullptr; + + KOKKOS_ASSERT( + static_cast<long>(grid.x * grid.y * grid.z * block.x * block.y * + block.z) == + static_cast<long>(get_max_team_count(scheduler.get_execution_space()) * + Kokkos::Impl::CudaTraits::WarpSize)); + + auto& queue = scheduler.queue(); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + // Query the stack size, in bytes: + + size_t previous_stack_size = 0; + CUDA_SAFE_CALL( + cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize)); + + // If not large enough then set the stack size, in bytes: + + const size_t larger_stack_size = 1 << 11; + + if (previous_stack_size < larger_stack_size) { + CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); + } + + cuda_task_queue_execute<<<grid, block, shared_total, stream>>>( + scheduler, shared_per_warp); + + CUDA_SAFE_CALL(cudaGetLastError()); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + if (previous_stack_size < larger_stack_size) { + CUDA_SAFE_CALL( + cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size)); + } + } + + template <typename TaskType> + static + // TODO @tasking @optimiazation DSH specialize this for trivially + // destructible types + 
void + get_function_pointer(typename TaskType::function_type& ptr, + typename TaskType::destroy_type& dtor) { + using function_type = typename TaskType::function_type; + using destroy_type = typename TaskType::destroy_type; + + // TODO @tasking @minor DSH make sure there aren't any alignment concerns? + void* storage = cuda_internal_scratch_unified( + Kokkos::Cuda(), sizeof(function_type) + sizeof(destroy_type)); + function_type* ptr_ptr = (function_type*)storage; + destroy_type* dtor_ptr = + (destroy_type*)((char*)storage + sizeof(function_type)); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + set_cuda_task_base_apply_function_pointer<TaskType> + <<<1, 1>>>(ptr_ptr, dtor_ptr); + + CUDA_SAFE_CALL(cudaGetLastError()); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + ptr = *ptr_ptr; + dtor = *dtor_ptr; + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class Scheduler> +class TaskQueueSpecializationConstrained< + Scheduler, + typename std::enable_if<std::is_same<typename Scheduler::execution_space, + Kokkos::Cuda>::value>::type> { + public: + using scheduler_type = Scheduler; + using execution_space = Kokkos::Cuda; + using memory_space = Kokkos::CudaUVMSpace; + using member_type = TaskExec<Kokkos::Cuda, Scheduler>; + + enum : long { max_league_size = 16 }; + + KOKKOS_INLINE_FUNCTION + static void iff_single_thread_recursive_execute(scheduler_type const&) {} + + __device__ static void driver(scheduler_type scheduler, + int32_t shmem_per_warp) { + using queue_type = typename scheduler_type::queue_type; + using task_root_type = TaskBase; + + extern __shared__ int32_t shmem_all[]; + + task_root_type* const end = (task_root_type*)task_root_type::EndTag; + task_root_type* const no_more_tasks_sentinel = nullptr; + + int32_t* const warp_shmem = + shmem_all + (threadIdx.z * shmem_per_warp) / sizeof(int32_t); + + task_root_type* const task_shmem = 
(task_root_type*)warp_shmem; + + const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x; + + member_type single_exec(scheduler, warp_shmem, 1); + member_type team_exec(scheduler, warp_shmem, blockDim.y); + + auto& team_queue = team_exec.scheduler().queue(); + + task_root_type* task_ptr = no_more_tasks_sentinel; + + // Loop until all queues are empty and no tasks in flight + + do { + // Each team lead attempts to acquire either a thread team task + // or collection of single thread tasks for the team. + + if (0 == warp_lane) { + if (*((volatile int*)&team_queue.m_ready_count) > 0) { + task_ptr = end; + // Attempt to acquire a task + // Loop by priority and then type + for (int i = 0; i < queue_type::NumQueue && end == task_ptr; ++i) { + for (int j = 0; j < 2 && end == task_ptr; ++j) { + task_ptr = queue_type::pop_ready_task(&team_queue.m_ready[i][j]); + } + } + } else { + // returns nullptr if and only if all other queues have a ready + // count of 0 also. Otherwise, returns a task from another queue + // or `end` if one couldn't be popped + task_ptr = team_queue.attempt_to_steal_task(); +#if 0 + if(task != no_more_tasks_sentinel && task != end) { + std::printf("task stolen on rank %d\n", team_exec.league_rank()); + } +#endif + } + } + + // Synchronize warp with memory fence before broadcasting task pointer: + + // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" ); + KOKKOS_IMPL_CUDA_SYNCWARP; + + // Broadcast task pointer: + + ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32); + ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32); + +#if defined(KOKKOS_ENABLE_DEBUG) + KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr"); +#endif + + if (0 == task_ptr) break; // 0 == queue->m_ready_count + + if (end != task_ptr) { + // Whole warp copy task's closure to/from shared memory. + // Use all threads of warp for coalesced read/write. 
+ + int32_t const b = sizeof(task_root_type) / sizeof(int32_t); + int32_t const e = + *((int32_t volatile*)(&task_ptr->m_alloc_size)) / sizeof(int32_t); + + int32_t volatile* const task_mem = (int32_t volatile*)task_ptr; + + KOKKOS_ASSERT(e * sizeof(int32_t) < shmem_per_warp); + + // copy task closure from global to shared memory: + + for (int32_t i = warp_lane; i < e; i += CudaTraits::WarpSize) { + warp_shmem[i] = task_mem[i]; + } + + // Synchronize threads of the warp and insure memory + // writes are visible to all threads in the warp. + + // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" ); + KOKKOS_IMPL_CUDA_SYNCWARP; + + if (task_root_type::TaskTeam == task_shmem->m_task_type) { + // Thread Team Task + (*task_shmem->m_apply)(task_shmem, &team_exec); + } else if (0 == threadIdx.y) { + // Single Thread Task + (*task_shmem->m_apply)(task_shmem, &single_exec); + } + + // Synchronize threads of the warp and insure memory + // writes are visible to all threads in the warp. + + // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" ); + KOKKOS_IMPL_CUDA_SYNCWARP; + + // copy task closure from shared to global memory: + + for (int32_t i = b + warp_lane; i < e; i += CudaTraits::WarpSize) { + task_mem[i] = warp_shmem[i]; + } + + // Synchronize threads of the warp and insure memory + // writes are visible to root thread of the warp for + // respawn or completion. 
+ + // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" ); + KOKKOS_IMPL_CUDA_SYNCWARP; + + // If respawn requested copy respawn data back to main memory + + if (0 == warp_lane) { + if (((task_root_type*)task_root_type::LockTag) != + task_shmem->m_next) { + ((volatile task_root_type*)task_ptr)->m_next = task_shmem->m_next; + ((volatile task_root_type*)task_ptr)->m_priority = + task_shmem->m_priority; + } + + team_queue.complete(task_ptr); + } + } + } while (1); + } + + static void execute(scheduler_type const& scheduler) { + const int shared_per_warp = 2048; + const int warps_per_block = 4; + const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + // const dim3 grid( 1 , 1 , 1 ); + const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); + const int shared_total = shared_per_warp * warps_per_block; + const cudaStream_t stream = 0; + + auto& queue = scheduler.queue(); + queue.initialize_team_queues(warps_per_block * grid.x); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + // Query the stack size, in bytes: + + size_t previous_stack_size = 0; + CUDA_SAFE_CALL( + cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize)); + + // If not large enough then set the stack size, in bytes: + + const size_t larger_stack_size = 2048; + + if (previous_stack_size < larger_stack_size) { + CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); + } + + cuda_task_queue_execute<<<grid, block, shared_total, stream>>>( + scheduler, shared_per_warp); + + CUDA_SAFE_CALL(cudaGetLastError()); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + if (previous_stack_size < larger_stack_size) { + CUDA_SAFE_CALL( + cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size)); + } + } + + template <typename TaskType> + static void get_function_pointer(typename TaskType::function_type& ptr, + typename TaskType::destroy_type& dtor) { + using function_type = typename TaskType::function_type; + using destroy_type = typename TaskType::destroy_type; + 
+ void* storage = cuda_internal_scratch_unified( + Kokkos::Cuda(), sizeof(function_type) + sizeof(destroy_type)); + function_type* ptr_ptr = (function_type*)storage; + destroy_type* dtor_ptr = + (destroy_type*)((char*)storage + sizeof(function_type)); + + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + set_cuda_task_base_apply_function_pointer<TaskType> + <<<1, 1>>>(ptr_ptr, dtor_ptr); + + CUDA_SAFE_CALL(cudaGetLastError()); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + ptr = *ptr_ptr; + dtor = *dtor_ptr; + } +}; + +extern template class TaskQueue< + Kokkos::Cuda, + default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda>>; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/**\brief Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type + * passed to tasks running in a Cuda space. + * + * Cuda thread blocks for tasking are dimensioned: + * blockDim.x == vector length + * blockDim.y == team size + * blockDim.z == number of teams + * where + * blockDim.x * blockDim.y == WarpSize + * + * Current implementation requires blockDim.x == 1. + * Vector level parallelism with blockDim.y > 1 on Volta will + * require a vector-level synchronization mask for vector-level + * collective operaitons. + * + * Both single thread and thread team tasks are run by a full Cuda warp. + * A single thread task is called by warp lane #0 and the remaining + * lanes of the warp are idle. + * + * When executing a single thread task the syncwarp or other + * warp synchronizing functions must not be called. 
+ */ +template <class Scheduler> +class TaskExec<Kokkos::Cuda, Scheduler> { + private: + enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize }; + + TaskExec(TaskExec&&) = delete; + TaskExec(TaskExec const&) = delete; + TaskExec& operator=(TaskExec&&) = delete; + TaskExec& operator=(TaskExec const&) = delete; + + friend class Kokkos::Impl::TaskQueue< + Kokkos::Cuda, + default_tasking_memory_space_for_execution_space_t<Kokkos::Cuda>>; + template <class, class> + friend class Kokkos::Impl::TaskQueueSpecializationConstrained; + template <class> + friend class Kokkos::Impl::TaskQueueSpecialization; + + int32_t* m_team_shmem; + const int m_team_size; + Scheduler m_scheduler; + + // If constructed with arg_team_size == 1 the object + // can only be used by 0 == threadIdx.y. + KOKKOS_INLINE_FUNCTION + TaskExec(Scheduler const& parent_scheduler, int32_t* arg_team_shmem, + int arg_team_size = blockDim.y) + : m_team_shmem(arg_team_shmem), + m_team_size(arg_team_size), + m_scheduler(parent_scheduler.get_team_scheduler(league_rank())) {} + + public: + using thread_team_member = TaskExec; + +#if defined(__CUDA_ARCH__) + __device__ int team_rank() const { return threadIdx.y; } + __device__ int team_size() const { return m_team_size; } + //__device__ int league_rank() const { return threadIdx.z; } + __device__ int league_rank() const { + return blockIdx.x * blockDim.z + threadIdx.z; + } + __device__ int league_size() const { return blockDim.z * gridDim.x; } + + __device__ void team_barrier() const { + if (1 < m_team_size) { + KOKKOS_IMPL_CUDA_SYNCWARP; + } + } + + template <class ValueType> + __device__ void team_broadcast(ValueType& val, const int thread_id) const { + if (1 < m_team_size) { + // WarpSize = blockDim.X * blockDim.y + // thread_id < blockDim.y + ValueType tmp(val); // input might not be register variable + Impl::in_place_shfl(val, tmp, blockDim.x * thread_id, WarpSize); + } + } + +#else + __host__ int team_rank() const { return 0; } + __host__ int team_size() 
const { return 0; } + __host__ int league_rank() const { return 0; } + __host__ int league_size() const { return 0; } + __host__ void team_barrier() const {} + template <class ValueType> + __host__ void team_broadcast(ValueType&, const int) const {} +#endif + + KOKKOS_INLINE_FUNCTION Scheduler const& scheduler() const noexcept { + return m_scheduler; + } + KOKKOS_INLINE_FUNCTION Scheduler& scheduler() noexcept { return m_scheduler; } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <typename iType, typename Scheduler> +struct TeamThreadRangeBoundariesStruct<iType, + TaskExec<Kokkos::Cuda, Scheduler>> { + using index_type = iType; + using member_type = TaskExec<Kokkos::Cuda, Scheduler>; + + const iType start; + const iType end; + const iType increment; + member_type const& thread; + +#if defined(__CUDA_ARCH__) + + __device__ inline TeamThreadRangeBoundariesStruct( + member_type const& arg_thread, const iType& arg_count) + : start(threadIdx.y), + end(arg_count), + increment(blockDim.y), + thread(arg_thread) {} + + __device__ inline TeamThreadRangeBoundariesStruct( + member_type const& arg_thread, const iType& arg_start, + const iType& arg_end) + : start(arg_start + threadIdx.y), + end(arg_end), + increment(blockDim.y), + thread(arg_thread) {} + +#else + + TeamThreadRangeBoundariesStruct(member_type const& arg_thread, + const iType& arg_count); + + TeamThreadRangeBoundariesStruct(member_type const& arg_thread, + const iType& arg_start, const iType& arg_end); + +#endif +}; + +//---------------------------------------------------------------------------- + +template <typename iType, typename Scheduler> +struct ThreadVectorRangeBoundariesStruct<iType, + TaskExec<Kokkos::Cuda, Scheduler>> { + using index_type = iType; + using member_type = 
TaskExec<Kokkos::Cuda, Scheduler>; + + const index_type start; + const index_type end; + const index_type increment; + const member_type& thread; + +#if defined(__CUDA_ARCH__) + + __device__ inline ThreadVectorRangeBoundariesStruct( + member_type const& arg_thread, const index_type& arg_count) + : start(threadIdx.x), + end(arg_count), + increment(blockDim.x), + thread(arg_thread) {} + + __device__ inline ThreadVectorRangeBoundariesStruct( + member_type const& arg_thread, const index_type& arg_begin, + const index_type& arg_end) + : start(arg_begin + threadIdx.x), + end(arg_end), + increment(blockDim.x), + thread(arg_thread) {} + +#else + + ThreadVectorRangeBoundariesStruct(member_type const& arg_thread, + const index_type& arg_count); + + ThreadVectorRangeBoundariesStruct(member_type const& arg_thread, + const index_type& arg_begin, + const index_type& arg_end); + +#endif +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +// template<typename iType> +// KOKKOS_INLINE_FUNCTION +// Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > +// > TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType +// & count ) +//{ +// return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< +// Kokkos::Cuda > >( thread, count ); +//} +// +// template<typename iType1, typename iType2> +// KOKKOS_INLINE_FUNCTION +// Impl::TeamThreadRangeBoundariesStruct +// < typename std::common_type<iType1,iType2>::type +// , Impl::TaskExec< Kokkos::Cuda > > +// TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread +// , const iType1 & begin, const iType2 & end ) +//{ +// using iType = typename std::common_type< iType1, iType2 >::type; +// return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< +// Kokkos::Cuda > >( +// thread, iType(begin), iType(end) ); +//} +// +// template<typename iType> +// KOKKOS_INLINE_FUNCTION 
+// Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda >
+// > ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
+// , const iType & count )
+//{
+// return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<
+// Kokkos::Cuda > >(thread,count);
+//}
+//
+// template<typename iType>
+// KOKKOS_INLINE_FUNCTION
+// Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda >
+// > ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
+// , const iType & arg_begin
+// , const iType & arg_end )
+//{
+// return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<
+// Kokkos::Cuda > >(thread,arg_begin,arg_end);
+//}
+
+// KOKKOS_INLINE_FUNCTION
+// Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
+// PerTeam(const Impl::TaskExec< Kokkos::Cuda >& thread)
+// {
+// return Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
+// }
+
+// KOKKOS_INLINE_FUNCTION
+// Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
+// PerThread(const Impl::TaskExec< Kokkos::Cuda >& thread)
+// {
+// return Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
+// }
+
+/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ */
+template <typename iType, class Lambda, class Scheduler>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
+    const Lambda& lambda) {
+  // Each team thread begins at its own offset (start) and strides by the
+  // team size (increment), so the union of iterations covers the range.
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i);
+  }
+}
+
+// Vector-level parallel_for: executes lambda(i) over the ThreadVectorRange
+// bounds; iterations are distributed across vector lanes via the
+// start/increment stored in the boundaries struct.
+template <typename iType, class Lambda, class Scheduler>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
+    const Lambda& lambda) {
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i);
+  }
+}
+
+// reduce across corresponding lanes between team members within warp
+// assume stride*team_size == warp_size
+// Tree reduction via Kokkos::shfl_down and the user-provided join: the lane
+// offset halves each step and stops at 'stride', so only corresponding
+// vector lanes of different team members are combined.
+template <typename ValueType, class JoinType>
+KOKKOS_INLINE_FUNCTION void strided_shfl_warp_reduction(const JoinType& join,
+                                                        ValueType& val,
+                                                        int team_size,
+                                                        int stride) {
+  for (int lane_delta = (team_size * stride) >> 1; lane_delta >= stride;
+       lane_delta >>= 1) {
+    join(val, Kokkos::shfl_down(val, lane_delta, team_size * stride));
+  }
+}
+
+// multiple within-warp non-strided reductions
+// Reduces 'val' across the vec_length lanes of a single team member using
+// Kokkos::shfl_down with halving offsets.
+template <typename ValueType, class JoinType>
+KOKKOS_INLINE_FUNCTION void multi_shfl_warp_reduction(const JoinType& join,
+                                                      ValueType& val,
+                                                      int vec_length) {
+  for (int lane_delta = vec_length >> 1; lane_delta; lane_delta >>= 1) {
+    join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
+  }
+}
+
+// broadcast within warp
+// Returns src_lane's copy of 'val'; width == 1 means a single participant,
+// in which case 'val' is returned unchanged (no shuffle issued).
+template <class ValueType>
+KOKKOS_INLINE_FUNCTION ValueType shfl_warp_broadcast(ValueType& val,
+                                                     int src_lane, int width) {
+  if (1 < width) {
+    return Kokkos::shfl(val, src_lane, width);
+  } else {
+    return val;
+  }
+}
+
+/*// all-reduce across corresponding vector lanes between team members within
+warp
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec<
+Kokkos::Cuda > >& loop_boundaries, const Lambda & lambda, const JoinType& join,
+   ValueType& initialized_result) {
+
+  ValueType result = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end;
+i+=loop_boundaries.increment) { lambda(i,result);
+  }
+  initialized_result = result;
+
+  strided_shfl_warp_reduction<ValueType, JoinType>(
+      join,
+      initialized_result,
+      loop_boundaries.thread.team_size(),
+      blockDim.x);
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result,
+threadIdx.x, Impl::CudaTraits::WarpSize );
+}*/
+
+// all-reduce across corresponding vector lanes between team members within warp
+// if no join() provided, use sum
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template <typename iType, class Lambda, typename ValueType, class Scheduler>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
+    const Lambda& lambda, ValueType& initialized_result) {
+  // TODO @internal_documentation what is the point of creating this temporary?
+  ValueType result = initialized_result;
+  // Serial phase: each team thread reduces its strided slice of the range
+  // into a private temporary.
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, result);
+  }
+  initialized_result = result;
+
+  // A single-member team already holds the complete result; skip the
+  // cross-lane combine in that case.
+  if (1 < loop_boundaries.thread.team_size()) {
+    // Sum-combine corresponding vector lanes of the team members, then
+    // broadcast so every lane observes the same final value.
+    strided_shfl_warp_reduction(
+        [&](ValueType& val1, const ValueType& val2) { val1 += val2; },
+        initialized_result, loop_boundaries.thread.team_size(), blockDim.x);
+
+    initialized_result = shfl_warp_broadcast<ValueType>(
+        initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize);
+  }
+}
+
+// Reducer-based variant: same strided serial phase, but init/join come from
+// the reducer and the final value is written through reducer.reference().
+template <typename iType, class Lambda, typename ReducerType, class Scheduler>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
+    const Lambda& lambda, const ReducerType& reducer) {
+  using ValueType = typename ReducerType::value_type;
+  // TODO @internal_documentation what is the point of creating this temporary?
+  ValueType result = ValueType();
+  reducer.init(result);
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  if (1 < loop_boundaries.thread.team_size()) {
+    // Combine with the reducer's join, then broadcast the final value into
+    // reducer.reference() so all lanes observe it.
+    strided_shfl_warp_reduction(
+        [&](ValueType& val1, const ValueType& val2) {
+          reducer.join(val1, val2);
+        },
+        result, loop_boundaries.thread.team_size(), blockDim.x);
+
+    reducer.reference() = shfl_warp_broadcast<ValueType>(
+        result, threadIdx.x, Impl::CudaTraits::WarpSize);
+  } else {
+    reducer.reference() = result;
+  }
+}
+// all-reduce within team members within warp
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+/*template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec<
+Kokkos::Cuda > >&
 loop_boundaries, const Lambda & lambda, const JoinType& join,
+   ValueType& initialized_result) {
+
+  ValueType result = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end;
+i+=loop_boundaries.increment) { lambda(i,result);
+  }
+  initialized_result = result;
+
+  multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result,
+blockDim.x); initialized_result = shfl_warp_broadcast<ValueType>(
+initialized_result, 0, blockDim.x );
+}*/
+
+// all-reduce within team members within warp
+// if no join() provided, use sum
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template <typename iType, class Lambda, typename ValueType, class Scheduler>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
+    const Lambda& lambda, ValueType& initialized_result) {
+  ValueType result = initialized_result;
+
+  // Serial phase: this lane reduces its strided slice of the range.
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  initialized_result = result;
+
+  // NOTE(review): the guard tests team_size() although the combine below is
+  // over the blockDim.x vector lanes; TaskExec documents blockDim.x == 1 as
+  // a current requirement -- confirm this guard is the intended one.
+  if (1 < loop_boundaries.thread.team_size()) {
+    // initialized_result = multi_shfl_warp_reduction(
+    multi_shfl_warp_reduction(
+        [&](ValueType& val1, const ValueType& val2) { val1 += val2; },
+        initialized_result, blockDim.x);
+
+    // Broadcast lane 0's combined value to all vector lanes.
+    initialized_result =
+        shfl_warp_broadcast<ValueType>(initialized_result, 0, blockDim.x);
+  }
+}
+
+// Reducer-based vector-level all-reduce: init/join come from the reducer and
+// the final value is written through reducer.reference().
+template <typename iType, class Lambda, typename ReducerType, class Scheduler>
+KOKKOS_INLINE_FUNCTION void parallel_reduce(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
+    const Lambda& lambda, const ReducerType& reducer) {
+  using ValueType = typename ReducerType::value_type;
+
+  ValueType result = ValueType();
+  reducer.init(result);
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  if (1 < loop_boundaries.thread.team_size()) {
+    multi_shfl_warp_reduction(
+        [&](ValueType& val1, const ValueType& val2) {
+          reducer.join(val1, val2);
+        },
+        result, blockDim.x);
+
+    reducer.reference() = shfl_warp_broadcast<ValueType>(result, 0, blockDim.x);
+  } else {
+    reducer.reference() = result;
+  }
+}
+// scan across corresponding vector lanes between team members within warp
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template <typename iType, class Closure, class Scheduler>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
+    const Closure& closure) {
+  // Extract value_type from closure
+
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+
+  if (1 < loop_boundaries.thread.team_size()) {
+    // make sure all threads perform all loop iterations
+    // (the bound is padded to end + start so every thread runs the same
+    // number of iterations -- required for the warp shuffles below; the
+    // "if (i < loop_boundaries.end)" guards keep extra iterations inert)
+    const iType bound = loop_boundaries.end + loop_boundaries.start;
+    const int lane = threadIdx.y * blockDim.x;
+
+    value_type accum = 0;
+    value_type val, y, local_total;
+
+    for (iType i = loop_boundaries.start; i < bound;
+         i += loop_boundaries.increment) {
+      // First closure call (final == false) computes this iteration's
+      // contribution only.
+      val = 0;
+      if (i < loop_boundaries.end) closure(i, val, false);
+
+      // intra-blockDim.y exclusive scan on 'val'
+      // accum = accumulated, sum in total for this iteration
+
+      // INCLUSIVE scan
+      for (int offset = blockDim.x; offset < Impl::CudaTraits::WarpSize;
+           offset <<= 1) {
+        y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
+        if (lane >= offset) {
+          val += y;
+        }
+      }
+
+      // pass accum to all threads
+      local_total = shfl_warp_broadcast<value_type>(
+          val, threadIdx.x + Impl::CudaTraits::WarpSize - blockDim.x,
+          Impl::CudaTraits::WarpSize);
+
+      // make EXCLUSIVE scan by shifting values over one
+      val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
+      if (threadIdx.y == 0) {
+        val = 0;
+      }
+
+      // Second closure call (final == true) supplies the exclusive prefix.
+      val += accum;
+      if (i < loop_boundaries.end) closure(i, val, true);
+      accum += local_total;
+    }
+  } else {
+    // Single-member team: a plain serial exclusive scan.
+    value_type accum = 0;
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+         i += loop_boundaries.increment) {
+      closure(i, accum, true);
+    }
+  }
+}
+
+// scan within team member (vector) within warp
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template <typename iType, class Closure, class Scheduler>
+KOKKOS_INLINE_FUNCTION void parallel_scan(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::TaskExec<Kokkos::Cuda, Scheduler>>& loop_boundaries,
+    const Closure& closure) {
+  // Extract value_type from closure
+
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+
+  if (1 < loop_boundaries.thread.team_size()) {
+    // make sure all threads perform all loop iterations
+    const iType bound = loop_boundaries.end + loop_boundaries.start;
+
+    value_type accum = 0;
+    value_type val, y, local_total;
+
+    for (iType i = loop_boundaries.start; i < bound;
+         i += loop_boundaries.increment) {
+      val = 0;
+      if (i < loop_boundaries.end) closure(i, val, false);
+
+      // intra-blockDim.x exclusive scan on 'val'
+      // accum = accumulated, sum in total for this iteration
+
+      // INCLUSIVE scan
+      for (int offset = 1; offset < blockDim.x; offset <<= 1) {
+        y = Kokkos::shfl_up(val, offset, blockDim.x);
+        if (threadIdx.x >= offset) {
+          val += y;
+        }
+      }
+
+      // pass accum to all threads
+      local_total =
+          shfl_warp_broadcast<value_type>(val, blockDim.x - 1, blockDim.x);
+
+      // make EXCLUSIVE scan by shifting values over one
+      val = Kokkos::shfl_up(val, 1, blockDim.x);
+      if (threadIdx.x == 0) {
+        val = 0;
+      }
+
+      val += accum;
+      if (i < loop_boundaries.end) closure(i, val, true);
+      accum += local_total;
+    }
+  } else {
+    value_type accum = 0;
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+         i += loop_boundaries.increment) {
+      closure(i, accum, true);
+    }
+  }
+}
+
+} /* namespace Kokkos */
+
+namespace Kokkos {
+
+// Vector-level "single": only vector lane 0 executes the lambda.
+template <class FunctorType, class Scheduler>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>&,
+    const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  if (threadIdx.x == 0) lambda();
+#endif
+}
+
+// Thread-level "single": only lane (0, 0) -- one thread of the team --
+// executes the lambda.
+template <class FunctorType, class Scheduler>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>&,
+    const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  if (threadIdx.x == 0 && threadIdx.y == 0) lambda();
+#endif
+}
+
+// Vector-level "single" with result: lane 0 computes 'val', which is then
+// shuffled to the other vector lanes.
+// NOTE(review): the broadcast is guarded by team_size() but shuffles over
+// blockDim.x (the vector length) -- confirm, given TaskExec documents
+// blockDim.x == 1 as a current requirement.
+template <class FunctorType, class ValueType, class Scheduler>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>& s,
+    const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  if (threadIdx.x == 0) lambda(val);
+  if (1 < s.team_member.team_size()) {
+    val = shfl(val, 0, blockDim.x);
+  }
+#endif
+}
+
+// Thread-level "single" with result: lane (0, 0) computes 'val', which is
+// then team_broadcast to the rest of the team.
+template <class FunctorType, class ValueType, class Scheduler>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::TaskExec<Kokkos::Cuda, Scheduler>>&
+        single_struct,
+    const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  if (threadIdx.x == 0 && threadIdx.y == 0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val, 0);
+#endif
+}
+
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e7806390155d46fd811a21432d9f9d268c457468 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -0,0 +1,1167 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_TEAM_HPP +#define KOKKOS_CUDA_TEAM_HPP + +#include <algorithm> + +#include <Kokkos_Macros.hpp> + +/* only compile this file if CUDA is enabled for Kokkos */ +#if defined(KOKKOS_ENABLE_CUDA) + +#include <utility> +#include <Kokkos_Parallel.hpp> + +#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp> +#include <Cuda/Kokkos_Cuda_ReduceScan.hpp> +#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp> +#include <Kokkos_Vectorization.hpp> + +#include <impl/Kokkos_Tools.hpp> +#include <typeinfo> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <typename Type> +struct CudaJoinFunctor { + using value_type = Type; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + volatile const value_type& input) { + update += input; + } +}; + +/**\brief Team member_type passed to TeamPolicy or TeamTask closures. 
+ * + * Cuda thread blocks for team closures are dimensioned as: + * blockDim.x == number of "vector lanes" per "thread" + * blockDim.y == number of "threads" per team + * blockDim.z == number of teams in a block + * where + * A set of teams exactly fill a warp OR a team is the whole block + * ( 0 == WarpSize % ( blockDim.x * blockDim.y ) ) + * OR + * ( 1 == blockDim.z ) + * + * Thus when 1 < blockDim.z the team is warp-synchronous + * and __syncthreads should not be called in team collectives. + * + * When multiple teams are mapped onto a single block then the + * total available shared memory must be partitioned among teams. + */ +class CudaTeamMember { + public: + using execution_space = Kokkos::Cuda; + using scratch_memory_space = execution_space::scratch_memory_space; + + private: + mutable void* m_team_reduce; + scratch_memory_space m_team_shared; + int m_team_reduce_size; + int m_league_rank; + int m_league_size; + + public: + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_shmem() const { + return m_team_shared.set_team_thread_mode(0, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_scratch( + const int& level) const { + return m_team_shared.set_team_thread_mode(level, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& thread_scratch( + const int& level) const { + return m_team_shared.set_team_thread_mode(level, team_size(), team_rank()); + } + + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; } + KOKKOS_INLINE_FUNCTION int team_rank() const { +#ifdef __CUDA_ARCH__ + return threadIdx.y; +#else + return 0; +#endif + } + + KOKKOS_INLINE_FUNCTION int team_size() const { +#ifdef __CUDA_ARCH__ + return blockDim.y; +#else + return 1; +#endif + } + + KOKKOS_INLINE_FUNCTION void team_barrier() const { +#ifdef __CUDA_ARCH__ + if (1 == blockDim.z) + __syncthreads(); // 
team == block + else + __threadfence_block(); // team <= warp +#endif + } + + //-------------------------------------------------------------------------- + + template <class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& val, + const int& thread_id) const { + (void)val; + (void)thread_id; +#ifdef __CUDA_ARCH__ + if (1 == blockDim.z) { // team == block + __syncthreads(); + // Wait for shared data write until all threads arrive here + if (threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id) { + *((ValueType*)m_team_reduce) = val; + } + __syncthreads(); // Wait for shared data read until root thread writes + val = *((ValueType*)m_team_reduce); + } else { // team <= warp + ValueType tmp(val); // input might not be a register variable + Impl::in_place_shfl(val, tmp, blockDim.x * thread_id, + blockDim.x * blockDim.y); + } +#endif + } + + template <class Closure, class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(Closure const& f, ValueType& val, + const int& thread_id) const { + (void)f; + (void)val; + (void)thread_id; +#ifdef __CUDA_ARCH__ + f(val); + + if (1 == blockDim.z) { // team == block + __syncthreads(); + // Wait for shared data write until all threads arrive here + if (threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id) { + *((ValueType*)m_team_reduce) = val; + } + __syncthreads(); // Wait for shared data read until root thread writes + val = *((ValueType*)m_team_reduce); + } else { // team <= warp + ValueType tmp(val); // input might not be a register variable + Impl::in_place_shfl(val, tmp, blockDim.x * thread_id, + blockDim.x * blockDim.y); + } +#endif + } + + //-------------------------------------------------------------------------- + /**\brief Reduction across a team + * + * Mapping of teams onto blocks: + * blockDim.x is "vector lanes" + * blockDim.y is team "threads" + * blockDim.z is number of teams per block + * + * Requires: + * blockDim.x is power two + * blockDim.x <= CudaTraits::WarpSize + * ( 0 == 
CudaTraits::WarpSize % ( blockDim.x * blockDim.y ) + * OR + * ( 1 == blockDim.z ) + */ + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<is_reducer<ReducerType>::value>::type + team_reduce(ReducerType const& reducer) const noexcept { + team_reduce(reducer, reducer.reference()); + } + + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<is_reducer<ReducerType>::value>::type + team_reduce(ReducerType const& reducer, + typename ReducerType::value_type& value) const noexcept { + (void)reducer; + (void)value; +#ifdef __CUDA_ARCH__ + cuda_intra_block_reduction(reducer, value, blockDim.y); +#endif /* #ifdef __CUDA_ARCH__ */ + } + + //-------------------------------------------------------------------------- + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. 
+ */ + template <typename Type> + KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value, + Type* const global_accum) const { +#ifdef __CUDA_ARCH__ + Type* const base_data = (Type*)m_team_reduce; + + __syncthreads(); // Don't write in to shared data until all threads have + // entered this function + + if (0 == threadIdx.y) { + base_data[0] = 0; + } + + base_data[threadIdx.y + 1] = value; + + Impl::cuda_intra_block_reduce_scan<true, Impl::CudaJoinFunctor<Type>, void>( + Impl::CudaJoinFunctor<Type>(), base_data + 1); + + if (global_accum) { + if (blockDim.y == threadIdx.y + 1) { + base_data[blockDim.y] = + atomic_fetch_add(global_accum, base_data[blockDim.y]); + } + __syncthreads(); // Wait for atomic + base_data[threadIdx.y] += base_data[blockDim.y]; + } + + return base_data[threadIdx.y]; +#else + (void)value; + (void)global_accum; + return Type(); +#endif + } + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. + * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template <typename Type> + KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const { + return this->template team_scan<Type>(value, nullptr); + } + + //---------------------------------------- + + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION static + typename std::enable_if<is_reducer<ReducerType>::value>::type + vector_reduce(ReducerType const& reducer) { + vector_reduce(reducer, reducer.reference()); + } + + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION static + typename std::enable_if<is_reducer<ReducerType>::value>::type + vector_reduce(ReducerType const& reducer, + typename ReducerType::value_type& value) { + (void)reducer; + (void)value; +#ifdef __CUDA_ARCH__ + if (blockDim.x == 1) return; + + // Intra vector lane shuffle reduction: + typename ReducerType::value_type tmp(value); + typename ReducerType::value_type tmp2 = tmp; + + unsigned mask = + blockDim.x == 32 + ? 
0xffffffff + : ((1 << blockDim.x) - 1) + << ((threadIdx.y % (32 / blockDim.x)) * blockDim.x); + + for (int i = blockDim.x; (i >>= 1);) { + Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask); + if ((int)threadIdx.x < i) { + reducer.join(tmp, tmp2); + } + } + + // Broadcast from root lane to all other lanes. + // Cannot use "butterfly" algorithm to avoid the broadcast + // because floating point summation is not associative + // and thus different threads could have different results. + + Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask); + value = tmp2; + reducer.reference() = tmp2; +#endif + } + + //-------------------------------------------------------------------------- + /**\brief Global reduction across all blocks + * + * Return !0 if reducer contains the final value + */ + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION static + typename std::enable_if<is_reducer<ReducerType>::value, int>::type + global_reduce(ReducerType const& reducer, int* const global_scratch_flags, + void* const global_scratch_space, void* const shmem, + int const shmem_size) { +#ifdef __CUDA_ARCH__ + + using value_type = typename ReducerType::value_type; + using pointer_type = value_type volatile*; + + // Number of shared memory entries for the reduction: + const int nsh = shmem_size / sizeof(value_type); + + // Number of CUDA threads in the block, rank within the block + const int nid = blockDim.x * blockDim.y * blockDim.z; + const int tid = + threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); + + // Reduces within block using all available shared memory + // Contributes if it is the root "vector lane" + + // wn == number of warps in the block + // wx == which lane within the warp + // wy == which warp within the block + + const int wn = + (nid + CudaTraits::WarpIndexMask) >> CudaTraits::WarpIndexShift; + const int wx = tid & CudaTraits::WarpIndexMask; + const int wy = tid >> CudaTraits::WarpIndexShift; + + //------------------------ + { // Intra warp 
shuffle reduction from contributing CUDA threads + + value_type tmp(reducer.reference()); + + for (int i = CudaTraits::WarpSize; (int)blockDim.x <= (i >>= 1);) { + Impl::in_place_shfl_down(reducer.reference(), tmp, i, + CudaTraits::WarpSize); + + // Root of each vector lane reduces "thread" contribution + if (0 == threadIdx.x && wx < i) { + reducer.join(&tmp, reducer.data()); + } + } + + // Reduce across warps using shared memory. + // Number of warps may not be power of two. + + __syncthreads(); // Wait before shared data write + + // Number of shared memory entries for the reduction + // is at most one per warp + const int nentry = wn < nsh ? wn : nsh; + + if (0 == wx && wy < nentry) { + // Root thread of warp 'wy' has warp's value to contribute + ((value_type*)shmem)[wy] = tmp; + } + + __syncthreads(); // Wait for write to be visible to block + + // When more warps than shared entries + // then warps must take turns joining their contribution + // to the designated shared memory entry. + for (int i = nentry; i < wn; i += nentry) { + const int k = wy - i; + + if (0 == wx && i <= wy && k < nentry) { + // Root thread of warp 'wy' has warp's value to contribute + reducer.join(((value_type*)shmem) + k, &tmp); + } + + __syncthreads(); // Wait for write to be visible to block + } + + // One warp performs the inter-warp reduction: + + if (0 == wy) { + // Start fan-in at power of two covering nentry + + for (int i = (1 << (32 - __clz(nentry - 1))); (i >>= 1);) { + const int k = wx + i; + if (wx < i && k < nentry) { + reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k); + __threadfence_block(); // Wait for write to be visible to warp + } + } + } + } + //------------------------ + { // Write block's value to global_scratch_memory + + int last_block = 0; + + if (0 == wx) { + reducer.copy(((pointer_type)global_scratch_space) + + blockIdx.x * reducer.length(), + reducer.data()); + + __threadfence(); // Wait until global write is visible. 
+ + last_block = (int)gridDim.x == + 1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1); + + // If last block then reset count + if (last_block) *global_scratch_flags = 0; + } + + last_block = __syncthreads_or(last_block); + + if (!last_block) return 0; + } + //------------------------ + // Last block reads global_scratch_memory into shared memory. + + const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh) + : (gridDim.x < nsh ? gridDim.x : nsh); + + // nentry = min( nid , nsh , gridDim.x ) + + // whole block reads global memory into shared memory: + + if (tid < nentry) { + const int offset = tid * reducer.length(); + + reducer.copy(((pointer_type)shmem) + offset, + ((pointer_type)global_scratch_space) + offset); + + for (int i = nentry + tid; i < (int)gridDim.x; i += nentry) { + reducer.join( + ((pointer_type)shmem) + offset, + ((pointer_type)global_scratch_space) + i * reducer.length()); + } + } + + __syncthreads(); // Wait for writes to be visible to block + + if (0 == wy) { + // Iterate to reduce shared memory to single warp fan-in size + + const int nreduce = + CudaTraits::WarpSize < nentry ? 
CudaTraits::WarpSize : nentry; + + // nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x ) + + if (wx < nreduce && nreduce < nentry) { + for (int i = nreduce + wx; i < nentry; i += nreduce) { + reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i); + } + __threadfence_block(); // Wait for writes to be visible to warp + } + + // Start fan-in at power of two covering nentry + + for (int i = (1 << (32 - __clz(nreduce - 1))); (i >>= 1);) { + const int k = wx + i; + if (wx < i && k < nreduce) { + reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k); + __threadfence_block(); // Wait for writes to be visible to warp + } + } + + if (0 == wx) { + reducer.copy(reducer.data(), (pointer_type)shmem); + return 1; + } + } + return 0; + +#else + (void)reducer; + (void)global_scratch_flags; + (void)global_scratch_space; + (void)shmem; + (void)shmem_size; + return 0; +#endif + } + + //---------------------------------------- + // Private for the driver + + KOKKOS_INLINE_FUNCTION + CudaTeamMember(void* shared, const int shared_begin, const int shared_size, + void* scratch_level_1_ptr, const int scratch_level_1_size, + const int arg_league_rank, const int arg_league_size) + : m_team_reduce(shared), + m_team_shared(((char*)shared) + shared_begin, shared_size, + scratch_level_1_ptr, scratch_level_1_size), + m_team_reduce_size(shared_begin), + m_league_rank(arg_league_rank), + m_league_size(arg_league_size) {} + + public: + // Declare to avoid unused private member warnings which are trigger + // when SFINAE excludes the member function which uses these variables + // Making another class a friend also surpresses these warnings + bool impl_avoid_sfinae_warning() const noexcept { + return m_team_reduce_size > 0 && m_team_reduce != nullptr; + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + 
+namespace Kokkos { +namespace Impl { + +template <typename iType> +struct TeamThreadRangeBoundariesStruct<iType, CudaTeamMember> { + using index_type = iType; + const CudaTeamMember& member; + const iType start; + const iType end; + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct(const CudaTeamMember& thread_, iType count) + : member(thread_), start(0), end(count) {} + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct(const CudaTeamMember& thread_, iType begin_, + iType end_) + : member(thread_), start(begin_), end(end_) {} +}; + +template <typename iType> +struct TeamVectorRangeBoundariesStruct<iType, CudaTeamMember> { + using index_type = iType; + const CudaTeamMember& member; + const iType start; + const iType end; + + KOKKOS_INLINE_FUNCTION + TeamVectorRangeBoundariesStruct(const CudaTeamMember& thread_, + const iType& count) + : member(thread_), start(0), end(count) {} + + KOKKOS_INLINE_FUNCTION + TeamVectorRangeBoundariesStruct(const CudaTeamMember& thread_, + const iType& begin_, const iType& end_) + : member(thread_), start(begin_), end(end_) {} +}; + +template <typename iType> +struct ThreadVectorRangeBoundariesStruct<iType, CudaTeamMember> { + using index_type = iType; + const index_type start; + const index_type end; + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct(const CudaTeamMember, index_type count) + : start(static_cast<index_type>(0)), end(count) {} + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct(index_type count) + : start(static_cast<index_type>(0)), end(count) {} + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct(const CudaTeamMember, index_type arg_begin, + index_type arg_end) + : start(arg_begin), end(arg_end) {} + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct(index_type arg_begin, index_type arg_end) + : start(arg_begin), end(arg_end) {} +}; + +} // namespace Impl + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::TeamThreadRangeBoundariesStruct<iType, 
Impl::CudaTeamMember> + TeamThreadRange(const Impl::CudaTeamMember& thread, iType count) { + return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::CudaTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, Impl::CudaTeamMember> +TeamThreadRange(const Impl::CudaTeamMember& thread, iType1 begin, iType2 end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::CudaTeamMember>( + thread, iType(begin), iType(end)); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::TeamVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember> + TeamVectorRange(const Impl::CudaTeamMember& thread, const iType& count) { + return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, Impl::CudaTeamMember> +TeamVectorRange(const Impl::CudaTeamMember& thread, const iType1& begin, + const iType2& end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>( + thread, iType(begin), iType(end)); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember> + ThreadVectorRange(const Impl::CudaTeamMember& thread, iType count) { + return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, Impl::CudaTeamMember> +ThreadVectorRange(const Impl::CudaTeamMember& thread, iType1 arg_begin, + iType2 arg_end) { + using iType = typename 
std::common_type<iType1, iType2>::type; + return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>( + thread, iType(arg_begin), iType(arg_end)); +} + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct<Impl::CudaTeamMember> PerTeam( + const Impl::CudaTeamMember& thread) { + return Impl::ThreadSingleStruct<Impl::CudaTeamMember>(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct<Impl::CudaTeamMember> PerThread( + const Impl::CudaTeamMember& thread) { + return Impl::VectorSingleStruct<Impl::CudaTeamMember>(thread); +} + +//---------------------------------------------------------------------------- + +/** \brief Inter-thread parallel_for. + * + * Executes closure(iType i) for each i=[0..N). + * + * The range [0..N) is mapped to all threads of the the calling thread team. + */ +template <typename iType, class Closure> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::CudaTeamMember>& + loop_boundaries, + const Closure& closure) { + (void)loop_boundaries; + (void)closure; +#ifdef __CUDA_ARCH__ + for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; + i += blockDim.y) + closure(i); +#endif +} + +//---------------------------------------------------------------------------- + +/** \brief Inter-thread parallel_reduce with a reducer. + * + * Executes closure(iType i, ValueType & val) for each i=[0..N) + * + * The range [0..N) is mapped to all threads of the + * calling thread team and a summation of val is + * performed and put into result. 
+ */ +template <typename iType, class Closure, class ReducerType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type + parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::CudaTeamMember>& loop_boundaries, + const Closure& closure, const ReducerType& reducer) { + (void)loop_boundaries; + (void)closure; + (void)reducer; +#ifdef __CUDA_ARCH__ + typename ReducerType::value_type value; + reducer.init(value); + + for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; + i += blockDim.y) { + closure(i, value); + } + + loop_boundaries.member.team_reduce(reducer, value); + +#endif +} + +/** \brief Inter-thread parallel_reduce assuming summation. + * + * Executes closure(iType i, ValueType & val) for each i=[0..N) + * + * The range [0..N) is mapped to all threads of the + * calling thread team and a summation of val is + * performed and put into result. + */ +template <typename iType, class Closure, typename ValueType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type + parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::CudaTeamMember>& loop_boundaries, + const Closure& closure, ValueType& result) { + (void)loop_boundaries; + (void)closure; + (void)result; +#ifdef __CUDA_ARCH__ + ValueType val; + Kokkos::Sum<ValueType> reducer(val); + + reducer.init(reducer.reference()); + + for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; + i += blockDim.y) { + closure(i, val); + } + + loop_boundaries.member.team_reduce(reducer, val); + result = reducer.reference(); +#endif +} + +template <typename iType, class Closure> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>& + loop_boundaries, + const Closure& closure) { + (void)loop_boundaries; + (void)closure; +#ifdef __CUDA_ARCH__ + for (iType i = loop_boundaries.start + threadIdx.y * 
blockDim.x + threadIdx.x; + i < loop_boundaries.end; i += blockDim.y * blockDim.x) + closure(i); +#endif +} + +template <typename iType, class Closure, class ReducerType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type + parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::CudaTeamMember>& loop_boundaries, + const Closure& closure, const ReducerType& reducer) { + (void)loop_boundaries; + (void)closure; + (void)reducer; +#ifdef __CUDA_ARCH__ + typename ReducerType::value_type value; + reducer.init(value); + + for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; i += blockDim.y * blockDim.x) { + closure(i, value); + } + + loop_boundaries.member.vector_reduce(reducer, value); + loop_boundaries.member.team_reduce(reducer, value); +#endif +} + +template <typename iType, class Closure, typename ValueType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type + parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::CudaTeamMember>& loop_boundaries, + const Closure& closure, ValueType& result) { + (void)loop_boundaries; + (void)closure; + (void)result; +#ifdef __CUDA_ARCH__ + ValueType val; + Kokkos::Sum<ValueType> reducer(val); + + reducer.init(reducer.reference()); + + for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; i += blockDim.y * blockDim.x) { + closure(i, val); + } + + loop_boundaries.member.vector_reduce(reducer); + loop_boundaries.member.team_reduce(reducer); + result = reducer.reference(); +#endif +} + +//---------------------------------------------------------------------------- + +/** \brief Intra-thread vector parallel_for. + * + * Executes closure(iType i) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes of the the calling thread. 
+ */ +template <typename iType, class Closure> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>& + loop_boundaries, + const Closure& closure) { + (void)loop_boundaries; + (void)closure; +#ifdef __CUDA_ARCH__ + for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; + i += blockDim.x) { + closure(i); + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + KOKKOS_IMPL_CUDA_SYNCWARP_MASK( + blockDim.x == 32 ? 0xffffffff + : ((1 << blockDim.x) - 1) + << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); +#else + KOKKOS_IMPL_CUDA_SYNCWARP; +#endif +#endif +} + +//---------------------------------------------------------------------------- + +/** \brief Intra-thread vector parallel_reduce. + * + * Calls closure(iType i, ValueType & val) for each i=[0..N). + * + * The range [0..N) is mapped to all vector lanes of + * the calling thread and a reduction of val is performed using += + * and output into result. + * + * The identity value for the += operator is assumed to be the default + * constructed value. + */ +template <typename iType, class Closure, class ReducerType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<is_reducer<ReducerType>::value>::type + parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::CudaTeamMember> const& loop_boundaries, + Closure const& closure, ReducerType const& reducer) { + (void)loop_boundaries; + (void)closure; + (void)reducer; +#ifdef __CUDA_ARCH__ + + reducer.init(reducer.reference()); + + for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; + i += blockDim.x) { + closure(i, reducer.reference()); + } + + Impl::CudaTeamMember::vector_reduce(reducer); + +#endif +} + +/** \brief Intra-thread vector parallel_reduce. + * + * Calls closure(iType i, ValueType & val) for each i=[0..N). 
+ * + * The range [0..N) is mapped to all vector lanes of + * the calling thread and a reduction of val is performed using += + * and output into result. + * + * The identity value for the += operator is assumed to be the default + * constructed value. + */ +template <typename iType, class Closure, typename ValueType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<!is_reducer<ValueType>::value>::type + parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::CudaTeamMember> const& loop_boundaries, + Closure const& closure, ValueType& result) { + (void)loop_boundaries; + (void)closure; + (void)result; +#ifdef __CUDA_ARCH__ + result = ValueType(); + + for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; + i += blockDim.x) { + closure(i, result); + } + + Impl::CudaTeamMember::vector_reduce(Kokkos::Sum<ValueType>(result)); + +#endif +} + +//---------------------------------------------------------------------------- + +/** \brief Inter-thread parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to each rank in the team (whose global rank is + * less than N) and a scan operation is performed. The last call to closure has + * final == true. 
+ */ +// This is the same code as in HIP and largely the same as in OpenMPTarget +template <typename iType, typename FunctorType> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::CudaTeamMember>& + loop_bounds, + const FunctorType& lambda) { + // Extract value_type from lambda + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, + FunctorType>::value_type; + + const auto start = loop_bounds.start; + const auto end = loop_bounds.end; + auto& member = loop_bounds.member; + const auto team_size = member.team_size(); + const auto team_rank = member.team_rank(); + const auto nchunk = (end - start + team_size - 1) / team_size; + value_type accum = 0; + // each team has to process one or more chunks of the prefix scan + for (iType i = 0; i < nchunk; ++i) { + auto ii = start + i * team_size + team_rank; + // local accumulation for this chunk + value_type local_accum = 0; + // user updates value with prefix value + if (ii < loop_bounds.end) lambda(ii, local_accum, false); + // perform team scan + local_accum = member.team_scan(local_accum); + // add this blocks accum to total accumulation + auto val = accum + local_accum; + // user updates their data with total accumulation + if (ii < loop_bounds.end) lambda(ii, val, true); + // the last value needs to be propogated to next chunk + if (team_rank == team_size - 1) accum = val; + // broadcast last value to rest of the team + member.team_broadcast(accum, team_size - 1); + } +} + +//---------------------------------------------------------------------------- + +/** \brief Intra-thread vector parallel scan with reducer. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. 
 */
// Intra-thread vector-parallel exclusive scan with a user-supplied reducer.
// The reducer's join() is the (associative) scan operator and its init()
// supplies the identity element.
template <typename iType, class Closure, typename ReducerType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
                      iType, Impl::CudaTeamMember>& loop_boundaries,
                  const Closure& closure, const ReducerType& reducer) {
  // Silence unused-parameter warnings in the host compilation pass,
  // where the body below is compiled away.
  (void)loop_boundaries;
  (void)closure;
  (void)reducer;
#ifdef __CUDA_ARCH__

  using value_type = typename ReducerType::value_type;
  value_type accum;
  reducer.init(accum);
  // Carry-in for each vector-length chunk; starts at the identity.
  const value_type identity = accum;

  // Loop through boundaries by vector-length chunks;
  // must scan at each iteration.

  // All thread "lanes" must loop the same number of times.
  // Determine a loop end for all thread "lanes."
  // Requires:
  //   blockDim.x is power of two and thus
  //     ( end % blockDim.x ) == ( end & ( blockDim.x - 1 ) )
  //   1 <= blockDim.x <= CudaTraits::WarpSize

  const int mask = blockDim.x - 1;
  // Warp-lane mask covering exactly the lanes of this "vector"
  // (the threadIdx.y row within the warp).
  const unsigned active_mask =
      blockDim.x == 32 ? 0xffffffff
                       : ((1 << blockDim.x) - 1)
                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x;
  const int rem = loop_boundaries.end & mask;  // == end % blockDim.x
  const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0);

  for (int i = threadIdx.x; i < end; i += blockDim.x) {
    value_type val = identity;

    // First acquire per-lane contributions.
    // This sets i's val to i-1's contribution
    // to make the latter in_place_shfl_up an
    // exclusive scan -- the final accumulation
    // of i's val will be included in the second
    // closure call later.
    if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false);

    // Bottom up exclusive scan in triangular pattern
    // where each CUDA thread is the root of a reduction tree
    // from the zeroth "lane" to itself.
    //  [t] += [t-1] if t >= 1
    //  [t] += [t-2] if t >= 2
    //  [t] += [t-4] if t >= 4
    //  ...
    // This differs from the non-reducer overload, where an inclusive scan was
    // implemented, because in general the binary operator cannot be inverted
    // and we would not be able to remove the inclusive contribution by
    // inversion.
    for (int j = 1; j < (int)blockDim.x; j <<= 1) {
      value_type tmp = identity;
      Impl::in_place_shfl_up(tmp, val, j, blockDim.x, active_mask);
      if (j <= (int)threadIdx.x) {
        reducer.join(val, tmp);
      }
    }

    // Include accumulation from earlier chunks.
    reducer.join(val, accum);

    // Update i's contribution into the val
    // and add it to accum for next round.
    if (i < loop_boundaries.end) closure(i, val, true);
    // Broadcast the last lane's inclusive total (lane index == mask) to all
    // lanes: it becomes the carry-in for the next chunk.
    Impl::in_place_shfl(accum, val, mask, blockDim.x, active_mask);
  }

#endif
}

//----------------------------------------------------------------------------

/** \brief  Intra-thread vector parallel exclusive prefix sum.
 *
 *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
 *
 *  The range [0..N) is mapped to all vector lanes in the
 *  thread and a scan operation is performed.
 *  The last call to closure has final == true.
 */
template <typename iType, class Closure>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::CudaTeamMember>&
        loop_boundaries,
    const Closure& closure) {
  // Deduce the scan value type from the closure, then delegate to the
  // reducer overload using Sum, i.e. the ordinary prefix sum.
  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
  value_type dummy;
  parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>(dummy));
}

}  // namespace Kokkos

namespace Kokkos {

// Execute lambda on exactly one vector lane (lane 0) of the calling thread,
// then synchronize the lanes of this vector.
template <class FunctorType>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::VectorSingleStruct<Impl::CudaTeamMember>&,
    const FunctorType& lambda) {
  (void)lambda;
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0) lambda();
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
  // Sync only the lanes that belong to this vector (threadIdx.y row).
  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
      blockDim.x == 32 ? 0xffffffff
                       : ((1 << blockDim.x) - 1)
                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
#else
  KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
#endif
}

// Execute lambda on exactly one thread of the team (thread (0,0)),
// then synchronize the warp.
template <class FunctorType>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::ThreadSingleStruct<Impl::CudaTeamMember>&,
    const FunctorType& lambda) {
  (void)lambda;
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0 && threadIdx.y == 0) lambda();
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
      blockDim.x == 32 ? 0xffffffff
                       : ((1 << blockDim.x) - 1)
                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
#else
  KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
#endif
}

// Single-lane execution that also broadcasts the produced value to all
// lanes of the vector.
template <class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::VectorSingleStruct<Impl::CudaTeamMember>&,
    const FunctorType& lambda, ValueType& val) {
  (void)lambda;
  (void)val;
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0) lambda(val);
  unsigned mask = blockDim.x == 32
                      ? 0xffffffff
                      : ((1 << blockDim.x) - 1)
                            << ((threadIdx.y % (32 / blockDim.x)) * blockDim.x);
  // Broadcast lane 0's result to every lane in the vector.
  Impl::in_place_shfl(val, val, 0, blockDim.x, mask);
#endif
}

// Single-thread execution that broadcasts the produced value to the
// whole team.
template <class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION void single(
    const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& single_struct,
    const FunctorType& lambda, ValueType& val) {
  (void)single_struct;
  (void)lambda;
  (void)val;
#ifdef __CUDA_ARCH__
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    lambda(val);
  }
  // team_broadcast handles inter-warp communication for the full team.
  single_struct.team_member.team_broadcast(val, 0);
#endif
}

}  // namespace Kokkos

#endif /* defined(KOKKOS_ENABLE_CUDA) */

#endif /* #ifndef KOKKOS_CUDA_TEAM_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f846c06ce573fcd13f797bbaaa9375af4ce8ad33
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
@@ -0,0 +1,133 @@
/*
//@HEADER
//
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP
#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP

#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA

#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_UniqueToken.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>
#include <impl/Kokkos_ConcurrentBitset.hpp>

namespace Kokkos {
namespace Experimental {

// both global and instance Unique Tokens are implemented in the same way
template <>
class UniqueToken<Cuda, UniqueTokenScope::Global> {
 protected:
  // Concurrent bitset marking which token values are currently acquired.
  uint32_t volatile* m_buffer;
  // Number of distinct token values; exclusive upper bound for acquire().
  uint32_t m_count;

 public:
  using execution_space = Cuda;
  using size_type       = int32_t;

  // Defined out of line (in the corresponding .cpp); attaches to the
  // global token buffer.
  explicit UniqueToken(execution_space const& = execution_space());

  KOKKOS_DEFAULTED_FUNCTION
  UniqueToken(const UniqueToken&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  UniqueToken(UniqueToken&&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  UniqueToken& operator=(const UniqueToken&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  UniqueToken& operator=(UniqueToken&&) = default;

  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
  KOKKOS_INLINE_FUNCTION
  size_type size() const noexcept { return m_count; }

  /// \brief acquire value such that 0 <= value < size()
  ///
  /// Aborts when every token is already held by some caller.
  KOKKOS_INLINE_FUNCTION
  size_type acquire() const {
    // Start probing at a pseudo-random bit (clock_tic) to reduce contention
    // between concurrently acquiring threads.
    const Kokkos::pair<int, int> result =
        Kokkos::Impl::concurrent_bitset::acquire_bounded(
            m_buffer, m_count, Kokkos::Impl::clock_tic() % m_count);

    if (result.first < 0) {
      Kokkos::abort(
          "UniqueToken<Cuda> failure to acquire tokens, no tokens available");
    }

    return result.first;
  }

  /// \brief release an acquired value
  KOKKOS_INLINE_FUNCTION
  void release(size_type i) const noexcept {
    Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
  }
};

// Instance-scope specialization: owns its own bitset storage via a View
// instead of sharing the global buffer.
template <>
class UniqueToken<Cuda, UniqueTokenScope::Instance>
    : public UniqueToken<Cuda, UniqueTokenScope::Global> {
 private:
  // Owning storage; the inherited m_buffer points into this allocation
  // when the sized constructor is used.
  Kokkos::View<uint32_t*, ::Kokkos::CudaSpace> m_buffer_view;

 public:
  explicit UniqueToken(execution_space const& arg = execution_space())
      : UniqueToken<Cuda, UniqueTokenScope::Global>(arg) {}

  // Build a private token pool of max_size tokens.
  UniqueToken(size_type max_size, execution_space const& = execution_space())
      : m_buffer_view(
            "UniqueToken::m_buffer_view",
            ::Kokkos::Impl::concurrent_bitset::buffer_bound(max_size)) {
    m_buffer = m_buffer_view.data();
    m_count  = max_size;
  }
};

}  // namespace Experimental
}  // namespace Kokkos

#endif  // KOKKOS_ENABLE_CUDA
#endif  // KOKKOS_CUDA_UNIQUE_TOKEN_HPP
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f7b7b6e78adc3de9d5ae446565eedc7d00439f5
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@@ -0,0 +1,232 @@
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_CUDA_VECTORIZATION_HPP +#define KOKKOS_CUDA_VECTORIZATION_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA + +#include <type_traits> +#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp> + +namespace Kokkos { + +namespace Impl { + +// Include all lanes +constexpr unsigned shfl_all_mask = 0xffffffffu; + +//---------------------------------------------------------------------------- +// Shuffle operations require input to be a register (stack) variable + +// Derived implements do_shfl_op(unsigned mask, T& in, int lane, int width), +// which turns in to one of KOKKOS_IMPL_CUDA_SHFL(_UP_|_DOWN_|_)MASK +// Since the logic with respect to value sizes, etc., is the same everywhere, +// put it all in one place. +template <class Derived> +struct in_place_shfl_op { + // CRTP boilerplate + __device__ KOKKOS_IMPL_FORCEINLINE const Derived& self() const noexcept { + return *static_cast<Derived const*>(this); + } + + // sizeof(Scalar) <= sizeof(int) case + template <class Scalar> + // requires _assignable_from_bits<Scalar> + __device__ inline typename std::enable_if<sizeof(Scalar) <= sizeof(int)>::type + operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width, + unsigned mask = shfl_all_mask) const noexcept { + using shfl_type = int; + union conv_type { + Scalar orig; + shfl_type conv; + // This should be fine, members get explicitly reset, which changes the + // active member + KOKKOS_FUNCTION conv_type() { conv = 0; } + }; + conv_type tmp_in; + tmp_in.orig = in; + shfl_type tmp_out; + tmp_out = reinterpret_cast<shfl_type&>(tmp_in.orig); + conv_type res; + //------------------------------------------------ + res.conv = self().do_shfl_op(mask, tmp_out, lane_or_delta, width); + //------------------------------------------------ + out = reinterpret_cast<Scalar&>(res.conv); + } + +// TODO: figure out why 64-bit 
shfl fails in Clang +#if !defined(KOKKOS_COMPILER_CLANG) + // sizeof(Scalar) == sizeof(double) case + // requires _assignable_from_bits<Scalar> + template <class Scalar> + __device__ inline + typename std::enable_if<sizeof(Scalar) == sizeof(double)>::type + operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width, + unsigned mask = shfl_all_mask) const noexcept { + //------------------------------------------------ + reinterpret_cast<double&>(out) = self().do_shfl_op( + mask, *reinterpret_cast<double const*>(&in), lane_or_delta, width); + //------------------------------------------------ + } +#else + // sizeof(Scalar) == sizeof(double) case + // requires _assignable_from_bits<Scalar> + template <typename Scalar> + __device__ inline + typename std::enable_if<sizeof(Scalar) == sizeof(double)>::type + operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width, + unsigned mask = shfl_all_mask) const noexcept { + //------------------------------------------------ + int lo = __double2loint(*reinterpret_cast<const double*>(&val)); + int hi = __double2hiint(*reinterpret_cast<const double*>(&val)); + lo = self().do_shfl_op(mask, lo, lane_or_delta, width); + hi = self().do_shfl_op(mask, hi, lane_or_delta, width); + auto tmp = __hiloint2double(hi, lo); + out = reinterpret_cast<Scalar&>(tmp); + //------------------------------------------------ + } +#endif + + // sizeof(Scalar) > sizeof(double) case + template <typename Scalar> + __device__ inline + typename std::enable_if<(sizeof(Scalar) > sizeof(double))>::type + operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width, + unsigned mask = shfl_all_mask) const noexcept { + // TODO DSH shouldn't this be KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF instead of + // sizeof(int)? 
(Need benchmarks to decide which is faster) + using shuffle_as_t = int; + enum : int { N = sizeof(Scalar) / sizeof(shuffle_as_t) }; + + for (int i = 0; i < N; ++i) { + reinterpret_cast<shuffle_as_t*>(&out)[i] = self().do_shfl_op( + mask, reinterpret_cast<shuffle_as_t const*>(&val)[i], lane_or_delta, + width); + } + } +}; + +struct in_place_shfl_fn : in_place_shfl_op<in_place_shfl_fn> { + template <class T> + __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, + int lane, int width) const + noexcept { + (void)mask; + (void)val; + (void)lane; + (void)width; + return KOKKOS_IMPL_CUDA_SHFL_MASK(mask, val, lane, width); + } +}; +template <class... Args> +__device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { + in_place_shfl_fn{}((Args &&) args...); +} + +struct in_place_shfl_up_fn : in_place_shfl_op<in_place_shfl_up_fn> { + template <class T> + __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, + int lane, int width) const + noexcept { + return KOKKOS_IMPL_CUDA_SHFL_UP_MASK(mask, val, lane, width); + } +}; +template <class... Args> +__device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( + Args&&... args) noexcept { + in_place_shfl_up_fn{}((Args &&) args...); +} + +struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> { + template <class T> + __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, + int lane, int width) const + noexcept { + (void)mask; + (void)val; + (void)lane; + (void)width; + return KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(mask, val, lane, width); + } +}; +template <class... Args> +__device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( + Args&&... 
args) noexcept { + in_place_shfl_down_fn{}((Args &&) args...); +} + +} // namespace Impl + +template <class T> +// requires default_constructible<T> && _assignable_from_bits<T> +__device__ inline T shfl(const T& val, const int& srcLane, const int& width, + unsigned mask = Impl::shfl_all_mask) { + T rv = {}; + Impl::in_place_shfl(rv, val, srcLane, width, mask); + return rv; +} + +template <class T> +// requires default_constructible<T> && _assignable_from_bits<T> +__device__ inline T shfl_down(const T& val, int delta, int width, + unsigned mask = Impl::shfl_all_mask) { + T rv = {}; + Impl::in_place_shfl_down(rv, val, delta, width, mask); + return rv; +} + +template <class T> +// requires default_constructible<T> && _assignable_from_bits<T> +__device__ inline T shfl_up(const T& val, int delta, int width, + unsigned mask = Impl::shfl_all_mask) { + T rv = {}; + Impl::in_place_shfl_up(rv, val, delta, width, mask); + return rv; +} + +} // end namespace Kokkos + +#endif // defined( KOKKOS_ENABLE_CUDA ) +#endif // !defined( KOKKOS_CUDA_VECTORIZATION_HPP ) diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0cdd84ce27157e118065c6fbcf2da71a875b81e0 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp @@ -0,0 +1,49 @@ +#include <Kokkos_Macros.hpp> + +#if defined(__CUDA_ARCH__) +#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask() +#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff) +#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m) +#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(), x) +#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) __ballot_sync(m, x) +#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) __shfl_sync(0xffffffff, x, y, z) +#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) __shfl_sync(m, x, y, z) +#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) 
__shfl_up_sync(0xffffffff, x, y, z) +#define KOKKOS_IMPL_CUDA_SHFL_UP_MASK(m, x, y, z) __shfl_up_sync(m, x, y, z) +#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) \ + __shfl_down_sync(0xffffffff, x, y, z) +#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) __shfl_down_sync(m, x, y, z) +#else +#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0 +#define KOKKOS_IMPL_CUDA_SYNCWARP +#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) (void)m +#define KOKKOS_IMPL_CUDA_BALLOT(x) 0 +#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) 0 +#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) 0 +#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) 0 +#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) 0 +#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) 0 +#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) 0 +#endif + +#if !defined(KOKKOS_COMPILER_CLANG) +#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long) +#else +#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int) +#endif + +#if defined(__CUDA_ARCH__) +#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ + { \ + __syncwarp(); \ + const unsigned b = __activemask(); \ + if (b != 0xffffffff) { \ + printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG, \ + blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \ + threadIdx.z, b); \ + return; \ + } \ + } +#else +#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) +#endif diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c55956ede9665bc3005fa570d7ac120404a54d49 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -0,0 +1,291 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP +#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_CUDA) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects +// (int,int2,int4) Via reinterpret_case this can be used to support all scalar +// types of those sizes. Any other scalar type falls back to either normal reads +// out of global memory, or using the __ldg intrinsic on Kepler GPUs or newer +// (Compute Capability >= 3.0) + +template <typename ValueType, typename AliasType> +struct CudaTextureFetch { + ::cudaTextureObject_t m_obj; + const ValueType* m_ptr; + int m_offset; + + // Deference operator pulls through texture object and returns by value + template <typename iType> + KOKKOS_INLINE_FUNCTION ValueType operator[](const iType& i) const { +#if defined(__CUDA_ARCH__) && (300 <= __CUDA_ARCH__) + AliasType v = tex1Dfetch<AliasType>(m_obj, i + m_offset); + return *(reinterpret_cast<ValueType*>(&v)); +#else + return m_ptr[i]; +#endif + } + + // Pointer to referenced memory + KOKKOS_INLINE_FUNCTION + operator const ValueType*() const { return m_ptr; } + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch() : m_obj(), m_ptr(), m_offset() {} + + KOKKOS_DEFAULTED_FUNCTION + ~CudaTextureFetch() = default; + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch(const CudaTextureFetch& rhs) + : m_obj(rhs.m_obj), m_ptr(rhs.m_ptr), m_offset(rhs.m_offset) {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch(CudaTextureFetch&& rhs) + : m_obj(rhs.m_obj), m_ptr(rhs.m_ptr), m_offset(rhs.m_offset) {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch& operator=(const CudaTextureFetch& rhs) { + m_obj = rhs.m_obj; + 
m_ptr = rhs.m_ptr; + m_offset = rhs.m_offset; + return *this; + } + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch& operator=(CudaTextureFetch&& rhs) { + m_obj = rhs.m_obj; + m_ptr = rhs.m_ptr; + m_offset = rhs.m_offset; + return *this; + } + + // Texture object spans the entire allocation. + // This handle may view a subset of the allocation, so an offset is required. + template <class CudaMemorySpace> + inline explicit CudaTextureFetch( + const ValueType* const arg_ptr, + Kokkos::Impl::SharedAllocationRecord<CudaMemorySpace, void>* record) + : m_obj(record->template attach_texture_object<AliasType>()), + m_ptr(arg_ptr), + m_offset(record->attach_texture_object_offset( + reinterpret_cast<const AliasType*>(arg_ptr))) {} + + // Texture object spans the entire allocation. + // This handle may view a subset of the allocation, so an offset is required. + KOKKOS_INLINE_FUNCTION + CudaTextureFetch(const CudaTextureFetch& rhs, size_t offset) + : m_obj(rhs.m_obj), + m_ptr(rhs.m_ptr + offset), + m_offset(offset + rhs.m_offset) {} +}; + +#if defined(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC) + +template <typename ValueType, typename AliasType> +struct CudaLDGFetch { + const ValueType* m_ptr; + + template <typename iType> + KOKKOS_INLINE_FUNCTION ValueType operator[](const iType& i) const { +#if defined(__CUDA_ARCH__) && (350 <= _CUDA_ARCH__) + AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i])); + return *(reinterpret_cast<ValueType*>(&v)); +#else + return m_ptr[i]; +#endif + } + + KOKKOS_INLINE_FUNCTION + operator const ValueType*() const { return m_ptr; } + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch() : m_ptr() {} + + KOKKOS_DEFAULTED_FUNCTION + ~CudaLDGFetch() = default; + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch(const CudaLDGFetch& rhs) : m_ptr(rhs.m_ptr) {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch(CudaLDGFetch&& rhs) : m_ptr(rhs.m_ptr) {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch& operator=(const CudaLDGFetch& rhs) { + m_ptr = rhs.m_ptr; + return *this; + } + + 
KOKKOS_INLINE_FUNCTION + CudaLDGFetch& operator=(CudaLDGFetch&& rhs) { + m_ptr = rhs.m_ptr; + return *this; + } + + template <class CudaMemorySpace> + inline explicit CudaLDGFetch( + const ValueType* const arg_ptr, + Kokkos::Impl::SharedAllocationRecord<CudaMemorySpace, void>*) + : m_ptr(arg_ptr) {} + + KOKKOS_INLINE_FUNCTION + CudaLDGFetch(CudaLDGFetch const rhs, size_t offset) + : m_ptr(rhs.m_ptr + offset) {} +}; + +#endif + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Replace Default ViewDataHandle with Cuda texture fetch + * specialization if 'const' value type, CudaSpace and random access. + */ +template <class Traits> +class ViewDataHandle< + Traits, typename std::enable_if<( + // Is Cuda memory space + (std::is_same<typename Traits::memory_space, + Kokkos::CudaSpace>::value || + std::is_same<typename Traits::memory_space, + Kokkos::CudaUVMSpace>::value) && + // Is a trivial const value of 4, 8, or 16 bytes + std::is_trivial<typename Traits::const_value_type>::value && + std::is_same<typename Traits::const_value_type, + typename Traits::value_type>::value && + (sizeof(typename Traits::const_value_type) == 4 || + sizeof(typename Traits::const_value_type) == 8 || + sizeof(typename Traits::const_value_type) == 16) && + // Random access trait + (Traits::memory_traits::is_random_access != 0))>::type> { + public: + using track_type = Kokkos::Impl::SharedAllocationTracker; + + using value_type = typename Traits::const_value_type; + using return_type = typename Traits::const_value_type; // NOT a reference + + using alias_type = typename std::conditional< + (sizeof(value_type) == 4), int, + typename std::conditional< + (sizeof(value_type) == 8), ::int2, + typename std::conditional<(sizeof(value_type) == 16), ::int4, + void>::type>::type>::type; + +#if 
defined(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC) + using handle_type = Kokkos::Impl::CudaLDGFetch<value_type, alias_type>; +#else + using handle_type = Kokkos::Impl::CudaTextureFetch<value_type, alias_type>; +#endif + + KOKKOS_INLINE_FUNCTION + static handle_type const& assign(handle_type const& arg_handle, + track_type const& /* arg_tracker */) { + return arg_handle; + } + + KOKKOS_INLINE_FUNCTION + static handle_type const assign(handle_type const& arg_handle, + size_t offset) { + return handle_type(arg_handle, offset); + } + + KOKKOS_INLINE_FUNCTION + static handle_type assign(value_type* arg_data_ptr, + track_type const& arg_tracker) { + if (arg_data_ptr == nullptr) return handle_type(); + +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + // Assignment of texture = non-texture requires creation of a texture object + // which can only occur on the host. In addition, 'get_record' is only + // valid if called in a host execution space + + using memory_space = typename Traits::memory_space; + using record = typename Impl::SharedAllocationRecord<memory_space, void>; + + record* const r = arg_tracker.template get_record<memory_space>(); + +#if !defined(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC) + if (0 == r) { + Kokkos::abort( + "Cuda const random access View using Cuda texture memory requires " + "Kokkos to allocate the View's memory"); + } +#endif + + return handle_type(arg_data_ptr, r); + +#else + (void)arg_tracker; + Kokkos::Impl::cuda_abort( + "Cannot create Cuda texture object from within a Cuda kernel"); + return handle_type(); +#endif + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ +#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */ diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp 
b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fc52e415145218afa2c495e9f055e051e9921305 --- /dev/null +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -0,0 +1,116 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
#define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP

#include <Kokkos_Cuda.hpp>
#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>

namespace Kokkos {
namespace Impl {

// Cuda backend of parallel_for over a WorkGraphPolicy: threads repeatedly
// pop ready work items from the policy's queue until it is drained.
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
                  Kokkos::Cuda> {
 public:
  using Policy = Kokkos::WorkGraphPolicy<Traits...>;
  using Self   = ParallelFor<FunctorType, Policy, Kokkos::Cuda>;

 private:
  Policy m_policy;
  FunctorType m_functor;

  // Invoke the user functor for work item w (untagged functor).
  template <class TagType>
  __device__ inline
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_one(const std::int32_t w) const noexcept {
    m_functor(w);
  }

  // Invoke the user functor for work item w (tagged functor).
  template <class TagType>
  __device__ inline
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_one(const std::int32_t w) const noexcept {
    const TagType t{};
    m_functor(t, w);
  }

 public:
  Policy const& get_policy() const { return m_policy; }

  __device__ inline void operator()() const noexcept {
    // Only threads with threadIdx.y % 16 == 0 pop and execute work items.
    // NOTE(review): presumably this throttles contention on the shared work
    // queue -- confirm against the pop_work/completed_work implementation.
    if (0 == (threadIdx.y % 16)) {
      // Spin until COMPLETED_TOKEN.
      // END_TOKEN indicates no work is currently available.

      for (std::int32_t w = Policy::END_TOKEN;
           Policy::COMPLETED_TOKEN != (w = m_policy.pop_work());) {
        if (Policy::END_TOKEN != w) {
          exec_one<typename Policy::work_tag>(w);
          // Mark w done so dependent work items may become ready.
          m_policy.completed_work(w);
        }
      }
    }
  }

  inline void execute() {
    const int warps_per_block = 4;
    // One block per multiprocessor.
    const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1);
    // Block shape: 1 lane in x, one warp in y, warps_per_block in z.
    const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block);
    const int shared = 0;

    Kokkos::Impl::CudaParallelLaunch<Self>(
        *this, grid, block, shared, Cuda().impl_internal_space_instance(),
        false);
  }

  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_policy(arg_policy), m_functor(arg_functor) {}
};

}  // namespace Impl
}  // namespace Kokkos

#endif /* #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c0daa274f82a62c17dd40515622b7d9c0092d6ef
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
@@ -0,0 +1,101 @@
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2.
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_ABORT_HPP +#define KOKKOS_CUDA_ABORT_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_CUDA) + +#include <cuda.h> + +extern "C" { +/* Cuda runtime function, declared in <crt/device_runtime.h> + * Requires capability 2.x or better. 
+ */ +extern __device__ void __assertfail(const void *message, const void *file, + unsigned int line, const void *function, + size_t charsize); +} + +namespace Kokkos { +namespace Impl { + +#if !defined(__APPLE__) +// required to workaround failures in random number generator unit tests with +// pre-volta architectures +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) +__device__ inline void cuda_abort(const char *const message) { +#else +[[noreturn]] __device__ inline void cuda_abort(const char *const message) { +#endif + const char empty[] = ""; + + __assertfail((const void *)message, (const void *)empty, (unsigned int)0, + (const void *)empty, sizeof(char)); + + // This loop is never executed. It's intended to suppress warnings that the + // function returns, even though it does not. This is necessary because + // __assertfail is not marked as [[noreturn]], even though it does not return. + // Disable with KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK to workaround failures + // in random number generator unit tests with pre-volta architectures +#if !defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + while (true) + ; +#endif +} +#else +__device__ inline void cuda_abort(const char *const message) { + // __assertfail is not supported on MAC +} +#endif + +} // namespace Impl +} // namespace Kokkos +#else +void KOKKOS_CORE_SRC_CUDA_ABORT_PREVENT_LINK_ERROR() {} +#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ +#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */ diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Abort.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Abort.hpp new file mode 100644 index 0000000000000000000000000000000000000000..98b457d8cf52ddcd69ededd5ba3cc75d09509b49 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Abort.hpp @@ -0,0 +1,77 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_ABORT_HPP +#define KOKKOS_HIP_ABORT_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_HIP) + +#include <hip/hip_runtime.h> + +namespace Kokkos { +namespace Impl { + +[[noreturn]] __device__ __attribute__((noinline)) void hip_abort( + char const *msg) { +#ifdef NDEBUG + (void)msg; +#else + // disable printf on release builds, as it has a non-trivial performance + // impact + printf("Aborting with message `%s'.\n", msg); +#endif + abort(); + // This loop is never executed. It's intended to suppress warnings that the + // function returns, even though it does not. This is necessary because + // abort() is not marked as [[noreturn]], even though it does not return. + while (true) + ; +} + +} // namespace Impl +} // namespace Kokkos + +#endif +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp new file mode 100644 index 0000000000000000000000000000000000000000..263ba97d735705c9c02c67938e0a2aa3bf215654 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Atomic.hpp @@ -0,0 +1,623 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_ATOMIC_HPP +#define KOKKOS_HIP_ATOMIC_HPP + +#include <impl/Kokkos_Atomic_Memory_Order.hpp> +#include <impl/Kokkos_Memory_Fence.hpp> +#include <HIP/Kokkos_HIP_Locks.hpp> + +#if defined(KOKKOS_ENABLE_HIP_ATOMICS) +namespace Kokkos { +// HIP can do: +// Types int/unsigned int +// variants: +// atomic_exchange/compare_exchange/fetch_add/fetch_sub/fetch_max/fetch_min/fetch_and/fetch_or/fetch_xor/fetch_inc/fetch_dec + +// atomic_exchange ------------------------------------------------------------- + +__inline__ __device__ int atomic_exchange(volatile int *const dest, + const int val) { + return atomicExch(const_cast<int *>(dest), val); +} + +__inline__ __device__ unsigned int atomic_exchange( + volatile unsigned int *const dest, const unsigned int val) { + return atomicExch(const_cast<unsigned int *>(dest), val); +} + +__inline__ __device__ unsigned long long int atomic_exchange( + volatile unsigned long long int *const dest, + const unsigned long long int val) { + return atomicExch(const_cast<unsigned long long *>(dest), val); +} + +__inline__ __device__ float atomic_exchange(volatile float *const dest, + const float val) { + return atomicExch(const_cast<float *>(dest), val); +} + +template <typename T> +__inline__ __device__ T atomic_exchange( + volatile T *const dest, + typename std::enable_if<sizeof(T) == sizeof(int), const T &>::type val) { + int tmp = atomicExch(reinterpret_cast<int *>(const_cast<T *>(dest)), + *reinterpret_cast<int *>(const_cast<T *>(&val))); + return reinterpret_cast<T &>(tmp); +} + +template <typename T> +__inline__ __device__ T atomic_exchange( + volatile T *const dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int), + const T &>::type val) { + using type = unsigned long long int; + + type tmp = atomicExch(reinterpret_cast<type *>(const_cast<T 
*>(dest)), + *reinterpret_cast<type *>(const_cast<T *>(&val))); + return reinterpret_cast<T &>(tmp); +} + +template <typename T> +__inline__ __device__ T +atomic_exchange(volatile T *const dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(long long), + const T>::type &val) { + T return_val; + int done = 0; + unsigned int active = __ballot(1); + unsigned int done_active = 0; + while (active != done_active) { + if (!done) { + if (Impl::lock_address_hip_space((void *)dest)) { + return_val = *dest; + *dest = val; + Impl::unlock_address_hip_space((void *)dest); + done = 1; + } + } + done_active = __ballot(done); + } + return return_val; +} + +// atomic_assign --------------------------------------------------------------- + +template <typename T> +__inline__ __device__ void atomic_assign( + volatile T *const dest, + typename std::enable_if<sizeof(T) == sizeof(int), const T &>::type val) { + atomicExch(reinterpret_cast<int *>(const_cast<T *>(dest)), + *reinterpret_cast<int *>(const_cast<T *>(&val))); +} + +template <typename T> +__inline__ __device__ void atomic_assign( + volatile T *const dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int), + const T &>::type val) { + using type = unsigned long long int; + atomicExch(reinterpret_cast<type *>(const_cast<T *>(dest)), + *reinterpret_cast<type *>(const_cast<T *>(&val))); +} + +template <typename T> +__inline__ __device__ void atomic_assign( + volatile T *const dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(unsigned long long int), + const T &>::type val) { + atomic_exchange(dest, val); +} + +// atomic_compare_exchange ----------------------------------------------------- + +inline __device__ int atomic_compare_exchange(volatile int *dest, int compare, + const int &val) { + return atomicCAS(const_cast<int *>(dest), compare, val); +} + +inline __device__ unsigned int atomic_compare_exchange( + volatile 
unsigned int *dest, unsigned int compare, + const unsigned int &val) { + return atomicCAS(const_cast<unsigned int *>(dest), compare, val); +} + +inline __device__ unsigned long long int atomic_compare_exchange( + volatile unsigned long long int *dest, unsigned long long int compare, + const unsigned long long int &val) { + return atomicCAS(const_cast<unsigned long long int *>(dest), compare, val); +} + +template <class T> +__inline__ __device__ T atomic_compare_exchange( + volatile T *dest, T compare, + typename std::enable_if<sizeof(T) == sizeof(int), const T &>::type val) { + // FIXME_HIP UB + union U { + int i; + T f; + __inline__ __device__ U() {} + } idest, icompare, ival; + icompare.f = compare; + ival.f = val; + idest.i = atomicCAS(reinterpret_cast<int *>(const_cast<T *>(dest)), + icompare.i, ival.i); + return idest.f; +} + +template <class T> +__inline__ __device__ T atomic_compare_exchange( + volatile T *dest, T compare, + typename std::enable_if<sizeof(T) == sizeof(unsigned long long int), + const T &>::type val) { + // FIXME_HIP UB + union U { + unsigned long long int i; + T f; + __inline__ __device__ U() {} + } idest, icompare, ival; + icompare.f = compare; + ival.f = val; + idest.i = atomicCAS( + reinterpret_cast<unsigned long long int *>(const_cast<T *>(dest)), + icompare.i, ival.i); + return idest.f; +} + +template <typename T> +__inline__ __device__ T atomic_compare_exchange( + volatile T *const dest, const T &compare, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(long long), + const T>::type &val) { + T return_val; + int done = 0; + unsigned int active = __ballot(1); + unsigned int done_active = 0; + while (active != done_active) { + if (!done) { + if (Impl::lock_address_hip_space((void *)dest)) { + return_val = *dest; + if (return_val == compare) *dest = val; + Impl::unlock_address_hip_space((void *)dest); + done = 1; + } + } + done_active = __ballot(done); + } + return return_val; +} + +// atomic_fetch_add 
------------------------------------------------------------ + +inline __device__ int atomic_fetch_add(volatile int *dest, const int &val) { + return atomicAdd(const_cast<int *>(dest), val); +} + +inline __device__ unsigned int atomic_fetch_add(volatile unsigned int *dest, + const unsigned int &val) { + return atomicAdd(const_cast<unsigned int *>(dest), val); +} + +inline __device__ unsigned long long atomic_fetch_add( + volatile unsigned long long *dest, const unsigned long long &val) { + return atomicAdd(const_cast<unsigned long long *>(dest), val); +} + +inline __device__ float atomic_fetch_add(volatile float *dest, + const float &val) { + return atomicAdd(const_cast<float *>(dest), val); +} + +template <typename T> +inline __device__ T atomic_fetch_add( + volatile T *const dest, + typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) { + // FIXME_HIP UB + union U { + int i; + T t; + __inline__ __device__ U() {} + } assume, oldval, newval; + + oldval.t = *dest; + + do { + assume.i = oldval.i; + newval.t = assume.t + val; + oldval.i = atomicCAS(reinterpret_cast<int *>(const_cast<T *>(dest)), + assume.i, newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +template <typename T> +inline __device__ T atomic_fetch_add( + volatile T *const dest, + typename std::enable_if<sizeof(T) == sizeof(long long), const T>::type + val) { + // FIXME_HIP UB + union U { + unsigned long long i; + T t; + __inline__ __device__ U() {} + } assume, oldval, newval; + + oldval.t = *dest; + + do { + assume.i = oldval.i; + newval.t = assume.t + val; + oldval.i = atomic_compare_exchange( + reinterpret_cast<volatile unsigned long long *>(dest), assume.i, + newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +__inline__ __device__ char atomic_fetch_add(volatile char *dest, + const char &val) { + unsigned int oldval, newval, assume; + oldval = *reinterpret_cast<volatile unsigned int *>(&dest); + + do { + assume = oldval; + newval = assume & 
0x7fffff00 + ((assume & 0xff) + val) & 0xff; + oldval = + atomicCAS(reinterpret_cast<unsigned int *>(const_cast<char *>(dest)), + assume, newval); + } while (assume != oldval); + + return oldval; +} + +__inline__ __device__ short atomic_fetch_add(volatile short *dest, + const short &val) { + unsigned int oldval, newval, assume; + oldval = *reinterpret_cast<volatile unsigned int *>(&dest); + + do { + assume = oldval; + newval = assume & 0x7fff0000 + ((assume & 0xffff) + val) & 0xffff; + oldval = + atomicCAS(reinterpret_cast<unsigned int *>(const_cast<short *>(dest)), + assume, newval); + } while (assume != oldval); + + return oldval; +} + +__inline__ __device__ long long atomic_fetch_add(volatile long long *dest, + const long long &val) { + return atomicAdd( + reinterpret_cast<unsigned long long *>(const_cast<long long *>(dest)), + val); +} + +template <class T> +__inline__ __device__ T +atomic_fetch_add(volatile T *dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(long long), + const T &>::type val) { + T return_val; + int done = 0; + unsigned int active = __ballot(1); + unsigned int done_active = 0; + while (active != done_active) { + if (!done) { + if (Kokkos::Impl::lock_address_hip_space((void *)dest)) { + return_val = *dest; + *dest = return_val + val; + Kokkos::Impl::unlock_address_hip_space((void *)dest); + done = 1; + } + } + done_active = __ballot(done); + } + return return_val; +} + +// atmic_fetch_sub ------------------------------------------------------------- + +__inline__ __device__ int atomic_fetch_sub(volatile int *dest, int const &val) { + return atomicSub(const_cast<int *>(dest), val); +} + +__inline__ __device__ unsigned int atomic_fetch_sub(volatile unsigned int *dest, + unsigned int const &val) { + return atomicSub(const_cast<unsigned int *>(dest), val); +} + +__inline__ __device__ unsigned long long atomic_fetch_sub( + unsigned long long *dest, int64_t const &val) { + return atomicAdd(reinterpret_cast<unsigned 
long long *>(dest), + -reinterpret_cast<unsigned long long const &>(val)); +} + +__inline__ __device__ char atomic_fetch_sub(volatile char *dest, + const char &val) { + unsigned int oldval, newval, assume; + oldval = *reinterpret_cast<volatile unsigned int *>(dest); + + do { + assume = oldval; + newval = assume & 0x7fffff00 + ((assume & 0xff) - val) & 0xff; + oldval = + atomicCAS(reinterpret_cast<unsigned int *>(const_cast<char *>(dest)), + assume, newval); + } while (assume != oldval); + + return oldval; +} + +__inline__ __device__ short atomic_fetch_sub(volatile short *dest, + const short &val) { + unsigned int oldval, newval, assume; + oldval = *reinterpret_cast<volatile unsigned int *>(dest); + + do { + assume = oldval; + newval = assume & 0x7fff0000 + ((assume & 0xffff) - val) & 0xffff; + oldval = + atomicCAS(reinterpret_cast<unsigned int *>(const_cast<short *>(dest)), + assume, newval); + } while (assume != oldval); + + return oldval; +} + +__inline__ __device__ long long atomic_fetch_sub(volatile long long *dest, + const long long &val) { + return static_cast<long long>(atomicAdd( + reinterpret_cast<unsigned long long int *>(const_cast<long long *>(dest)), + -reinterpret_cast<unsigned long long int const &>(val))); +} + +template <class T> +__inline__ __device__ T atomic_fetch_sub( + volatile T *dest, + typename std::enable_if<sizeof(T) == sizeof(int), T>::type val) { + // FIXME_HIP UB + union U { + int i; + T t; + __inline__ __device__ U() {} + } assume, oldval, newval; + + oldval.t = *dest; + + do { + assume.i = oldval.i; + newval.t = assume.t - val; + oldval.i = atomic_compare_exchange(reinterpret_cast<volatile int *>(dest), + assume.i, newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +template <typename T> +inline __device__ T atomic_fetch_sub( + volatile T *const dest, + typename std::enable_if<sizeof(T) == sizeof(long long), const T>::type + val) { + // FIXME_HIP UB + union U { + unsigned long long i; + T t; + __inline__ 
__device__ U() {} + } assume, oldval, newval; + + oldval.t = *dest; + + do { + assume.i = oldval.i; + newval.t = assume.t - val; + oldval.i = atomic_compare_exchange( + reinterpret_cast<volatile unsigned long long *>(dest), assume.i, + newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +template <class T> +__inline__ __device__ T atomic_fetch_sub( + volatile T *dest, + typename std::enable_if<sizeof(T) == sizeof(char), T>::type val) { + unsigned int oldval, newval, assume; + oldval = *reinterpret_cast<volatile unsigned int *>(dest); + + do { + assume = oldval; + newval = assume & 0x7fffff00 + ((assume & 0xff) - val) & 0xff; + oldval = atomicCAS(reinterpret_cast<unsigned int *>(dest), assume, newval); + } while (assume != oldval); + + return reinterpret_cast<T>(oldval) & 0xff; +} + +template <class T> +__inline__ __device__ T atomic_fetch_sub( + volatile T *dest, + typename std::enable_if<sizeof(T) == sizeof(short), T>::type val) { + unsigned int oldval, newval, assume; + oldval = *reinterpret_cast<int *>(dest); + + do { + assume = oldval; + newval = assume & 0x7fff0000 + ((assume & 0xffff) - val) & 0xffff; + oldval = atomicCAS(reinterpret_cast<unsigned int *>(dest), assume, newval); + } while (assume != oldval); + + return reinterpret_cast<T>(oldval) & 0xffff; +} + +template <typename T> +__inline__ __device__ T +atomic_fetch_sub(volatile T *const dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(long long), + const T>::type &val) { + T return_val; + int done = 0; + unsigned int active = __ballot(1); + unsigned int done_active = 0; + while (active != done_active) { + if (!done) { + if (Impl::lock_address_hip_space((void *)dest)) { + return_val = *dest; + *dest = return_val - val; + Impl::unlock_address_hip_space((void *)dest); + done = 1; + } + } + done_active = __ballot(done); + } + return return_val; +} + +// atomic_fetch_or ------------------------------------------------------------- + +__inline__ __device__ 
int atomic_fetch_or(volatile int *const dest, + int const val) { + return atomicOr(const_cast<int *>(dest), val); +} + +__inline__ __device__ unsigned int atomic_fetch_or( + volatile unsigned int *const dest, unsigned int const val) { + return atomicOr(const_cast<unsigned int *>(dest), val); +} + +__inline__ __device__ unsigned long long int atomic_fetch_or( + volatile unsigned long long int *const dest, + unsigned long long int const val) { + return atomicOr(const_cast<unsigned long long int *>(dest), val); +} + +// atomic_fetch_and ------------------------------------------------------------ + +__inline__ __device__ int atomic_fetch_and(volatile int *const dest, + int const val) { + return atomicAnd(const_cast<int *>(dest), val); +} + +__inline__ __device__ unsigned int atomic_fetch_and( + volatile unsigned int *const dest, unsigned int const val) { + return atomicAnd(const_cast<unsigned int *>(dest), val); +} + +__inline__ __device__ unsigned long long int atomic_fetch_and( + volatile unsigned long long int *const dest, + unsigned long long int const val) { + return atomicAnd(const_cast<unsigned long long int *>(dest), val); +} + +namespace Impl { + +template <typename T> +__inline__ __device__ void _atomic_store(T *ptr, T val, + memory_order_relaxed_t) { + (void)atomic_exchange(ptr, val); +} + +template <typename T> +__inline__ __device__ void _atomic_store(T *ptr, T val, + memory_order_seq_cst_t) { + memory_fence(); + atomic_store(ptr, val, memory_order_relaxed); + memory_fence(); +} + +template <typename T> +__inline__ __device__ void _atomic_store(T *ptr, T val, + memory_order_release_t) { + memory_fence(); + atomic_store(ptr, val, memory_order_relaxed); +} + +template <typename T> +__inline__ __device__ void _atomic_store(T *ptr, T val) { + atomic_store(ptr, val, memory_order_relaxed); +} + +template <typename T> +__inline__ __device__ T _atomic_load(T *ptr, memory_order_relaxed_t) { + T dummy{}; + return atomic_compare_exchange(ptr, dummy, dummy); +} + 
+template <typename T> +__inline__ __device__ T _atomic_load(T *ptr, memory_order_seq_cst_t) { + memory_fence(); + T rv = atomic_load(ptr, memory_order_relaxed); + memory_fence(); + return rv; +} + +template <typename T> +__inline__ __device__ T _atomic_load(T *ptr, memory_order_acquire_t) { + T rv = atomic_load(ptr, memory_order_relaxed); + memory_fence(); + return rv; +} + +template <typename T> +__inline__ __device__ T _atomic_load(T *ptr) { + return atomic_load(ptr, memory_order_relaxed); +} + +} // namespace Impl +} // namespace Kokkos +#endif + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9278d1bdc9efcc2a76183085c974afef41413e3c --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -0,0 +1,179 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP +#define KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(__HIPCC__) + +#include <HIP/Kokkos_HIP_Instance.hpp> +#include <HIP/Kokkos_HIP_KernelLaunch.hpp> + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template <typename DriverType, bool, int MaxThreadsPerBlock, int MinBlocksPerSM> +void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { + // FIXME_HIP - currently the "constant" path is unimplemented. 
+ // we should look at whether it's functional, and + // perform some simple scaling studies to see when / + // if the constant launcher outperforms the current + // pass by pointer shared launcher + HIP_SAFE_CALL(hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, + hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock, + MinBlocksPerSM>, + blockSize, sharedmem)); +} + +template <typename DriverType, bool constant> +void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { + hipOccupancy<DriverType, constant, HIPTraits::MaxThreadsPerBlock, 1>( + numBlocks, blockSize, sharedmem); +} + +template <class FunctorType, class LaunchBounds, typename F> +int hip_internal_get_block_size(const F &condition_check, + const HIPInternal *hip_instance, + const hipFuncAttributes &attr, + const FunctorType &f, + const size_t vector_length, + const size_t shmem_block, + const size_t shmem_thread) { + const int min_blocks_per_sm = + LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM; + const int max_threads_per_block = LaunchBounds::maxTperB == 0 + ? 
HIPTraits::MaxThreadsPerBlock + : LaunchBounds::maxTperB; + + const int regs_per_wavefront = std::max(attr.numRegs, 1); + const int regs_per_sm = hip_instance->m_regsPerSM; + const int shmem_per_sm = hip_instance->m_shmemPerSM; + const int max_shmem_per_block = hip_instance->m_maxShmemPerBlock; + const int max_blocks_per_sm = hip_instance->m_maxBlocksPerSM; + const int max_threads_per_sm = hip_instance->m_maxThreadsPerSM; + + int block_size = max_threads_per_block; + KOKKOS_ASSERT(block_size > 0); + const int blocks_per_warp = + (block_size + HIPTraits::WarpSize - 1) / HIPTraits::WarpSize; + + int functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value( + f, block_size / vector_length); + int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + int max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); + int max_blocks_shmem = + (total_shmem < max_shmem_per_block) + ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs) + : 0; + int blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem); + int threads_per_sm = blocks_per_sm * block_size; + if (threads_per_sm > max_threads_per_sm) { + blocks_per_sm = max_threads_per_sm / block_size; + threads_per_sm = blocks_per_sm * block_size; + } + int opt_block_size = + (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm; + int opt_threads_per_sm = threads_per_sm; + block_size -= HIPTraits::WarpSize; + while (condition_check(blocks_per_sm) && + (block_size >= HIPTraits::WarpSize)) { + functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value( + f, block_size / vector_length); + total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); + max_blocks_shmem = + (total_shmem < max_shmem_per_block) + ? (total_shmem > 0 ? 
shmem_per_sm / total_shmem : max_blocks_regs) + : 0; + blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem); + threads_per_sm = blocks_per_sm * block_size; + if (threads_per_sm > max_threads_per_sm) { + blocks_per_sm = max_threads_per_sm / block_size; + threads_per_sm = blocks_per_sm * block_size; + } + if ((blocks_per_sm >= min_blocks_per_sm) && + (blocks_per_sm <= max_blocks_per_sm)) { + if (threads_per_sm >= opt_threads_per_sm) { + opt_block_size = block_size; + opt_threads_per_sm = threads_per_sm; + } + } + block_size -= HIPTraits::WarpSize; + } + return opt_block_size; +} + +template <class FunctorType, class LaunchBounds> +int hip_get_max_block_size(const HIPInternal *hip_instance, + const hipFuncAttributes &attr, const FunctorType &f, + const size_t vector_length, const size_t shmem_block, + const size_t shmem_thread) { + return hip_internal_get_block_size<FunctorType, LaunchBounds>( + [](int x) { return x == 0; }, hip_instance, attr, f, vector_length, + shmem_block, shmem_thread); +} + +template <typename FunctorType, typename LaunchBounds> +int hip_get_opt_block_size(HIPInternal const *hip_instance, + hipFuncAttributes const &attr, FunctorType const &f, + size_t const vector_length, size_t const shmem_block, + size_t const shmem_thread) { + return hip_internal_get_block_size<FunctorType, LaunchBounds>( + [](int) { return true; }, hip_instance, attr, f, vector_length, + shmem_block, shmem_thread); +} + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +#endif + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b3480bcad00c7ec6bc1a011a49fe7f9ae5eba345 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp @@ -0,0 +1,110 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_ERROR_HPP +#define KOKKOS_HIP_ERROR_HPP + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_Error.hpp> + +#include <hip/hip_runtime.h> + +#include <ostream> + +namespace Kokkos { +namespace Impl { + +void hip_internal_error_throw(hipError_t e, const char* name, + const char* file = nullptr, const int line = 0); + +inline void hip_internal_safe_call(hipError_t e, const char* name, + const char* file = nullptr, + const int line = 0) { + if (hipSuccess != e) { + hip_internal_error_throw(e, name, file, line); + } +} + +} // namespace Impl +} // namespace Kokkos + +#define HIP_SAFE_CALL(call) \ + Kokkos::Impl::hip_internal_safe_call(call, #call, __FILE__, __LINE__) + +namespace Kokkos { +namespace Experimental { + +class HIPRawMemoryAllocationFailure : public RawMemoryAllocationFailure { + private: + hipError_t m_error_code = hipSuccess; + + static FailureMode get_failure_mode(hipError_t error_code) { + switch (error_code) { + case hipErrorMemoryAllocation: return FailureMode::OutOfMemoryError; + case hipErrorInvalidValue: return FailureMode::InvalidAllocationSize; + default: return FailureMode::Unknown; + } + } + + public: + HIPRawMemoryAllocationFailure(size_t arg_attempted_size, + hipError_t arg_error_code, + AllocationMechanism arg_mechanism) noexcept + : RawMemoryAllocationFailure( + arg_attempted_size, /* HIPSpace doesn't handle alignment? 
*/ 1, + get_failure_mode(arg_error_code), arg_mechanism), + m_error_code(arg_error_code) {} + + void append_additional_error_information(std::ostream& o) const override { + if (m_error_code != hipSuccess) { + o << " The HIP allocation returned the error code \"" + << hipGetErrorName(m_error_code) << "\"."; + } + } +}; + +} // namespace Experimental +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp new file mode 100644 index 0000000000000000000000000000000000000000..18ef10e22cd39b30118f78882a3ce747c19b9901 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -0,0 +1,488 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/*--------------------------------------------------------------------------*/ +/* Kokkos interfaces */ + +#include <Kokkos_Core.hpp> + +#include <HIP/Kokkos_HIP_Instance.hpp> +#include <Kokkos_HIP.hpp> +#include <Kokkos_HIP_Space.hpp> +#include <impl/Kokkos_Error.hpp> + +/*--------------------------------------------------------------------------*/ +/* Standard 'C' libraries */ +#include <stdlib.h> + +/* Standard 'C++' libraries */ +#include <iostream> +#include <sstream> +#include <string> +#include <vector> + +namespace Kokkos { +namespace Experimental { +namespace { +class HIPInternalDevices { + public: + enum { MAXIMUM_DEVICE_COUNT = 64 }; + struct hipDeviceProp_t m_hipProp[MAXIMUM_DEVICE_COUNT]; + int m_hipDevCount; + + HIPInternalDevices(); + + static HIPInternalDevices const &singleton(); +}; + +HIPInternalDevices::HIPInternalDevices() { + HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount)); + + if (m_hipDevCount > MAXIMUM_DEVICE_COUNT) { + Kokkos::abort( + "Sorry, you have more GPUs per node than we thought anybody would ever " + "have. 
Please report this to github.com/kokkos/kokkos."); + } + for (int i = 0; i < m_hipDevCount; ++i) { + HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i)); + } +} + +const HIPInternalDevices &HIPInternalDevices::singleton() { + static HIPInternalDevices self; + return self; +} +} // namespace + +namespace Impl { + +//---------------------------------------------------------------------------- + +void HIPInternal::print_configuration(std::ostream &s) const { + const HIPInternalDevices &dev_info = HIPInternalDevices::singleton(); + + s << "macro KOKKOS_ENABLE_HIP : defined" << '\n'; +#if defined(HIP_VERSION) + s << "macro HIP_VERSION = " << HIP_VERSION << " = version " + << HIP_VERSION / 100 << "." << HIP_VERSION % 100 << '\n'; +#endif + + for (int i = 0; i < dev_info.m_hipDevCount; ++i) { + s << "Kokkos::Experimental::HIP[ " << i << " ] " + << dev_info.m_hipProp[i].name << " version " + << (dev_info.m_hipProp[i].major) << "." << dev_info.m_hipProp[i].minor + << ", Total Global Memory: " + << ::Kokkos::Impl::human_memory_size(dev_info.m_hipProp[i].totalGlobalMem) + << ", Shared Memory per Block: " + << ::Kokkos::Impl::human_memory_size( + dev_info.m_hipProp[i].sharedMemPerBlock); + if (m_hipDev == i) s << " : Selected"; + s << '\n'; + } +} + +//---------------------------------------------------------------------------- + +HIPInternal::~HIPInternal() { + if (m_scratchSpace || m_scratchFlags || m_scratchConcurrentBitset) { + std::cerr << "Kokkos::Experimental::HIP ERROR: Failed to call " + "Kokkos::Experimental::HIP::finalize()" + << std::endl; + std::cerr.flush(); + } + + m_hipDev = -1; + m_hipArch = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxSharedWords = 0; + m_maxShmemPerBlock = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; +} + +int HIPInternal::verify_is_initialized(const char *const label) const { + if (m_hipDev < 0) { 
+ std::cerr << "Kokkos::Experimental::HIP::" << label + << " : ERROR device not initialized" << std::endl; + } + return 0 <= m_hipDev; +} + +HIPInternal &HIPInternal::singleton() { + static HIPInternal *self = nullptr; + if (!self) { + self = new HIPInternal(); + } + return *self; +} + +void HIPInternal::fence() const { + HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); + // can reset our cycle id now as well + m_cycleId = 0; +} + +void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { + if (was_finalized) + Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); + + if (is_initialized()) return; + + int constexpr WordSize = sizeof(size_type); + + if (!HostSpace::execution_space::impl_is_initialized()) { + const std::string msg( + "HIP::initialize ERROR : HostSpace::execution_space " + "is not initialized"); + Kokkos::Impl::throw_runtime_exception(msg); + } + + const HIPInternalDevices &dev_info = HIPInternalDevices::singleton(); + + const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags; + + // Need at least a GPU device + const bool ok_id = + 0 <= hip_device_id && hip_device_id < dev_info.m_hipDevCount; + + if (ok_init && ok_id) { + const struct hipDeviceProp_t &hipProp = dev_info.m_hipProp[hip_device_id]; + + m_hipDev = hip_device_id; + m_deviceProp = hipProp; + + HIP_SAFE_CALL(hipSetDevice(m_hipDev)); + + m_stream = stream; + m_team_scratch_current_size = 0; + m_team_scratch_ptr = nullptr; + + // number of multiprocessors + m_multiProcCount = hipProp.multiProcessorCount; + + //---------------------------------- + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. 
+ m_maxWarpCount = hipProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; + if (HIPTraits::WarpSize < m_maxWarpCount) { + m_maxWarpCount = Impl::HIPTraits::WarpSize; + } + m_maxSharedWords = hipProp.sharedMemPerBlock / WordSize; + + //---------------------------------- + // Maximum number of blocks + m_maxBlock = hipProp.maxGridSize[0]; + + // theoretically, we can get 40 WF's / CU, but only can sustain 32 + // see + // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 + m_maxBlocksPerSM = 32; + // FIXME_HIP - Nick to implement this upstream + // Register count comes from Sec. 2.2. "Data Sharing" of the + // Vega 7nm ISA document (see the diagram) + // https://developer.amd.com/wp-content/resources/Vega_7nm_Shader_ISA.pdf + // VGPRS = 4 (SIMD/CU) * 256 VGPR/SIMD * 64 registers / VGPR = + // 65536 VGPR/CU + m_regsPerSM = 65536; + m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; + m_maxShmemPerBlock = hipProp.sharedMemPerBlock; + m_maxThreadsPerSM = m_maxBlocksPerSM * HIPTraits::WarpSize; + //---------------------------------- + // Multiblock reduction uses scratch flags for counters + // and scratch space for partial reduction values. + // Allocate some initial space. This will grow as needed. + { + const unsigned reduce_block_count = + m_maxWarpCount * Impl::HIPTraits::WarpSize; + + (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); + (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); + } + //---------------------------------- + // Concurrent bitset for obtaining unique tokens from within + // an executing kernel. 
+ { + const int32_t buffer_bound = + Kokkos::Impl::concurrent_bitset::buffer_bound(HIP::concurrency()); + + // Allocate and initialize uint32_t[ buffer_bound ] + + using Record = + Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace, + void>; + + Record *const r = Record::allocate(Kokkos::Experimental::HIPSpace(), + "Kokkos::InternalScratchBitset", + sizeof(uint32_t) * buffer_bound); + + Record::increment(r); + + m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data()); + + HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0, + sizeof(uint32_t) * buffer_bound)); + } + //---------------------------------- + + } else { + std::ostringstream msg; + msg << "Kokkos::Experimental::HIP::initialize(" << hip_device_id + << ") FAILED"; + + if (!ok_init) { + msg << " : Already initialized"; + } + if (!ok_id) { + msg << " : Device identifier out of range " + << "[0.." << dev_info.m_hipDevCount - 1 << "]"; + } + Kokkos::Impl::throw_runtime_exception(msg.str()); + } + + // Init the array for used for arbitrarily sized atomics + if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays(); +} + +//---------------------------------------------------------------------------- + +using ScratchGrain = + Kokkos::Experimental::HIP::size_type[Impl::HIPTraits::WarpSize]; +enum { sizeScratchGrain = sizeof(ScratchGrain) }; + +Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_space( + const Kokkos::Experimental::HIP::size_type size) { + if (verify_is_initialized("scratch_space") && + m_scratchSpaceCount * sizeScratchGrain < size) { + m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain; + + using Record = + Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace, + void>; + + if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + + Record *const r = Record::allocate( + Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchSpace", + (sizeScratchGrain * m_scratchSpaceCount)); + + 
Record::increment(r); + + m_scratchSpace = reinterpret_cast<size_type *>(r->data()); + } + + return m_scratchSpace; +} + +Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags( + const Kokkos::Experimental::HIP::size_type size) { + if (verify_is_initialized("scratch_flags") && + m_scratchFlagsCount * sizeScratchGrain < size) { + m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain; + + using Record = + Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace, + void>; + + if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + + Record *const r = Record::allocate( + Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchFlags", + (sizeScratchGrain * m_scratchFlagsCount)); + + Record::increment(r); + + m_scratchFlags = reinterpret_cast<size_type *>(r->data()); + + HIP_SAFE_CALL( + hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain)); + } + + return m_scratchFlags; +} + +void *HIPInternal::resize_team_scratch_space(std::int64_t bytes, + bool force_shrink) { + if (m_team_scratch_current_size == 0) { + m_team_scratch_current_size = bytes; + m_team_scratch_ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::HIPSpace>( + "Kokkos::HIPSpace::TeamScratchMemory", m_team_scratch_current_size); + } + if ((bytes > m_team_scratch_current_size) || + ((bytes < m_team_scratch_current_size) && (force_shrink))) { + m_team_scratch_current_size = bytes; + m_team_scratch_ptr = Kokkos::kokkos_realloc<Kokkos::Experimental::HIPSpace>( + m_team_scratch_ptr, m_team_scratch_current_size); + } + return m_team_scratch_ptr; +} + +//---------------------------------------------------------------------------- + +void HIPInternal::finalize() { + this->fence(); + was_finalized = true; + if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { + using RecordHIP = + Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::HIPSpace>; + + RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags)); + 
RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); + RecordHIP::decrement(RecordHIP::get_record(m_scratchConcurrentBitset)); + + if (m_team_scratch_current_size > 0) + Kokkos::kokkos_free<Kokkos::Experimental::HIPSpace>(m_team_scratch_ptr); + + m_hipDev = -1; + m_hipArch = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxBlock = 0; + m_maxSharedWords = 0; + m_maxShmemPerBlock = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; + m_team_scratch_current_size = 0; + m_team_scratch_ptr = nullptr; + } + if (nullptr != d_driverWorkArray) { + HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + d_driverWorkArray = nullptr; + } +} + +char *HIPInternal::get_next_driver(size_t driverTypeSize) const { + std::lock_guard<std::mutex> const lock(m_mutexWorkArray); + if (d_driverWorkArray == nullptr) { + HIP_SAFE_CALL( + hipHostMalloc(&d_driverWorkArray, + m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), + hipHostMallocNonCoherent)); + } + if (driverTypeSize > m_maxDriverTypeSize) { + // fence handles the cycle id reset for us + fence(); + HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + m_maxDriverTypeSize = driverTypeSize; + if (m_maxDriverTypeSize % 128 != 0) + m_maxDriverTypeSize = + m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128; + HIP_SAFE_CALL( + hipHostMalloc(&d_driverWorkArray, + m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), + hipHostMallocNonCoherent)); + } else { + m_cycleId = (m_cycleId + 1) % m_maxDriverCycles; + if (m_cycleId == 0) { + // ensure any outstanding kernels are completed before we wrap around + fence(); + } + } + return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId]; +} + +//---------------------------------------------------------------------------- + +Kokkos::Experimental::HIP::size_type hip_internal_multiprocessor_count() { + return HIPInternal::singleton().m_multiProcCount; +} + 
+Kokkos::Experimental::HIP::size_type hip_internal_maximum_warp_count() { + return HIPInternal::singleton().m_maxWarpCount; +} + +Kokkos::Experimental::HIP::size_type hip_internal_maximum_grid_count() { + return HIPInternal::singleton().m_maxBlock; +} + +Kokkos::Experimental::HIP::size_type *hip_internal_scratch_space( + const Kokkos::Experimental::HIP::size_type size) { + return HIPInternal::singleton().scratch_space(size); +} + +Kokkos::Experimental::HIP::size_type *hip_internal_scratch_flags( + const Kokkos::Experimental::HIP::size_type size) { + return HIPInternal::singleton().scratch_flags(size); +} + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +void hip_device_synchronize() { HIP_SAFE_CALL(hipDeviceSynchronize()); } + +void hip_internal_error_throw(hipError_t e, const char *name, const char *file, + const int line) { + std::ostringstream out; + out << name << " error( " << hipGetErrorName(e) + << "): " << hipGetErrorString(e); + if (file) { + out << " " << file << ":" << line; + } + throw_runtime_exception(out.str()); +} +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +HIP::size_type HIP::detect_device_count() { + return HIPInternalDevices::singleton().m_hipDevCount; +} +} // namespace Experimental +} // namespace Kokkos diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f4f88628e313a2d22d23a09e4ce25630d242a566 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -0,0 +1,164 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/*--------------------------------------------------------------------------*/ + +#ifndef KOKKOS_HIP_INSTANCE_HPP +#define KOKKOS_HIP_INSTANCE_HPP + +#include <Kokkos_HIP_Space.hpp> + +#include <mutex> + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +struct HIPTraits { + static int constexpr WarpSize = 64; + static int constexpr WarpIndexMask = 0x003f; /* hexadecimal for 63 */ + static int constexpr WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ + static int constexpr MaxThreadsPerBlock = + 1024; // FIXME_HIP -- assumed constant for now + + static int constexpr ConstantMemoryUsage = 0x008000; /* 32k bytes */ + static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */ +}; + +//---------------------------------------------------------------------------- + +HIP::size_type hip_internal_maximum_warp_count(); +HIP::size_type hip_internal_maximum_grid_count(); +HIP::size_type hip_internal_multiprocessor_count(); + +HIP::size_type *hip_internal_scratch_space(const HIP::size_type size); +HIP::size_type *hip_internal_scratch_flags(const HIP::size_type size); + +//---------------------------------------------------------------------------- + +class HIPInternal { + private: + HIPInternal(const HIPInternal &); + HIPInternal &operator=(const HIPInternal &); + + public: + using size_type = ::Kokkos::Experimental::HIP::size_type; + + int m_hipDev = -1; + int m_hipArch = -1; + unsigned m_multiProcCount = 0; + unsigned m_maxWarpCount = 0; + unsigned m_maxBlock = 0; + unsigned m_maxBlocksPerSM = 0; + unsigned m_maxSharedWords = 0; + int m_regsPerSM; + int m_shmemPerSM = 0; + int m_maxShmemPerBlock = 0; + int m_maxThreadsPerSM = 0; + + // array of DriverTypes to be allocated in host-pinned memory for async + // kernel launches + mutable char *d_driverWorkArray = nullptr; + // number of kernel launches that can be 
in-flight w/o synchronization + const int m_maxDriverCycles = 100; + // max size of a DriverType [bytes] + mutable size_t m_maxDriverTypeSize = 1024 * 10; + // the current index in the driverWorkArray + mutable int m_cycleId = 0; + // mutex to access d_driverWorkArray + mutable std::mutex m_mutexWorkArray; + + // Scratch Spaces for Reductions + size_type m_scratchSpaceCount = 0; + size_type m_scratchFlagsCount = 0; + + size_type *m_scratchSpace = nullptr; + size_type *m_scratchFlags = nullptr; + uint32_t *m_scratchConcurrentBitset = nullptr; + + hipDeviceProp_t m_deviceProp; + + hipStream_t m_stream = nullptr; + + // Team Scratch Level 1 Space + mutable int64_t m_team_scratch_current_size = 0; + mutable void *m_team_scratch_ptr = nullptr; + mutable std::mutex m_team_scratch_mutex; + + bool was_finalized = false; + + static HIPInternal &singleton(); + + int verify_is_initialized(const char *const label) const; + + int is_initialized() const { return m_hipDev >= 0; } + + void initialize(int hip_device_id, hipStream_t stream = nullptr); + void finalize(); + + void print_configuration(std::ostream &) const; + + void fence() const; + + // returns the next driver type pointer in our work array + char *get_next_driver(size_t driverTypeSize) const; + + ~HIPInternal(); + + HIPInternal() = default; + + // Resizing of reduction related scratch spaces + size_type *scratch_space(const size_type size); + size_type *scratch_flags(const size_type size); + + // Resizing of team level 1 scratch + void *resize_team_scratch_space(std::int64_t bytes, + bool force_shrink = false); +}; + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f774423b378b0753a98c9e4df512b599910028dd --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -0,0 
+1,230 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_KERNEL_LAUNCH_HPP +#define KOKKOS_HIP_KERNEL_LAUNCH_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(__HIPCC__) + +#include <HIP/Kokkos_HIP_Error.hpp> +#include <HIP/Kokkos_HIP_Instance.hpp> +#include <Kokkos_HIP_Space.hpp> + +// Must use global variable on the device with HIP-Clang +#ifdef __HIP__ +__device__ __constant__ unsigned long kokkos_impl_hip_constant_memory_buffer + [Kokkos::Experimental::Impl::HIPTraits::ConstantMemoryUsage / + sizeof(unsigned long)]; +#endif + +namespace Kokkos { +namespace Experimental { +template <typename T> +inline __device__ T *kokkos_impl_hip_shared_memory() { + HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh); + return (T *)sh; +} +} // namespace Experimental +} // namespace Kokkos + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +template <typename DriverType> +__global__ static void hip_parallel_launch_constant_memory() { + const DriverType &driver = *(reinterpret_cast<const DriverType *>( + kokkos_impl_hip_constant_memory_buffer)); + driver(); +} + +template <typename DriverType, unsigned int maxTperB, unsigned int minBperSM> +__global__ __launch_bounds__( + maxTperB, minBperSM) static void hip_parallel_launch_constant_memory() { + const DriverType &driver = *(reinterpret_cast<const DriverType *>( + kokkos_impl_hip_constant_memory_buffer)); + + driver(); +} + +template <class DriverType> +__global__ static void hip_parallel_launch_local_memory( + const DriverType *driver) { + driver->operator()(); +} + +template <class DriverType, unsigned int maxTperB, unsigned int minBperSM> +__global__ __launch_bounds__( + maxTperB, + minBperSM) static void hip_parallel_launch_local_memory(const DriverType + *driver) { + driver->operator()(); +} + +enum class HIPLaunchMechanism : unsigned { + Default = 0, + ConstantMemory = 1, + GlobalMemory = 2, + LocalMemory = 4 +}; + 
+constexpr inline HIPLaunchMechanism operator|(HIPLaunchMechanism p1, + HIPLaunchMechanism p2) { + return static_cast<HIPLaunchMechanism>(static_cast<unsigned>(p1) | + static_cast<unsigned>(p2)); +} +constexpr inline HIPLaunchMechanism operator&(HIPLaunchMechanism p1, + HIPLaunchMechanism p2) { + return static_cast<HIPLaunchMechanism>(static_cast<unsigned>(p1) & + static_cast<unsigned>(p2)); +} + +template <HIPLaunchMechanism l> +struct HIPDispatchProperties { + HIPLaunchMechanism launch_mechanism = l; +}; + +template <typename DriverType, typename LaunchBounds, + HIPLaunchMechanism LaunchMechanism> +struct HIPParallelLaunchKernelFunc; + +template <typename DriverType, unsigned int MaxThreadsPerBlock, + unsigned int MinBlocksPerSM> +struct HIPParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>, + HIPLaunchMechanism::LocalMemory> { + static auto get_kernel_func() { + return hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock, + MinBlocksPerSM>; + } +}; + +template <typename DriverType> +struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>, + HIPLaunchMechanism::LocalMemory> { + static auto get_kernel_func() { + return hip_parallel_launch_local_memory<DriverType, 1024, 1>; + } +}; + +template <typename DriverType, typename LaunchBounds, + HIPLaunchMechanism LaunchMechanism> +struct HIPParallelLaunchKernelInvoker; + +template <typename DriverType, typename LaunchBounds> +struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds, + HIPLaunchMechanism::LocalMemory> + : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds, + HIPLaunchMechanism::LocalMemory> { + using base_t = HIPParallelLaunchKernelFunc<DriverType, LaunchBounds, + HIPLaunchMechanism::LocalMemory>; + + static void invoke_kernel(DriverType const *driver, dim3 const &grid, + dim3 const &block, int shmem, + HIPInternal const *hip_instance) { + (base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>( + 
driver); + } +}; + +template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>, + HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory> +struct HIPParallelLaunch; + +template <typename DriverType, unsigned int MaxThreadsPerBlock, + unsigned int MinBlocksPerSM> +struct HIPParallelLaunch< + DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>, + HIPLaunchMechanism::LocalMemory> + : HIPParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>, + HIPLaunchMechanism::LocalMemory> { + using base_t = HIPParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>, + HIPLaunchMechanism::LocalMemory>; + + HIPParallelLaunch(const DriverType &driver, const dim3 &grid, + const dim3 &block, const int shmem, + const HIPInternal *hip_instance, + const bool /*prefer_shmem*/) { + if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { + if (hip_instance->m_maxShmemPerBlock < shmem) { + Kokkos::Impl::throw_runtime_exception( + "HIPParallelLaunch FAILED: shared memory request is too large"); + } + + KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); + + // Invoke the driver function on the device + DriverType *d_driver = reinterpret_cast<DriverType *>( + hip_instance->get_next_driver(sizeof(DriverType))); + std::memcpy((void *)d_driver, (void *)&driver, sizeof(DriverType)); + base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance); + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + HIP_SAFE_CALL(hipGetLastError()); + hip_instance->fence(); +#endif + } + } + + static hipFuncAttributes get_hip_func_attributes() { + static hipFuncAttributes attr = []() { + hipFuncAttributes attr; + HIP_SAFE_CALL(hipFuncGetAttributes( + &attr, reinterpret_cast<void const *>(base_t::get_kernel_func()))); + return attr; + }(); + return attr; + } +}; +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos + +#endif + +#endif diff --git 
a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4f5271b6f644605e24ab277a7b08b25ba8c2ea84 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp @@ -0,0 +1,119 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Macros.hpp>

#include <HIP/Kokkos_HIP_Locks.hpp>
#include <HIP/Kokkos_HIP_Error.hpp>
#include <Kokkos_HIP_Space.hpp>

#include <hip/hip_runtime.h>

#include <iostream>

namespace Kokkos {

#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
namespace Impl {
// With relocatable device code there is a single program-wide definition of
// the device-side lock arrays (declared extern in Kokkos_HIP_Locks.hpp).
__device__ __constant__ HIPLockArrays g_device_hip_lock_arrays = {nullptr,
                                                                  nullptr, 0};
}
#endif

namespace {

// Zero-initializes the hashed atomic lock array (one flag per bucket,
// KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1 entries).
__global__ void init_lock_array_kernel_atomic() {
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1) {
    Kokkos::Impl::g_device_hip_lock_arrays.atomic[i] = 0;
  }
}

// Zero-initializes the scratch lock array (N entries).
__global__ void init_lock_array_kernel_threadid(int N) {
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < static_cast<unsigned>(N)) {
    Kokkos::Impl::g_device_hip_lock_arrays.scratch[i] = 0;
  }
}

}  // namespace

namespace Impl {

// Host-side master copy of the lock-array pointers and size.
HIPLockArrays g_host_hip_lock_arrays = {nullptr, nullptr, 0};

// Allocates both device lock arrays and launches the init kernels.
// Idempotent: returns immediately if already initialized.
void initialize_host_hip_lock_arrays() {
  if (g_host_hip_lock_arrays.atomic != nullptr) return;
  HIP_SAFE_CALL(hipMalloc(
      &g_host_hip_lock_arrays.atomic,
      sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1)));
  HIP_SAFE_CALL(hipMalloc(
      &g_host_hip_lock_arrays.scratch,
      sizeof(std::int32_t) *
(::Kokkos::Experimental::HIP::concurrency())));

  g_host_hip_lock_arrays.n = ::Kokkos::Experimental::HIP::concurrency();

  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
  // One thread per lock entry; 256-thread blocks, rounded up.
  init_lock_array_kernel_atomic<<<
      (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256, 0, nullptr>>>();
  init_lock_array_kernel_threadid<<<
      (::Kokkos::Experimental::HIP::concurrency() + 255) / 256, 256, 0,
      nullptr>>>(::Kokkos::Experimental::HIP::concurrency());
}

// Frees both lock arrays and resets the host-side copy.
// Idempotent: returns immediately if already finalized.
void finalize_host_hip_lock_arrays() {
  if (g_host_hip_lock_arrays.atomic == nullptr) return;
  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
  g_host_hip_lock_arrays.atomic = nullptr;
  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch));
  g_host_hip_lock_arrays.scratch = nullptr;
  g_host_hip_lock_arrays.n       = 0;
#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
  // Push the now-null pointers back to the single device instance.
  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
#endif
}

}  // namespace Impl

}  // namespace Kokkos
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f34f85f43b0bb2ac2b07d4149957f5027991395f --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp @@ -0,0 +1,167 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2.
Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_HIP_LOCKS_HPP
#define KOKKOS_HIP_LOCKS_HPP

#include <Kokkos_Macros.hpp>

#include <cstdint>

#include <HIP/Kokkos_HIP_Error.hpp>

namespace Kokkos {
namespace Impl {

// Pointers to the two device lock arrays plus the scratch-array length.
struct HIPLockArrays {
  std::int32_t* atomic;   // hashed per-address lock flags
  std::int32_t* scratch;  // per-thread scratch locks (n entries)
  std::int32_t n;         // number of scratch entries
};

/// \brief This global variable in Host space is the central definition
/// of these arrays.
extern HIPLockArrays g_host_hip_lock_arrays;

/// \brief After this call, the g_host_hip_lock_arrays variable has
/// valid, initialized arrays.
///
/// This call is idempotent.
void initialize_host_hip_lock_arrays();

/// \brief After this call, the g_host_hip_lock_arrays variable has
/// all null pointers, and all array memory has been freed.
///
/// This call is idempotent.
void finalize_host_hip_lock_arrays();

#if defined(__HIPCC__)

/// \brief This global variable in HIP space is what kernels use
/// to get access to the lock arrays.
///
/// When relocatable device code is enabled, there can be one single
/// instance of this global variable for the entire executable,
/// whose definition will be in Kokkos_HIP_Locks.cpp (and whose declaration
/// here must then be extern).
/// This one instance will be initialized by initialize_host_HIP_lock_arrays
/// and need not be modified afterwards.
///
/// When relocatable device code is disabled, an instance of this variable
/// will be created in every translation unit that sees this header file.
/// Since the Kokkos_HIP_Locks.cpp translation unit cannot initialize the
/// instances in other translation units, we must update this HIP global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE macro.
__device__
#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
    __constant__ extern
#endif
    HIPLockArrays g_device_hip_lock_arrays;

// Hash mask for the atomic lock array (mask + 1 buckets).
#define KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK 0x1FFFF

/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
__device__ inline bool lock_address_hip_space(void* ptr) {
  auto offset = reinterpret_cast<size_t>(ptr);
  // Discard the low 2 bits, then hash into the bucket range via the mask.
  offset = offset >> 2;
  offset = offset & KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK;
  return (0 == atomicCAS(&g_device_hip_lock_arrays.atomic[offset], 0, 1));
}

/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
__device__ inline void unlock_address_hip_space(void* ptr) {
  auto offset = reinterpret_cast<size_t>(ptr);
  offset = offset >> 2;
  offset = offset & KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK;
  atomicExch(&g_device_hip_lock_arrays.atomic[offset], 0);
}

}  // namespace Impl
}  // namespace Kokkos

// Make lock_array_copied an explicit translation unit scope thingy
namespace Kokkos {
namespace Impl {
namespace {
// Per-translation-unit flag: nonzero once the host lock-array struct has
// been copied to this TU's device-side global.
static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
}  // namespace
}  // namespace Impl
}  // namespace Kokkos

/* Dan Ibanez: it is critical that this code be a macro, so that it will
   capture the right address for g_device_hip_lock_arrays!
   putting this in an inline function will NOT do the right thing! */
#define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                  \
  {                                                              \
    if (::Kokkos::Impl::lock_array_copied == 0) {                \
      HIP_SAFE_CALL(hipMemcpyToSymbol(                           \
          HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays),  \
          &::Kokkos::Impl::g_host_hip_lock_arrays,               \
          sizeof(::Kokkos::Impl::HIPLockArrays)));               \
    }                                                            \
    ::Kokkos::Impl::lock_array_copied = 1;                       \
  }

#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
// With RDC the single device instance is initialized once; nothing to do.
#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
#else
#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
#endif

#endif /* defined( __HIPCC__ ) */

#endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ce1aff9586d25911104d17d53860409f3e73b10b --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp @@ -0,0 +1,37 @@
#ifndef KOKKOS_HIP_MDRANGEPOLICY_HPP_
#define KOKKOS_HIP_MDRANGEPOLICY_HPP_

#include <KokkosExp_MDRangePolicy.hpp>

namespace Kokkos {

// MDRange iteration-order defaults for the HIP backend: Left for both
// outer and inner direction.
template <>
struct default_outer_direction<Kokkos::Experimental::HIP> {
  using type = Iterate;
  static constexpr Iterate value = Iterate::Left;
};

template <>
struct default_inner_direction<Kokkos::Experimental::HIP> {
  using type = Iterate;
  static constexpr Iterate value = Iterate::Left;
};

namespace Impl {

// Settings for MDRangePolicy
template <>
inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::HIP>(
    const Kokkos::Experimental::HIP& space) {
  TileSizeProperties properties;
  properties.max_threads =
      space.impl_internal_space_instance()->m_maxThreadsPerSM;
  properties.default_largest_tile_size = 16;
  properties.default_tile_size = 4;
  properties.max_total_tile_size = 1024;
  return properties;
}

}  // Namespace Impl
}  // Namespace Kokkos
#endif
diff --git
a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..35e7d6fb853ae9e4f245e0fe0c2a71f4f2d4d6c2 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -0,0 +1,441 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_HIP_PARALLEL_MDRANGE_HPP
#define KOKKOS_HIP_PARALLEL_MDRANGE_HPP

#include <HIP/Kokkos_HIP_BlockSize_Deduction.hpp>
#include <HIP/Kokkos_HIP_KernelLaunch.hpp>
#include <HIP/Kokkos_HIP_ReduceScan.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
#include <impl/KokkosExp_IterateTileGPU.hpp>
#include <Kokkos_Parallel.hpp>

namespace Kokkos {
namespace Impl {
// ParallelFor
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                  Kokkos::Experimental::HIP> {
 public:
  using Policy = Kokkos::MDRangePolicy<Traits...>;

 private:
  using array_index_type = typename Policy::array_index_type;
  using index_type = typename Policy::index_type;
  using LaunchBounds = typename Policy::launch_bounds;

  const FunctorType m_functor;
  const Policy m_policy;

  ParallelFor() = delete;
  ParallelFor& operator=(ParallelFor const&) = delete;

 public:
  // Device-side entry point: iterate this block's tile of the index space.
  inline __device__ void operator()() const {
    Kokkos::Impl::DeviceIterateTile<Policy::rank, Policy, FunctorType,
                                    typename Policy::work_tag>(m_policy,
                                                               m_functor)
        .exec_range();
  }

  // Host-side launch: pick block/grid per rank (2..6) and dispatch.
  inline void execute() const {
    if (m_policy.m_num_tiles == 0) return;
    array_index_type const maxblocks = static_cast<array_index_type>(
        m_policy.space().impl_internal_space_instance()->m_maxBlock);
    if (Policy::rank == 2) {
      dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1);
      dim3 const grid(
          std::min((m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) /
                       block.x,
                   maxblocks),
          std::min((m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) /
                       block.y,
                   maxblocks),
          1);
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else if (Policy::rank == 3) {
      dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1],
                       m_policy.m_tile[2]);
      dim3 const grid(
          std::min((m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) /
                       block.x,
                   maxblocks),
          std::min((m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) /
                       block.y,
                   maxblocks),
          std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) /
                       block.z,
                   maxblocks));
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else if (Policy::rank == 4) {
      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to
      // threadIdx.z
      dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1],
                       m_policy.m_tile[2], m_policy.m_tile[3]);
      dim3 const grid(
          std::min(static_cast<uint32_t>(m_policy.m_tile_end[0] *
                                         m_policy.m_tile_end[1]),
                   static_cast<uint32_t>(maxblocks)),
          std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) /
                       block.y,
                   maxblocks),
          std::min((m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) /
                       block.z,
                   maxblocks));
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else if (Policy::rank == 5) {
      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4
      // to threadIdx.z
      dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1],
                       m_policy.m_tile[2] * m_policy.m_tile[3],
                       m_policy.m_tile[4]);
      dim3 const grid(
          std::min(static_cast<index_type>(m_policy.m_tile_end[0] *
                                           m_policy.m_tile_end[1]),
                   static_cast<index_type>(maxblocks)),
          std::min(static_cast<index_type>(m_policy.m_tile_end[2] *
                                           m_policy.m_tile_end[3]),
                   static_cast<index_type>(maxblocks)),
          std::min((m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) /
                       block.z,
                   maxblocks));
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else if (Policy::rank == 6) {
      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y;
      // id4,id5 to threadIdx.z
      dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1],
                       m_policy.m_tile[2] * m_policy.m_tile[3],
                       m_policy.m_tile[4] * m_policy.m_tile[5]);
      dim3 const grid(std::min(static_cast<index_type>(m_policy.m_tile_end[0] *
                                                       m_policy.m_tile_end[1]),
                               static_cast<index_type>(maxblocks)),
                      std::min(static_cast<index_type>(m_policy.m_tile_end[2] *
                                                       m_policy.m_tile_end[3]),
                               static_cast<index_type>(maxblocks)),
                      std::min(static_cast<index_type>(m_policy.m_tile_end[4] *
                                                       m_policy.m_tile_end[5]),
                               static_cast<index_type>(maxblocks)));
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else {
      Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n");
    }

  }  // end execute

  ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
      : m_functor(arg_functor), m_policy(arg_policy) {}

  // Upper bound on the product of tile dimensions, limited by register
  // usage per SM and the HIP max threads per block.
  template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy& pol, const Functor&) {
    using closure_type =
        ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                    Kokkos::Experimental::HIP>;
    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
        closure_type, LaunchBounds>::get_hip_func_attributes();
    auto const& prop = pol.space().hip_device_prop();
    // Limits due to registers/SM, MDRange doesn't have
    // shared memory constraints
    int const regs_per_sm = prop.regsPerMultiprocessor;
    int const regs_per_thread = attr.numRegs;
    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
    return std::min(
        max_threads_per_sm,
        static_cast<int>(
            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
  }
};

// ParallelReduce
template <class FunctorType, class ReducerType, class...
Traits> +class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType, + Kokkos::Experimental::HIP> { + public: + using Policy = Kokkos::MDRangePolicy<Traits...>; + + private: + using array_index_type = typename Policy::array_index_type; + using index_type = typename Policy::index_type; + + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using LaunchBounds = typename Policy::launch_bounds; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>::type; + + using ValueTraits = + Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + + public: + using pointer_type = typename ValueTraits::pointer_type; + using value_type = typename ValueTraits::value_type; + using reference_type = typename ValueTraits::reference_type; + using functor_type = FunctorType; + using size_type = Experimental::HIP::size_type; + + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == + // blockDim.z == 1 + + const FunctorType m_functor; + const Policy m_policy; // used for workrange and nwork + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + size_type* m_scratch_space; + size_type* m_scratch_flags; + + using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile< + Policy::rank, Policy, FunctorType, WorkTag, reference_type>; + + public: + inline __device__ void exec_range(reference_type update) const { + DeviceIteratePattern(m_policy, m_functor, update).exec_range(); + } + + inline __device__ void operator()() const { + const 
integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> + word_count(ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)) / + sizeof(size_type)); + + { + reference_type value = ValueInit::init( + ReducerConditional::select(m_functor, m_reducer), + Experimental::kokkos_impl_hip_shared_memory<size_type>() + + threadIdx.y * word_count.value); + + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of + // work to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmatically + // equivalent. + + this->exec_range(value); + } + + // Reduce with final value at blockDim.y - 1 location. + // Problem: non power-of-two blockDim + if (::Kokkos::Impl::hip_single_inter_block_reduce_scan< + false, ReducerTypeFwd, WorkTagFwd>( + ReducerConditional::select(m_functor, m_reducer), blockIdx.x, + gridDim.x, Experimental::kokkos_impl_hip_shared_memory<size_type>(), + m_scratch_space, m_scratch_flags)) { + // This is the final block with the final result at the final threads' + // location + size_type* const shared = + Experimental::kokkos_impl_hip_shared_memory<size_type>() + + (blockDim.y - 1) * word_count.value; + size_type* const global = m_result_ptr_device_accessible + ? 
reinterpret_cast<size_type*>(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), shared); + } + + if (Experimental::Impl::HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + // Determine block size constrained by shared memory: + // This is copy/paste from Kokkos_HIP_Parallel_Range + inline unsigned local_block_size(const FunctorType& f) { + unsigned int n = + ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; + int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< + false, FunctorType, WorkTag>(f, n); + using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); + while ( + (n && + (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < + shmem_size)) || + (n > + static_cast<unsigned>( + ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType, + LaunchBounds>( + m_policy.space().impl_internal_space_instance(), attr, f, 1, + shmem_size, 0)))) { + n >>= 1; + shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< + false, FunctorType, WorkTag>(f, n); + } + return n; + } + + inline void execute() { + const int nwork = m_policy.m_num_tiles; + if (nwork) { + int block_size = m_policy.m_prod_tile_dims; + // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions + // Nearest power of two + int exponent_pow_two = std::ceil(std::log2(block_size)); + block_size = std::pow(2, exponent_pow_two); + int suggested_blocksize = local_block_size(m_functor); + + block_size = (block_size > suggested_blocksize) + ? 
block_size + : suggested_blocksize; // Note: block_size must be less + // than or equal to 512 + + m_scratch_space = + ::Kokkos::Experimental::Impl::hip_internal_scratch_space( + ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)) * + block_size /* block_size == max block_count */); + m_scratch_flags = + ::Kokkos::Experimental::Impl::hip_internal_scratch_flags( + sizeof(size_type)); + + // REQUIRED ( 1 , N , 1 ) + const dim3 block(1, block_size, 1); + // Required grid.x <= block.y + const dim3 grid(std::min(static_cast<uint32_t>(block.y), + static_cast<uint32_t>(nwork)), + 1, 1); + + const int shmem = + ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< + false, FunctorType, WorkTag>(m_functor, block.y); + + Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce, + LaunchBounds>( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().fence(); + + if (m_result_ptr) { + const int size = ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)); + DeepCopy<HostSpace, Experimental::HIPSpace>(m_result_ptr, + m_scratch_space, size); + } + } + } else { + if (m_result_ptr) { + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + } + } + } + + template <class ViewType> + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ViewType& arg_result, + typename std::enable_if<Kokkos::is_view<ViewType>::value, + void*>::type = nullptr) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess<Kokkos::Experimental::HIPSpace, + typename ViewType::memory_space>::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr) {} + + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ReducerType& 
reducer) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_result_ptr_device_accessible( + MemorySpaceAccess<Kokkos::Experimental::HIPSpace, + typename ReducerType::result_view_type:: + memory_space>::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr) {} + template <typename Policy, typename Functor> + static int max_tile_size_product(const Policy& pol, const Functor&) { + using closure_type = + ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, + ReducerType, Kokkos::Experimental::HIP>; + hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); + auto const& prop = pol.space().hip_device_prop(); + // Limits due do registers/SM + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast<int>( + Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + } +}; +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7d2825eeb4c6be1d060d1e8d7c3eb67097729ccf --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -0,0 +1,747 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKO_HIP_PARALLEL_RANGE_HPP +#define KOKKO_HIP_PARALLEL_RANGE_HPP + +#include <Kokkos_Parallel.hpp> + +#if defined(__HIPCC__) + +#include <HIP/Kokkos_HIP_BlockSize_Deduction.hpp> +#include <HIP/Kokkos_HIP_KernelLaunch.hpp> +#include <HIP/Kokkos_HIP_ReduceScan.hpp> +#include <HIP/Kokkos_HIP_Shuffle_Reduce.hpp> +#include <impl/Kokkos_Traits.hpp> + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... 
Traits> +class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Experimental::HIP> { + public: + using Policy = Kokkos::RangePolicy<Traits...>; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + ParallelFor() = delete; + ParallelFor& operator=(const ParallelFor&) = delete; + + template <class TagType> + inline __device__ + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const Member i) const { + m_functor(i); + } + + template <class TagType> + inline __device__ + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const Member i) const { + m_functor(TagType(), i); + } + + public: + using functor_type = FunctorType; + + inline __device__ void operator()() const { + const Member work_stride = blockDim.y * gridDim.x; + const Member work_end = m_policy.end(); + + for (Member iwork = + m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; + iwork < work_end; + iwork = iwork < work_end - work_stride ? iwork + work_stride + : work_end) { + this->template exec_range<WorkTag>(iwork); + } + } + + inline void execute() const { + const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + + const int block_size = + LaunchBounds::maxTperB + ? 
LaunchBounds::maxTperB + : ::Kokkos::Experimental::Impl::HIPTraits:: + MaxThreadsPerBlock; // FIXME_HIP Choose block_size better + const dim3 block(1, block_size, 1); + const dim3 grid( + typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); + + Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>( + *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), + false); + } + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class FunctorType, class ReducerType, class... Traits> +class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType, + Kokkos::Experimental::HIP> { + public: + using Policy = Kokkos::RangePolicy<Traits...>; + + private: + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using LaunchBounds = typename Policy::launch_bounds; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>::type; + + using ValueTraits = + Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + + public: + using pointer_type = typename ValueTraits::pointer_type; + using value_type = typename ValueTraits::value_type; + using reference_type = typename ValueTraits::reference_type; + using functor_type = FunctorType; + using size_type = 
Kokkos::Experimental::HIP::size_type; + using index_type = typename Policy::index_type; + + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == + // blockDim.z == 1 + + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + size_type* m_scratch_space = nullptr; + size_type* m_scratch_flags = nullptr; + +#if HIP_VERSION < 401 + static bool constexpr UseShflReduction = + ((sizeof(value_type) > 2 * sizeof(double)) && + static_cast<bool>(ValueTraits::StaticValueSize)); +#else + static bool constexpr UseShflReduction = + static_cast<bool>(ValueTraits::StaticValueSize); +#endif + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Make the exec_range calls call to Reduce::DeviceIterateTile + template <class TagType> + __device__ inline + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const Member& i, reference_type update) const { + m_functor(i, update); + } + + template <class TagType> + __device__ inline + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const Member& i, reference_type update) const { + m_functor(TagType(), i, update); + } + + public: + __device__ inline void operator()() const { + using ReductionTag = + typename std::conditional<UseShflReduction, ShflReductionTag, + SHMEMReductionTag>::type; + run(ReductionTag{}); + } + + __device__ inline void run(SHMEMReductionTag) const { + const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> + word_count(ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)) / + sizeof(size_type)); + + { + reference_type value = ValueInit::init( + ReducerConditional::select(m_functor, m_reducer), + ::Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() + + threadIdx.y * word_count.value); + + 
// Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of + // work to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically + // equivalent. + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range<WorkTag>(iwork, value); + } + } + + // Reduce with final value at blockDim.y - 1 location. + // Shortcut for length zero reduction + bool do_final_reduction = m_policy.begin() == m_policy.end(); + if (!do_final_reduction) + do_final_reduction = hip_single_inter_block_reduce_scan< + false, ReducerTypeFwd, WorkTagFwd>( + ReducerConditional::select(m_functor, m_reducer), blockIdx.x, + gridDim.x, + ::Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(), + m_scratch_space, m_scratch_flags); + if (do_final_reduction) { + // This is the final block with the final result at the final threads' + // location + + size_type* const shared = + ::Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() + + (blockDim.y - 1) * word_count.value; + size_type* const global = m_result_ptr_device_accessible + ? 
reinterpret_cast<size_type*>(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), shared); + } + + if (::Kokkos::Experimental::Impl::HIPTraits::WarpSize < + word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag) const { + value_type value; + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &value); + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of work + // to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically equivalent. + + WorkRange const range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range<WorkTag>(iwork, value); + } + + pointer_type const result = reinterpret_cast<pointer_type>(m_scratch_space); + + int max_active_thread = static_cast<int>(range.end() - range.begin()) < + static_cast<int>(blockDim.y) + ? range.end() - range.begin() + : blockDim.y; + + max_active_thread = + (max_active_thread == 0) ? blockDim.y : max_active_thread; + + value_type init; + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), &init); + if (m_policy.begin() == m_policy.end()) { + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), + reinterpret_cast<void*>(&value)); + pointer_type const final_result = + m_result_ptr_device_accessible ? 
m_result_ptr : result; + *final_result = value; + } else if (Impl::hip_inter_block_shuffle_reduction<ReducerTypeFwd, + ValueJoin, WorkTagFwd>( + value, init, + ValueJoin(ReducerConditional::select(m_functor, m_reducer)), + m_scratch_space, result, m_scratch_flags, + max_active_thread)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), + reinterpret_cast<void*>(&value)); + pointer_type const final_result = + m_result_ptr_device_accessible ? m_result_ptr : result; + *final_result = value; + } + } + } + + // Determine block size constrained by shared memory: + inline unsigned local_block_size(const FunctorType& f) { + unsigned int n = + ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; + int shmem_size = + hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>( + f, n); + using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); + while ( + (n && + (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < + shmem_size)) || + (n > + static_cast<unsigned int>( + ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType, + LaunchBounds>( + m_policy.space().impl_internal_space_instance(), attr, f, 1, + shmem_size, 0)))) { + n >>= 1; + shmem_size = + hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>( + f, n); + } + return n; + } + + inline void execute() { + const index_type nwork = m_policy.end() - m_policy.begin(); + const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value || + ReduceFunctorHasFinal<FunctorType>::value || + !m_result_ptr_host_accessible || + !std::is_same<ReducerType, InvalidType>::value; + if ((nwork > 0) || need_device_set) { + const int block_size = 
local_block_size(m_functor); + KOKKOS_ASSERT(block_size > 0); + + m_scratch_space = + ::Kokkos::Experimental::Impl::hip_internal_scratch_space( + ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)) * + block_size /* block_size == max block_count */); + m_scratch_flags = + ::Kokkos::Experimental::Impl::hip_internal_scratch_flags( + sizeof(size_type)); + + // REQUIRED ( 1 , N , 1 ) + dim3 block(1, block_size, 1); + // Required grid.x <= block.y + dim3 grid(std::min(block.y, static_cast<uint32_t>((nwork + block.y - 1) / + block.y)), + 1, 1); + + if (nwork == 0) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem = + UseShflReduction + ? 0 + : hip_single_inter_block_reduce_scan_shmem<false, FunctorType, + WorkTag>(m_functor, + block.y); + + Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce, + LaunchBounds>( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().impl_internal_space_instance()->fence(); + + if (m_result_ptr) { + const int size = ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)); + DeepCopy<HostSpace, ::Kokkos::Experimental::HIPSpace>( + m_result_ptr, m_scratch_space, size); + } + } + } else { + if (m_result_ptr) { + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + } + } + } + + template <class ViewType> + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ViewType& arg_result, + typename std::enable_if<Kokkos::is_view<ViewType>::value, + void*>::type = nullptr) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess<Kokkos::Experimental::HIPSpace, + typename ViewType::memory_space>::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess<Kokkos::HostSpace, + 
typename ViewType::memory_space>::accessible) {} + + ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, + const ReducerType& reducer) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_result_ptr_device_accessible( + MemorySpaceAccess<Kokkos::Experimental::HIPSpace, + typename ReducerType::result_view_type:: + memory_space>::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess<Kokkos::HostSpace, + typename ReducerType::result_view_type:: + memory_space>::accessible) {} +}; + +template <class FunctorType, class... Traits> +class ParallelScanHIPBase { + public: + using Policy = Kokkos::RangePolicy<Traits...>; + + protected: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using LaunchBounds = typename Policy::launch_bounds; + + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + public: + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + using functor_type = FunctorType; + using size_type = Kokkos::Experimental::HIP::size_type; + using index_type = typename Policy::index_type; + + protected: + // Algorithmic constraints: + // (a) blockDim.y is a power of two + // (b) blockDim.x == blockDim.z == 1 + // (c) gridDim.x <= blockDim.y * blockDim.y + // (d) gridDim.y == gridDim.z == 1 + + const FunctorType m_functor; + const Policy m_policy; + size_type* m_scratch_space = nullptr; + size_type* m_scratch_flags = nullptr; + size_type m_final = false; + int m_grid_x = 0; + + private: + template <class TagType> + __device__ inline + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const Member& i, reference_type 
update, + const bool final_result) const { + m_functor(i, update, final_result); + } + + template <class TagType> + __device__ inline + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const Member& i, reference_type update, + const bool final_result) const { + m_functor(TagType(), i, update, final_result); + } + + //---------------------------------------- + + __device__ inline void initial() const { + const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> + word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); + + size_type* const shared_value = + Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() + + word_count.value * threadIdx.y; + + ValueInit::init(m_functor, shared_value); + + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of work + // to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically equivalent. + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range<WorkTag>( + iwork, ValueOps::reference(shared_value), false); + } + + // Reduce and scan, writing out scan of blocks' totals and block-groups' + // totals. Blocks' scan values are written to 'blockIdx.x' location. 
+ // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < + // gridDim.x + hip_single_inter_block_reduce_scan<true, FunctorType, WorkTag>( + m_functor, blockIdx.x, gridDim.x, + Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(), + m_scratch_space, m_scratch_flags); + } + + //---------------------------------------- + + __device__ inline void final() const { + const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> + word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); + + // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , + // value[2] , ... } + size_type* const shared_data = + Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(); + size_type* const shared_prefix = + shared_data + word_count.value * threadIdx.y; + size_type* const shared_accum = + shared_data + word_count.value * (blockDim.y + 1); + + // Starting value for this thread block is the previous block's total. + if (blockIdx.x) { + size_type* const block_total = + m_scratch_space + word_count.value * (blockIdx.x - 1); + for (unsigned i = threadIdx.y; i < word_count.value; ++i) { + shared_accum[i] = block_total[i]; + } + } else if (0 == threadIdx.y) { + ValueInit::init(m_functor, shared_accum); + } + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (typename Policy::member_type iwork_base = range.begin(); + iwork_base < range.end(); iwork_base += blockDim.y) { + const typename Policy::member_type iwork = iwork_base + threadIdx.y; + + __syncthreads(); // Don't overwrite previous iteration values until they + // are used + + ValueInit::init(m_functor, shared_prefix + word_count.value); + + // Copy previous block's accumulation total into thread[0] prefix and + // inclusive scan value of this block + for (unsigned i = threadIdx.y; i < word_count.value; ++i) { + shared_data[i + word_count.value] = shared_data[i] = shared_accum[i]; + } + + // Make sure the write is seen by all 
threads + __threadfence_block(); + + // Call functor to accumulate inclusive scan value for this work item + const bool doWork = (iwork < range.end()); + if (doWork) { + this->template exec_range<WorkTag>( + iwork, ValueOps::reference(shared_prefix + word_count.value), + false); + } + + // Scan block values into locations shared_data[1..blockDim.y] + hip_intra_block_reduce_scan<true, FunctorType, WorkTag>( + m_functor, + typename ValueTraits::pointer_type(shared_data + word_count.value)); + + { + size_type* const block_total = + shared_data + word_count.value * blockDim.y; + for (unsigned i = threadIdx.y; i < word_count.value; ++i) { + shared_accum[i] = block_total[i]; + } + } + + // Call functor with exclusive scan value + if (doWork) { + this->template exec_range<WorkTag>( + iwork, ValueOps::reference(shared_prefix), true); + } + } + } + + public: + //---------------------------------------- + + __device__ inline void operator()() const { + if (!m_final) { + initial(); + } else { + final(); + } + } + + // Determine block size constrained by shared memory: + inline unsigned local_block_size(const FunctorType& f) { + // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or + // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y + // + // TODO check best option + + unsigned n = Experimental::Impl::HIPTraits::WarpSize * 4; + while (n && static_cast<unsigned>(m_policy.space() + .impl_internal_space_instance() + ->m_maxShmemPerBlock) < + hip_single_inter_block_reduce_scan_shmem<false, FunctorType, + WorkTag>(f, n)) { + n >>= 1; + } + return n; + } + + inline void impl_execute() { + const index_type nwork = m_policy.end() - m_policy.begin(); + if (nwork) { + // FIXME_HIP we cannot choose it larger for large work sizes to work + // correctly, the unit tests fail with wrong results + const int gridMaxComputeCapability_2x = 0x01fff; + + const int block_size = static_cast<int>(local_block_size(m_functor)); + KOKKOS_ASSERT(block_size > 0); + + const int 
grid_max = + std::min(block_size * block_size, gridMaxComputeCapability_2x); + + // At most 'max_grid' blocks: + const int max_grid = + std::min<int>(grid_max, (nwork + block_size - 1) / block_size); + + // How much work per block: + const int work_per_block = (nwork + max_grid - 1) / max_grid; + + // How many block are really needed for this much work: + m_grid_x = (nwork + work_per_block - 1) / work_per_block; + + m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space( + ValueTraits::value_size(m_functor) * m_grid_x); + m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags( + sizeof(size_type) * 1); + + dim3 grid(m_grid_x, 1, 1); + dim3 block(1, block_size, 1); // REQUIRED DIMENSIONS ( 1 , N , 1 ) + const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2); + + m_final = false; + Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase, + LaunchBounds>( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + + m_final = true; + Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase, + LaunchBounds>( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + } + } + + ParallelScanHIPBase(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +template <class FunctorType, class... 
Traits> +class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Experimental::HIP> + : private ParallelScanHIPBase<FunctorType, Traits...> { + public: + using Base = ParallelScanHIPBase<FunctorType, Traits...>; + using Base::operator(); + + inline void execute() { Base::impl_execute(); } + + ParallelScan(const FunctorType& arg_functor, + const typename Base::Policy& arg_policy) + : Base(arg_functor, arg_policy) {} +}; + +//---------------------------------------------------------------------------- + +template <class FunctorType, class ReturnType, class... Traits> +class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, + ReturnType, Kokkos::Experimental::HIP> + : private ParallelScanHIPBase<FunctorType, Traits...> { + public: + using Base = ParallelScanHIPBase<FunctorType, Traits...>; + using Base::operator(); + + ReturnType& m_returnvalue; + + inline void execute() { + Base::impl_execute(); + + const auto nwork = Base::m_policy.end() - Base::m_policy.begin(); + if (nwork) { + const int size = Base::ValueTraits::value_size(Base::m_functor); + DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace>( + &m_returnvalue, + Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int), + size); + } + } + + ParallelScanWithTotal(const FunctorType& arg_functor, + const typename Base::Policy& arg_policy, + ReturnType& arg_returnvalue) + : Base(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp new file mode 100644 index 0000000000000000000000000000000000000000..96c3ff2a751027a4eb05b03c99487207c9acf708 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -0,0 +1,1070 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKO_HIP_PARALLEL_TEAM_HPP +#define KOKKO_HIP_PARALLEL_TEAM_HPP + +#include <Kokkos_Parallel.hpp> + +#if defined(__HIPCC__) + +#include <HIP/Kokkos_HIP_KernelLaunch.hpp> +#include <HIP/Kokkos_HIP_Locks.hpp> +#include <HIP/Kokkos_HIP_Team.hpp> +#include <HIP/Kokkos_HIP_Instance.hpp> + +namespace Kokkos { +namespace Impl { +template <typename... Properties> +class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...> + : public PolicyTraits<Properties...> { + public: + using execution_policy = TeamPolicyInternal; + + using traits = PolicyTraits<Properties...>; + + template <typename ExecSpace, typename... OtherProperties> + friend class TeamPolicyInternal; + + private: + static int constexpr MAX_WARP = 8; + + typename traits::execution_space m_space; + int m_league_size; + int m_team_size; + int m_vector_length; + int m_team_scratch_size[2]; + int m_thread_scratch_size[2]; + int m_chunk_size; + bool m_tune_team_size; + bool m_tune_vector_length; + + public: + using execution_space = Kokkos::Experimental::HIP; + + template <class... 
OtherProperties> + TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_vector_length = p.m_vector_length; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + m_space = p.m_space; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; + } + + template <typename FunctorType> + int team_size_max(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, + typename traits::launch_bounds>::get_hip_func_attributes(); + int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size< + FunctorType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), attr, f, + static_cast<size_t>(impl_vector_length()), + static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double), + static_cast<size_t>(thread_scratch_size(0)) + sizeof(double)); + return block_size / impl_vector_length(); + } + + template <class FunctorType> + inline int team_size_max(const FunctorType& f, + const ParallelReduceTag&) const { + using functor_analysis_type = + Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, + TeamPolicyInternal, FunctorType>; + using reducer_type = typename Impl::ParallelReduceReturnValue< + void, typename functor_analysis_type::value_type, + FunctorType>::reducer_type; + using closure_type = + Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, + reducer_type>; + return internal_team_size_max<closure_type>(f); + } + + template <class FunctorType, class ReducerType> + inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/, + 
const ParallelReduceTag&) const { + using closure_type = + Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, + ReducerType>; + return internal_team_size_max<closure_type>(f); + } + + template <typename FunctorType> + int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, + typename traits::launch_bounds>::get_hip_func_attributes(); + int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size< + FunctorType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), attr, f, + static_cast<size_t>(impl_vector_length()), + static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double), + static_cast<size_t>(thread_scratch_size(0)) + sizeof(double)); + return block_size / impl_vector_length(); + } + + template <typename FunctorType> + inline int team_size_recommended(FunctorType const& f, + ParallelReduceTag const&) const { + using functor_analysis_type = + Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, + TeamPolicyInternal, FunctorType>; + using reducer_type = typename Impl::ParallelReduceReturnValue< + void, typename functor_analysis_type::value_type, + FunctorType>::reducer_type; + using closure_type = + Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, + reducer_type>; + return internal_team_size_recommended<closure_type>(f); + } + + template <class FunctorType, class ReducerType> + int team_size_recommended(FunctorType const& f, ReducerType const&, + ParallelReduceTag const&) const { + using closure_type = + Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>, + ReducerType>; + return internal_team_size_recommended<closure_type>(f); + } + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline bool impl_auto_team_size() const { return m_tune_team_size; } + 
  // Hardware upper bound on the vector length: one full wavefront.
  static int vector_length_max() {
    return ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
  }

  // Clamp the requested vector length to the wavefront size, and round any
  // non power-of-two request DOWN to the largest power of two below it
  // (e.g. 5 -> 4, 63 -> 32).
  static int verify_requested_vector_length(int requested_vector_length) {
    int test_vector_length =
        std::min(requested_vector_length, vector_length_max());

    // Allow only power-of-two vector_length
    if (!(is_integral_power_of_two(test_vector_length))) {
      int test_pow2 = 1;
      int constexpr warp_size = Experimental::Impl::HIPTraits::WarpSize;
      while (test_pow2 < warp_size) {
        test_pow2 <<= 1;
        if (test_pow2 > test_vector_length) {
          break;
        }
      }
      // test_pow2 is now the first power of two above the request (or the
      // warp size); halving yields the largest power of two not above it.
      test_vector_length = test_pow2 >> 1;
    }

    return test_vector_length;
  }

  // Per-level scratch limits.
  // NOTE(review): 1024 * 40 is 40kB — the original comment claimed 48kB.
  static int scratch_size_max(int level) {
    return (
        level == 0 ? 1024 * 40 :  // FIXME_HIP arbitrarily setting this to 40kB
            20 * 1024 * 1024);    // FIXME_HIP arbitrarily setting this to 20MB
  }
  inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
  inline void impl_set_team_size(size_t size) { m_team_size = size; }
  int impl_vector_length() const { return m_vector_length; }
  KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }

  int team_size() const { return m_team_size; }

  int league_size() const { return m_league_size; }

  // Total level-`level` scratch needed by one team: the team-shared part plus
  // the per-thread part times the team size (defaults to the policy's own
  // team size when team_size_ is negative).
  int scratch_size(int level, int team_size_ = -1) const {
    if (team_size_ < 0) team_size_ = m_team_size;
    return m_team_scratch_size[level] +
           team_size_ * m_thread_scratch_size[level];
  }

  int team_scratch_size(int level) const { return m_team_scratch_size[level]; }

  int thread_scratch_size(int level) const {
    return m_thread_scratch_size[level];
  }

  typename traits::execution_space space() const { return m_space; }

  // Default policy: empty league, team size left to be determined (-1),
  // chunk size of one wavefront, no auto-tuning.
  TeamPolicyInternal()
      : m_space(typename traits::execution_space()),
        m_league_size(0),
        m_team_size(-1),
        m_vector_length(0),
        m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize),
        m_tune_team_size(false),
        m_tune_vector_length(false) {}

  /**
 \brief Specify league size, request team size */
  // Primary constructor: a non-positive team_size_request or
  // vector_length_request marks that extent as auto-tunable; the vector
  // length is always normalized to a power of two via
  // verify_requested_vector_length.
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     int team_size_request, int vector_length_request = 1)
      : m_space(space_),
        m_league_size(league_size_),
        m_team_size(team_size_request),
        m_vector_length(
            (vector_length_request > 0)
                ? verify_requested_vector_length(vector_length_request)
                : (verify_requested_vector_length(1))),
        m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_chunk_size(::Kokkos::Experimental::Impl::HIPTraits::WarpSize),
        m_tune_team_size(bool(team_size_request <= 0)),
        m_tune_vector_length(bool(vector_length_request <= 0)) {
    // Make sure league size is permissible
    if (league_size_ >=
        static_cast<int>(
            ::Kokkos::Experimental::Impl::hip_internal_maximum_grid_count()))
      Impl::throw_runtime_exception(
          "Requested too large league_size for TeamPolicy on HIP execution "
          "space.");

    // Make sure total block size is permissible
    if (m_team_size * m_vector_length > 1024) {
      Impl::throw_runtime_exception(
          std::string("Kokkos::TeamPolicy< HIP > the team size is too large. "
                      "Team size x vector length must be smaller than 1024."));
    }
  }

  /** \brief Specify league size, request team size */
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */,
                     int vector_length_request = 1)
      : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
  // FLAG
  /** \brief Specify league size and team size, request vector length*/
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     int team_size_request,
                     const Kokkos::AUTO_t& /* vector_length_request */
                     )
      : TeamPolicyInternal(space_, league_size_, team_size_request, -1)

  {}

  /** \brief Specify league size, request team size and vector length*/
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */,
                     const Kokkos::AUTO_t& /* vector_length_request */

                     )
      : TeamPolicyInternal(space_, league_size_, -1, -1)

  {}

  // Convenience overloads that default-construct the execution space.
  TeamPolicyInternal(int league_size_, int team_size_request,
                     int vector_length_request = 1)
      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
                           team_size_request, vector_length_request) {}

  TeamPolicyInternal(int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */,
                     int vector_length_request = 1)
      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
                           vector_length_request) {}

  /** \brief Specify league size and team size, request vector length*/
  TeamPolicyInternal(int league_size_, int team_size_request,
                     const Kokkos::AUTO_t& /* vector_length_request */

                     )
      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
                           team_size_request, -1)

  {}

  /** \brief Specify league size, request team size and vector length*/
  TeamPolicyInternal(int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */,
                     const Kokkos::AUTO_t& /* vector_length_request */

                     )
      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
                           -1) {}

  int chunk_size() const { return m_chunk_size; }

  TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) {
    m_chunk_size = chunk_size_;
    return *this;
  }

  /** \brief set per team scratch size for a specific level of the scratch
   * hierarchy */
  TeamPolicyInternal& set_scratch_size(int level,
                                       PerTeamValue const& per_team) {
    m_team_scratch_size[level] = per_team.value;
    return *this;
  }

  /** \brief set per thread scratch size for a specific level of the scratch
   * hierarchy */
  TeamPolicyInternal& set_scratch_size(int level,
                                       PerThreadValue const& per_thread) {
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }

  /** \brief set per thread and per team scratch size for a specific level of
   * the scratch hierarchy */
  TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team,
                                       PerThreadValue const& per_thread) {
    m_team_scratch_size[level] = per_team.value;
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }

  using member_type = Kokkos::Impl::HIPTeamMember;

 protected:
  // Shared implementation for team_size_max / team_size_recommended:
  // block_size_callable is one of the hip_get_{max,opt}_block_size function
  // templates.  For functors with a runtime-sized reduction value the value
  // size is added to the per-thread shared-memory footprint.
  template <class ClosureType, class FunctorType, class BlockSizeCallable>
  int internal_team_size_common(const FunctorType& f,
                                BlockSizeCallable&& block_size_callable) const {
    using closure_type = ClosureType;
    using functor_value_traits =
        Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;

    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
        closure_type,
        typename traits::launch_bounds>::get_hip_func_attributes();
    const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
        space().impl_internal_space_instance(), attr, f,
        static_cast<size_t>(impl_vector_length()),
        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double) +
            ((functor_value_traits::StaticValueSize != 0)
                 ? 0
                 : functor_value_traits::value_size(f)));
    KOKKOS_ASSERT(block_size > 0);

    // Currently we require Power-of-2 team size for reductions.
    // Round the block size down to a power of two before converting to a
    // team size.
    int p2 = 1;
    while (p2 <= block_size) p2 *= 2;
    p2 /= 2;
    return p2 / impl_vector_length();
  }

  template <class ClosureType, class FunctorType>
  int internal_team_size_max(const FunctorType& f) const {
    return internal_team_size_common<ClosureType>(
        f, ::Kokkos::Experimental::Impl::hip_get_max_block_size<
               FunctorType, typename traits::launch_bounds>);
  }

  template <class ClosureType, class FunctorType>
  int internal_team_size_recommended(const FunctorType& f) const {
    return internal_team_size_common<ClosureType>(
        f, ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
               FunctorType, typename traits::launch_bounds>);
  }
};

// Team-level parallel_for for the HIP backend: each HIP block runs one or
// more teams (grid-strided over the league), with blockDim.x carrying the
// vector lanes and blockDim.y the team threads.
template <typename FunctorType, typename... Properties>
class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                  Kokkos::Experimental::HIP> {
 public:
  using Policy = TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>;
  using functor_type = FunctorType;
  using size_type = ::Kokkos::Experimental::HIP::size_type;

 private:
  using member_type = typename Policy::member_type;
  using work_tag = typename Policy::work_tag;
  using launch_bounds = typename Policy::launch_bounds;

  // Algorithmic constraints: blockDim.y is a power of two AND
  // blockDim.y == blockDim.z == 1 shared memory utilization:
  //
  //  [ team   reduce space ]
  //  [ team   shared space ]

  FunctorType const m_functor;
  Policy const m_policy;
  size_type const m_league_size;
  int m_team_size;
  size_type const m_vector_size;
  int m_shmem_begin;
  int m_shmem_size;
  void* m_scratch_ptr[2];
  int m_scratch_size[2];
  // Only let one ParallelFor/Reduce modify the team scratch memory. The
  // constructor acquires the mutex which is released in the destructor.
  std::unique_lock<std::mutex> m_scratch_lock;

  // Tagged/untagged dispatch to the user functor for one team member.
  template <typename TagType>
  __device__ inline
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_team(const member_type& member) const {
    m_functor(member);
  }

  template <typename TagType>
  __device__ inline
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_team(const member_type& member) const {
    m_functor(TagType(), member);
  }

 public:
  // Device-side entry point.  When level-1 scratch is in use, each block
  // first claims a slot in the global lock/scratch array via atomicCAS so
  // that concurrent blocks use disjoint scratch regions; the slot is
  // released at the end.
  __device__ inline void operator()() const {
    // Iterate this block through the league
    int64_t threadid = 0;
    if (m_scratch_size[1] > 0) {
      __shared__ int64_t base_thread_id;
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        // Start probing at a block-dependent offset to reduce contention.
        threadid = (blockIdx.x * blockDim.z + threadIdx.z) %
                   (Kokkos::Impl::g_device_hip_lock_arrays.n /
                    (blockDim.x * blockDim.y));
        threadid *= blockDim.x * blockDim.y;
        int done = 0;
        while (!done) {
          done = (0 ==
                  atomicCAS(
                      &Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid],
                      0, 1));
          if (!done) {
            threadid += blockDim.x * blockDim.y;
            if (int64_t(threadid + blockDim.x * blockDim.y) >=
                int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n))
              threadid = 0;
          }
        }
        base_thread_id = threadid;
      }
      __syncthreads();
      threadid = base_thread_id;
    }

    // Grid-stride over league ranks: league_rank advances by gridDim.x.
    int const int_league_size = static_cast<int>(m_league_size);
    for (int league_rank = blockIdx.x; league_rank < int_league_size;
         league_rank += gridDim.x) {
      this->template exec_team<work_tag>(typename Policy::member_type(
          ::Kokkos::Experimental::kokkos_impl_hip_shared_memory<void>(),
          m_shmem_begin, m_shmem_size,
          static_cast<void*>(static_cast<char*>(m_scratch_ptr[1]) +
                             ptrdiff_t(threadid / (blockDim.x * blockDim.y)) *
                                 m_scratch_size[1]),
          m_scratch_size[1], league_rank, m_league_size));
    }
    if (m_scratch_size[1] > 0) {
      __syncthreads();
      if (threadIdx.x == 0 && threadIdx.y == 0)
        Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid] = 0;
    }
  }

  // Host-side launch: one block per league rank (x), vector lanes in
  // blockDim.x, team threads in blockDim.y.
  inline void execute() const {
    int64_t const shmem_size_total = m_shmem_begin + m_shmem_size;
    dim3 const grid(static_cast<int>(m_league_size), 1, 1);
    dim3 const block(static_cast<int>(m_vector_size),
                     static_cast<int>(m_team_size), 1);

    ::Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, launch_bounds>(
        *this, grid, block, shmem_size_total,
        m_policy.space().impl_internal_space_instance(),
        true);  // copy to device and execute
  }

  // Resolve the team size (auto -> occupancy-optimal), size the shared-memory
  // and scratch regions, and validate the configuration; throws on
  // insufficient shared memory or an oversized team.
  ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_league_size(arg_policy.league_size()),
        m_team_size(arg_policy.team_size()),
        m_vector_size(arg_policy.impl_vector_length()),
        m_scratch_lock(m_policy.space()
                           .impl_internal_space_instance()
                           ->m_team_scratch_mutex) {
    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
        ParallelFor, launch_bounds>::get_hip_func_attributes();
    // A negative team size means "auto": pick the occupancy-optimal block
    // size and convert it to a team size.
    m_team_size =
        m_team_size >= 0
            ? m_team_size
            : ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
                  FunctorType, launch_bounds>(
                  m_policy.space().impl_internal_space_instance(), attr,
                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
                  m_policy.thread_scratch_size(0)) /
                  m_vector_size;

    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
    m_shmem_size =
        (m_policy.scratch_size(0, m_team_size) +
         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
    m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);

    // Functor's reduce memory, team scan memory, and team shared memory depend
    // upon team size.
    m_scratch_ptr[0] = nullptr;
    // Level-1 scratch is sized for the maximum number of concurrently
    // resident teams (device concurrency / threads-per-team).
    m_scratch_ptr[1] =
        m_team_size <= 0
            ? nullptr
            : m_policy.space()
                  .impl_internal_space_instance()
                  ->resize_team_scratch_space(
                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
                      static_cast<ptrdiff_t>(
                          ::Kokkos::Experimental::HIP::concurrency() /
                          (m_team_size * m_vector_size)));

    int const shmem_size_total = m_shmem_begin + m_shmem_size;
    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
        shmem_size_total) {
      printf(
          "%i %i\n",
          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
          shmem_size_total);
      Kokkos::Impl::throw_runtime_exception(std::string(
          "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory"));
    }

    if (static_cast<int>(m_team_size) >
        static_cast<int>(
            ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
                                                                 launch_bounds>(
                m_policy.space().impl_internal_space_instance(), attr,
                arg_functor, arg_policy.impl_vector_length(),
                arg_policy.team_scratch_size(0),
                arg_policy.thread_scratch_size(0)) /
            arg_policy.impl_vector_length())) {
      Kokkos::Impl::throw_runtime_exception(std::string(
          "Kokkos::Impl::ParallelFor< HIP > requested too large team size."));
    }
  }
};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

template <class FunctorType, class ReducerType, class...
Properties>
// Team-level parallel_reduce for the HIP backend.  Two reduction strategies:
// a shuffle-based path (UseShflReduction, for statically-sized values) and a
// shared-memory inter-block reduce/scan path for runtime-sized values.
class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                     ReducerType, Kokkos::Experimental::HIP> {
 public:
  using Policy = TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>;

 private:
  using member_type = typename Policy::member_type;
  using work_tag = typename Policy::work_tag;
  using launch_bounds = typename Policy::launch_bounds;

  // When no explicit reducer is given (ReducerType == InvalidType), the
  // functor itself provides init/join/final and the work tag is forwarded;
  // otherwise the reducer provides them and the tag is dropped.
  using reducer_conditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using reducer_type_fwd = typename reducer_conditional::type;
  using work_tag_fwd =
      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                                  work_tag, void>::type;

  using value_traits =
      Kokkos::Impl::FunctorValueTraits<reducer_type_fwd, work_tag_fwd>;
  using value_init =
      Kokkos::Impl::FunctorValueInit<reducer_type_fwd, work_tag_fwd>;
  using value_join =
      Kokkos::Impl::FunctorValueJoin<reducer_type_fwd, work_tag_fwd>;

  using pointer_type = typename value_traits::pointer_type;
  using reference_type = typename value_traits::reference_type;
  using value_type = typename value_traits::value_type;

 public:
  using functor_type = FunctorType;
  using size_type = Kokkos::Experimental::HIP::size_type;

  // Shuffle-based reduction is only possible when the value size is known at
  // compile time.
  static int constexpr UseShflReduction = (value_traits::StaticValueSize != 0);

 private:
  struct ShflReductionTag {};
  struct SHMEMReductionTag {};

  // Algorithmic constraints: blockDim.y is a power of two AND
  // blockDim.y == blockDim.z == 1 shared memory utilization:
  //
  //  [ global reduce space ]
  //  [ team   reduce space ]
  //  [ team   shared space ]
  //

  const FunctorType m_functor;
  const Policy m_policy;
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;
  const bool m_result_ptr_device_accessible;
  const bool m_result_ptr_host_accessible;
  size_type* m_scratch_space;
  size_type* m_scratch_flags;
  size_type m_team_begin;
  size_type m_shmem_begin;
  size_type m_shmem_size;
  void* m_scratch_ptr[2];
  int m_scratch_size[2];
  const size_type m_league_size;
  int m_team_size;
  const size_type m_vector_size;
  // Only let one ParallelFor/Reduce modify the team scratch memory. The
  // constructor acquires the mutex which is released in the destructor.
  std::unique_lock<std::mutex> m_scratch_lock;

  // Tagged/untagged dispatch to the user functor for one team member.
  template <class TagType>
  __device__ inline
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_team(member_type const& member, reference_type update) const {
    m_functor(member, update);
  }

  template <class TagType>
  __device__ inline
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_team(member_type const& member, reference_type update) const {
    m_functor(TagType(), member, update);
  }

  // Grid-stride this block over the league ranks, accumulating into value.
  __device__ inline void iterate_through_league(int const threadid,
                                                reference_type value) const {
    int const int_league_size = static_cast<int>(m_league_size);
    for (int league_rank = blockIdx.x; league_rank < int_league_size;
         league_rank += gridDim.x) {
      this->template exec_team<work_tag>(
          member_type(
              Kokkos::Experimental::kokkos_impl_hip_shared_memory<char>() +
                  m_team_begin,
              m_shmem_begin, m_shmem_size,
              reinterpret_cast<void*>(
                  reinterpret_cast<char*>(m_scratch_ptr[1]) +
                  static_cast<ptrdiff_t>(threadid / (blockDim.x * blockDim.y)) *
                      m_scratch_size[1]),
              m_scratch_size[1], league_rank, m_league_size),
          value);
    }
  }

 public:
  // Device-side entry point: optionally claim a level-1 scratch slot via the
  // global lock array (same protocol as ParallelFor), then dispatch to the
  // shuffle or shared-memory reduction path.
  __device__ inline void operator()() const {
    int64_t threadid = 0;
    if (m_scratch_size[1] > 0) {
      __shared__ int64_t base_thread_id;
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        threadid = (blockIdx.x * blockDim.z + threadIdx.z) %
                   (Kokkos::Impl::g_device_hip_lock_arrays.n /
                    (blockDim.x * blockDim.y));
        threadid *= blockDim.x * blockDim.y;
        int done = 0;
        while (!done) {
          done = (0 ==
                  atomicCAS(
                      &Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid],
                      0, 1));
          if (!done) {
            threadid += blockDim.x * blockDim.y;
            if (static_cast<int64_t>(threadid + blockDim.x * blockDim.y) >=
                static_cast<int64_t>(Kokkos::Impl::g_device_hip_lock_arrays.n))
              threadid = 0;
          }
        }
        base_thread_id = threadid;
      }
      __syncthreads();
      threadid = base_thread_id;
    }

    using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
                                            SHMEMReductionTag>;
    run(ReductionTag{}, threadid);

    if (m_scratch_size[1] > 0) {
      __syncthreads();
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        Kokkos::Impl::g_device_hip_lock_arrays.scratch[threadid] = 0;
      }
    }
  }

  // Shared-memory path: per-thread partial values live in shared memory
  // (word_count size_type words each); the last block to finish performs the
  // final() and writes the result to global memory.
  __device__ inline void run(SHMEMReductionTag, int const threadid) const {
    integral_nonzero_constant<size_type, value_traits::StaticValueSize /
                                             sizeof(size_type)> const
        word_count(value_traits::value_size(
                       reducer_conditional::select(m_functor, m_reducer)) /
                   sizeof(size_type));

    reference_type value = value_init::init(
        reducer_conditional::select(m_functor, m_reducer),
        Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
            threadIdx.y * word_count.value);

    // Iterate this block through the league
    iterate_through_league(threadid, value);

    // Reduce with final value at blockDim.y - 1 location.
    bool do_final_reduce = (m_league_size == 0);
    if (!do_final_reduce)
      do_final_reduce =
          hip_single_inter_block_reduce_scan<false, FunctorType, work_tag>(
              reducer_conditional::select(m_functor, m_reducer), blockIdx.x,
              gridDim.x,
              Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(),
              m_scratch_space, m_scratch_flags);
    if (do_final_reduce) {
      // This is the final block with the final result at the final threads'
      // location

      size_type* const shared =
          Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() +
          (blockDim.y - 1) * word_count.value;
      size_type* const global = m_result_ptr_device_accessible
                                    ? reinterpret_cast<size_type*>(m_result_ptr)
                                    : m_scratch_space;

      if (threadIdx.y == 0) {
        Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final(
            reducer_conditional::select(m_functor, m_reducer), shared);
      }

      if (Kokkos::Experimental::Impl::HIPTraits::WarpSize < word_count.value) {
        __syncthreads();
      }

      for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
        global[i] = shared[i];
      }
    }
  }

  // Shuffle path: per-thread value kept in registers; inter-block combine via
  // hip_inter_block_shuffle_reduction, with the winning block applying
  // final() and storing the result.
  __device__ inline void run(ShflReductionTag, int const threadid) const {
    value_type value;
    value_init::init(reducer_conditional::select(m_functor, m_reducer), &value);

    // Iterate this block through the league
    iterate_through_league(threadid, value);

    pointer_type const result =
        m_result_ptr_device_accessible
            ? m_result_ptr
            : reinterpret_cast<pointer_type>(m_scratch_space);

    value_type init;
    value_init::init(reducer_conditional::select(m_functor, m_reducer), &init);
    if (m_league_size == 0) {
      // Empty league: the result is just the identity after final().
      Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final(
          reducer_conditional::select(m_functor, m_reducer),
          reinterpret_cast<void*>(&value));
      *result = value;
    } else if (Impl::hip_inter_block_shuffle_reduction<FunctorType, value_join,
                                                       work_tag>(
                   value, init,
                   value_join(
                       reducer_conditional::select(m_functor, m_reducer)),
                   m_scratch_space, result, m_scratch_flags, blockDim.y)) {
      unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x;
      if (id == 0) {
        Kokkos::Impl::FunctorFinal<reducer_type_fwd, work_tag_fwd>::final(
            reducer_conditional::select(m_functor, m_reducer),
            reinterpret_cast<void*>(&value));
        *result = value;
      }
    }
  }

  // Host-side launch.  Even with zero work a kernel may be required (functor
  // init/final, device-only result, explicit reducer); otherwise the result
  // is initialized on the host.
  inline void execute() {
    const int nwork = m_league_size * m_team_size;
    const bool need_device_set = ReduceFunctorHasInit<FunctorType>::value ||
                                 ReduceFunctorHasFinal<FunctorType>::value ||
                                 !m_result_ptr_host_accessible ||
                                 !std::is_same<ReducerType, InvalidType>::value;
    if ((nwork > 0) || need_device_set) {
      const int block_count =
          UseShflReduction
              ? std::min(
                    m_league_size,
                    size_type(1024 *
                              Kokkos::Experimental::Impl::HIPTraits::WarpSize))
              : std::min(static_cast<int>(m_league_size), m_team_size);

      m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space(
          value_traits::value_size(
              reducer_conditional::select(m_functor, m_reducer)) *
          block_count);
      m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags(
          sizeof(size_type));

      dim3 block(m_vector_size, m_team_size, 1);
      dim3 grid(block_count, 1, 1);
      if (nwork == 0) {
        block = dim3(1, 1, 1);
        grid = dim3(1, 1, 1);
      }
      const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;

      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
                                                    launch_bounds>(
          *this, grid, block, shmem_size_total,
          m_policy.space().impl_internal_space_instance(),
          true);  // copy to device and execute

      if (!m_result_ptr_device_accessible) {
        // Result lives in device scratch: fence, then copy it back.
        m_policy.space().impl_internal_space_instance()->fence();

        if (m_result_ptr) {
          const int size = value_traits::value_size(
              reducer_conditional::select(m_functor, m_reducer));
          DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace>(
              m_result_ptr, m_scratch_space, size);
        }
      }
    } else {
      if (m_result_ptr) {
        value_init::init(reducer_conditional::select(m_functor, m_reducer),
                         m_result_ptr);
      }
    }
  }

  // Constructor for parallel_reduce into a result View (no explicit reducer).
  // Resolves the team size, sizes all shared/scratch regions, and validates
  // the configuration (power-of-two team size and minimum wavefront-sized
  // teams are required for the non-shuffle path).
  template <class ViewType>
  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
                 ViewType const& arg_result,
                 typename std::enable_if<Kokkos::is_view<ViewType>::value,
                                         void*>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result.data()),
        m_result_ptr_device_accessible(
            MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
                              typename ViewType::memory_space>::accessible),
        m_result_ptr_host_accessible(
            MemorySpaceAccess<Kokkos::HostSpace,
                              typename ViewType::memory_space>::accessible),
        m_scratch_space(nullptr),
        m_scratch_flags(nullptr),
        m_team_begin(0),
        m_shmem_begin(0),
        m_shmem_size(0),
        m_scratch_ptr{nullptr, nullptr},
        m_league_size(arg_policy.league_size()),
        m_team_size(arg_policy.team_size()),
        m_vector_size(arg_policy.impl_vector_length()),
        m_scratch_lock(m_policy.space()
                           .impl_internal_space_instance()
                           ->m_team_scratch_mutex) {
    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
        ParallelReduce, launch_bounds>::get_hip_func_attributes();
    m_team_size =
        m_team_size >= 0
            ? m_team_size
            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
                                                                 launch_bounds>(
                  m_policy.space().impl_internal_space_instance(), attr,
                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
                  m_policy.thread_scratch_size(0)) /
                  m_vector_size;

    // Shared-memory layout: [inter-block reduce | team reduce | team shared];
    // the shuffle path needs no inter-block reduce region.
    m_team_begin =
        UseShflReduction
            ? 0
            : hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                       work_tag>(arg_functor,
                                                                 m_team_size);
    m_shmem_begin = sizeof(double) * (m_team_size + 2);
    m_shmem_size =
        m_policy.scratch_size(0, m_team_size) +
        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
    m_scratch_size[0] = m_shmem_size;
    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
    m_scratch_ptr[1] =
        m_team_size <= 0
            ? nullptr
            : m_policy.space()
                  .impl_internal_space_instance()
                  ->resize_team_scratch_space(
                      static_cast<std::int64_t>(m_scratch_size[1]) *
                      (static_cast<std::int64_t>(
                          Kokkos::Experimental::HIP::concurrency() /
                          (m_team_size * m_vector_size))));

    // The global parallel_reduce does not support vector_length other than 1 at
    // the moment
    if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
      Impl::throw_runtime_exception(
          "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
          "greater than 1 is not currently supported for HIP for dynamic "
          "sized reduction types.");

    if ((m_team_size < Kokkos::Experimental::Impl::HIPTraits::WarpSize) &&
        !UseShflReduction)
      Impl::throw_runtime_exception(
          "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
          "than 64 is not currently supported with HIP for dynamic sized "
          "reduction types.");

    // Functor's reduce memory, team scan memory, and team shared memory depend
    // upon team size.

    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;

    if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
        !UseShflReduction) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size"));
    }

    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
        shmem_size_total) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much "
                      "L0 scratch memory"));
    }

    if (static_cast<int>(m_team_size) >
        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                      "large team size."));
    }
  }

  // Constructor for parallel_reduce with an explicit reducer; the result
  // pointer comes from the reducer's view.  Mirrors the View constructor's
  // setup and validation.
  ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy,
                 ReducerType const& reducer)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()),
        m_result_ptr_device_accessible(
            MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
                              typename ReducerType::result_view_type::
                                  memory_space>::accessible),
        m_result_ptr_host_accessible(
            MemorySpaceAccess<Kokkos::HostSpace,
                              typename ReducerType::result_view_type::
                                  memory_space>::accessible),
        m_scratch_space(nullptr),
        m_scratch_flags(nullptr),
        m_team_begin(0),
        m_shmem_begin(0),
        m_shmem_size(0),
        m_scratch_ptr{nullptr, nullptr},
        m_league_size(arg_policy.league_size()),
        m_team_size(arg_policy.team_size()),
        m_vector_size(arg_policy.impl_vector_length()),
        m_scratch_lock(m_policy.space()
                           .impl_internal_space_instance()
                           ->m_team_scratch_mutex) {
    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
        ParallelReduce, launch_bounds>::get_hip_func_attributes();
    m_team_size =
        m_team_size >= 0
            ? m_team_size
            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
                                                                 launch_bounds>(
                  m_policy.space().impl_internal_space_instance(), attr,
                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
                  m_policy.thread_scratch_size(0)) /
                  m_vector_size;

    m_team_begin =
        UseShflReduction
            ? 0
            : hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                       work_tag>(arg_functor,
                                                                 m_team_size);
    m_shmem_begin = sizeof(double) * (m_team_size + 2);
    m_shmem_size =
        m_policy.scratch_size(0, m_team_size) +
        FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
    m_scratch_size[0] = m_shmem_size;
    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
    m_scratch_ptr[1] =
        m_team_size <= 0
            ? nullptr
            : m_policy.space()
                  .impl_internal_space_instance()
                  ->resize_team_scratch_space(
                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
                      static_cast<ptrdiff_t>(
                          Kokkos::Experimental::HIP::concurrency() /
                          (m_team_size * m_vector_size)));

    // The global parallel_reduce does not support vector_length other than 1 at
    // the moment
    if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction)
      Impl::throw_runtime_exception(
          "Kokkos::parallel_reduce with a TeamPolicy using a vector length of "
          "greater than 1 is not currently supported for HIP for dynamic "
          "sized reduction types.");

    if ((m_team_size < Kokkos::Experimental::Impl::HIPTraits::WarpSize) &&
        !UseShflReduction)
      Impl::throw_runtime_exception(
          "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller "
          "than 64 is not currently supported with HIP for dynamic sized "
          "reduction types.");

    // Functor's reduce memory, team scan memory, and team shared memory depend
    // upon team size.
+ + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) && + !UseShflReduction) || + m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < + shmem_size_total) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); + } + if (static_cast<int>(m_team_size) > + arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " + "large team size.")); + } + } +}; +} // namespace Impl +} // namespace Kokkos + +#endif + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..98dab9a0fbca41de38234fab4173cd4d4f763699 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp @@ -0,0 +1,539 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_REDUCESCAN_HPP +#define KOKKOS_HIP_REDUCESCAN_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(__HIPCC__) + +#include <HIP/Kokkos_HIP_Vectorization.hpp> + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +// Reduction-only implementation +//---------------------------------------------------------------------------- + +template <class FunctorType, class ArgTag, bool UseShfl> +struct HIPReductionsFunctor; + +template <typename FunctorType, typename ArgTag> +struct HIPReductionsFunctor<FunctorType, ArgTag, true> { + using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>; + using ValueJoin = FunctorValueJoin<FunctorType, ArgTag>; + using ValueInit = FunctorValueInit<FunctorType, ArgTag>; + using ValueOps = FunctorValueOps<FunctorType, ArgTag>; + using pointer_type = typename 
ValueTraits::pointer_type; + using Scalar = typename ValueTraits::value_type; + + __device__ static inline void scalar_intra_warp_reduction( + FunctorType const& functor, + Scalar value, // Contribution + bool const skip_vector, // Skip threads if Kokkos vector lanes are not + // part of the reduction + int const width, // How much of the warp participates + Scalar& result) { + for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) { + Scalar tmp = Kokkos::Experimental::shfl_down(value, delta, width); + ValueJoin::join(functor, &value, &tmp); + } + + Experimental::Impl::in_place_shfl(result, value, 0, width); + } + + __device__ static inline void scalar_intra_block_reduction( + FunctorType const& functor, Scalar value, bool const skip, + Scalar* my_global_team_buffer_element, int const shared_elements, + Scalar* shared_team_buffer_element) { + unsigned int constexpr warp_size = + Kokkos::Experimental::Impl::HIPTraits::WarpSize; + int const warp_id = (threadIdx.y * blockDim.x) / warp_size; + Scalar* const my_shared_team_buffer_element = + shared_team_buffer_element + warp_id % shared_elements; + + // Warp Level Reduction, ignoring Kokkos vector entries + scalar_intra_warp_reduction(functor, value, skip, warp_size, value); + + if (warp_id < shared_elements) { + *my_shared_team_buffer_element = value; + } + // Wait for every warp to be done before using one warp to do the final + // cross warp reduction + __syncthreads(); + + int const num_warps = blockDim.x * blockDim.y / warp_size; + for (int w = shared_elements; w < num_warps; w += shared_elements) { + if (warp_id >= w && warp_id < w + shared_elements) { + if ((threadIdx.y * blockDim.x + threadIdx.x) % warp_size == 0) + ValueJoin::join(functor, my_shared_team_buffer_element, &value); + } + __syncthreads(); + } + + if (warp_id == 0) { + ValueInit::init(functor, &value); + for (unsigned int i = threadIdx.y * blockDim.x + threadIdx.x; + i < blockDim.y * blockDim.x / warp_size; i += warp_size) { + 
ValueJoin::join(functor, &value, &shared_team_buffer_element[i]);
+    }
+    scalar_intra_warp_reduction(functor, value, false, warp_size,
+                                *my_global_team_buffer_element);
+  }
+ }
+
+ __device__ static inline bool scalar_inter_block_reduction(
+     FunctorType const& functor,
+     ::Kokkos::Experimental::HIP::size_type const block_count,
+     ::Kokkos::Experimental::HIP::size_type* const shared_data,
+     ::Kokkos::Experimental::HIP::size_type* const global_data,
+     ::Kokkos::Experimental::HIP::size_type* const global_flags) {
+   Scalar* const global_team_buffer_element =
+       reinterpret_cast<Scalar*>(global_data);
+   Scalar* const my_global_team_buffer_element =
+       global_team_buffer_element + blockIdx.x;
+   Scalar* shared_team_buffer_elements =
+       reinterpret_cast<Scalar*>(shared_data);
+   Scalar value        = shared_team_buffer_elements[threadIdx.y];
+   unsigned int constexpr warp_size =
+       Kokkos::Experimental::Impl::HIPTraits::WarpSize;
+   int shared_elements = blockDim.x * blockDim.y / warp_size;
+   int global_elements = block_count;
+   __syncthreads();
+
+   scalar_intra_block_reduction(functor, value, true,
+                                my_global_team_buffer_element, shared_elements,
+                                shared_team_buffer_elements);
+   __threadfence();
+   __syncthreads();
+
+   // Use the last block that is done to do the reduction across the
+   // block
+   __shared__ unsigned int num_teams_done;
+   if (threadIdx.x + threadIdx.y == 0) {
+     __threadfence();
+     num_teams_done = Kokkos::atomic_fetch_add(global_flags, 1) + 1;
+   }
+   bool is_last_block = false;
+   // FIXME_HIP HIP does not support syncthreads_or. 
That's why we need to make + // num_teams_done __shared__ + // if (__syncthreads_or(num_teams_done == gridDim.x)) {*/ + __syncthreads(); + if (num_teams_done == gridDim.x) { + is_last_block = true; + *global_flags = 0; + ValueInit::init(functor, &value); + for (int i = threadIdx.y * blockDim.x + threadIdx.x; i < global_elements; + i += blockDim.x * blockDim.y) { + ValueJoin::join(functor, &value, &global_team_buffer_element[i]); + } + scalar_intra_block_reduction( + functor, value, false, shared_team_buffer_elements + blockDim.y - 1, + shared_elements, shared_team_buffer_elements); + } + + return is_last_block; + } +}; + +template <typename FunctorType, typename ArgTag> +struct HIPReductionsFunctor<FunctorType, ArgTag, false> { + using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>; + using ValueJoin = FunctorValueJoin<FunctorType, ArgTag>; + using ValueInit = FunctorValueInit<FunctorType, ArgTag>; + using ValueOps = FunctorValueOps<FunctorType, ArgTag>; + using pointer_type = typename ValueTraits::pointer_type; + using Scalar = typename ValueTraits::value_type; + + __device__ static inline void scalar_intra_warp_reduction( + FunctorType const& functor, + Scalar* value, // Contribution + bool const skip_vector, // Skip threads if Kokkos vector lanes are not + // part of the reduction + int const width) // How much of the warp participates + { + int const lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % + ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; + for (int delta = skip_vector ? 
blockDim.x : 1; delta < width; delta *= 2) { + if (lane_id + delta < ::Kokkos::Experimental::Impl::HIPTraits::WarpSize) { + ValueJoin::join(functor, value, value + delta); + } + } + *value = *(value - lane_id); + } + + __device__ static inline void scalar_intra_block_reduction( + FunctorType const& functor, Scalar value, bool const skip, Scalar* result, + int const /*shared_elements*/, Scalar* shared_team_buffer_element) { + int const warp_id = (threadIdx.y * blockDim.x) / + ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; + Scalar* const my_shared_team_buffer_element = + shared_team_buffer_element + threadIdx.y * blockDim.x + threadIdx.x; + *my_shared_team_buffer_element = value; + // Warp Level Reduction, ignoring Kokkos vector entries + scalar_intra_warp_reduction( + functor, my_shared_team_buffer_element, skip, + ::Kokkos::Experimental::Impl::HIPTraits::WarpSize); + // Wait for every warp to be done before using one warp to do final cross + // warp reduction + __syncthreads(); + + if (warp_id == 0) { + const unsigned int delta = + (threadIdx.y * blockDim.x + threadIdx.x) * + ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; + if (delta < blockDim.x * blockDim.y) + *my_shared_team_buffer_element = shared_team_buffer_element[delta]; + scalar_intra_warp_reduction( + functor, my_shared_team_buffer_element, false, + blockDim.x * blockDim.y / + ::Kokkos::Experimental::Impl::HIPTraits::WarpSize); + if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element; + } + } + + __device__ static inline bool scalar_inter_block_reduction( + FunctorType const& functor, + ::Kokkos::Experimental::HIP::size_type const block_count, + ::Kokkos::Experimental::HIP::size_type* const shared_data, + ::Kokkos::Experimental::HIP::size_type* const global_data, + ::Kokkos::Experimental::HIP::size_type* const global_flags) { + Scalar* const global_team_buffer_element = + reinterpret_cast<Scalar*>(global_data); + Scalar* const my_global_team_buffer_element = + 
global_team_buffer_element + blockIdx.x;
+   Scalar* shared_team_buffer_elements =
+       reinterpret_cast<Scalar*>(shared_data);
+   Scalar value        = shared_team_buffer_elements[threadIdx.y];
+   int shared_elements = (blockDim.x * blockDim.y) /
+                         ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
+   int global_elements = block_count;
+   __syncthreads();
+
+   // Do the scalar reduction inside each block
+   scalar_intra_block_reduction(functor, value, true,
+                                my_global_team_buffer_element, shared_elements,
+                                shared_team_buffer_elements);
+   __syncthreads();
+
+   // Use the last block that is done to do the reduction across the
+   // block
+   __shared__ unsigned int num_teams_done;
+   if (threadIdx.x + threadIdx.y == 0) {
+     __threadfence();
+     num_teams_done = Kokkos::atomic_fetch_add(global_flags, 1) + 1;
+   }
+   bool is_last_block = false;
+   // FIXME_HIP HIP does not support syncthreads_or. That's why we need to make
+   // num_teams_done __shared__
+   // if (__syncthreads_or(num_teams_done == gridDim.x)) {*/
+   __syncthreads();
+   if (num_teams_done == gridDim.x) {
+     is_last_block = true;
+     *global_flags = 0;
+     ValueInit::init(functor, &value);
+     for (int i = threadIdx.y * blockDim.x + threadIdx.x; i < global_elements;
+          i += blockDim.x * blockDim.y) {
+       ValueJoin::join(functor, &value, &global_team_buffer_element[i]);
+     }
+     scalar_intra_block_reduction(
+         functor, value, false, shared_team_buffer_elements + (blockDim.y - 1),
+         shared_elements, shared_team_buffer_elements);
+   }
+
+   return is_last_block;
+ }
+};
+
+//----------------------------------------------------------------------------
+// Fused reduction and scan implementation
+//----------------------------------------------------------------------------
+/*
+ *  Algorithmic constraints:
+ *   (a) blockDim.y is a power of two
+ *   (b) blockDim.y <= 1024
+ *   (c) blockDim.x == blockDim.z == 1
+ */
+
+template <bool DoScan, class FunctorType, class ArgTag>
+__device__ void hip_intra_block_reduce_scan(
+    FunctorType const& functor,
+    
typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type const + base_data) { + using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>; + using ValueJoin = FunctorValueJoin<FunctorType, ArgTag>; + + using pointer_type = typename ValueTraits::pointer_type; + + unsigned int const value_count = ValueTraits::value_count(functor); + unsigned int const BlockSizeMask = blockDim.y - 1; + int const WarpMask = Experimental::Impl::HIPTraits::WarpSize - 1; + + // Must have power of two thread count + if ((blockDim.y - 1) & blockDim.y) { + Kokkos::abort( + "HIP::hip_intra_block_reduce_scan requires power-of-two " + "blockDim.y\n"); + } + + auto block_reduce_step = + [&functor, value_count](int const R, pointer_type const TD, int const S) { + if (R > ((1 << S) - 1)) { + ValueJoin::join(functor, TD, (TD - (value_count << S))); + } + }; + + { // Intra-warp reduction: + const unsigned rtid_intra = threadIdx.y & WarpMask; + const pointer_type tdata_intra = base_data + value_count * threadIdx.y; + + block_reduce_step(rtid_intra, tdata_intra, 0); + block_reduce_step(rtid_intra, tdata_intra, 1); + block_reduce_step(rtid_intra, tdata_intra, 2); + block_reduce_step(rtid_intra, tdata_intra, 3); + block_reduce_step(rtid_intra, tdata_intra, 4); + block_reduce_step(rtid_intra, tdata_intra, 5); + } + + __syncthreads(); // Wait for all warps to reduce + + { // Inter-warp reduce-scan by a single warp to avoid extra synchronizations + unsigned int const rtid_inter = + ((threadIdx.y + 1) << Experimental::Impl::HIPTraits::WarpIndexShift) - + 1; + + if (rtid_inter < blockDim.y) { + pointer_type const tdata_inter = base_data + value_count * rtid_inter; + + if ((1 << 6) < BlockSizeMask) { + block_reduce_step(rtid_inter, tdata_inter, 6); + } + if ((1 << 7) < BlockSizeMask) { + block_reduce_step(rtid_inter, tdata_inter, 7); + } + if ((1 << 8) < BlockSizeMask) { + block_reduce_step(rtid_inter, tdata_inter, 8); + } + if ((1 << 9) < BlockSizeMask) { + block_reduce_step(rtid_inter, tdata_inter, 
9); + } + if ((1 << 10) < BlockSizeMask) { + block_reduce_step(rtid_inter, tdata_inter, 10); + } + } + } + + __syncthreads(); // Wait for inter-warp reduce-scan to complete + + if (DoScan) { + // Update all the values for the respective warps (except for the last one) + // by adding from the last value of the previous warp. + if (threadIdx.y >= Experimental::Impl::HIPTraits::WarpSize && + (threadIdx.y & WarpMask) != + Experimental::Impl::HIPTraits::WarpSize - 1) { + const int offset_to_previous_warp_total = (threadIdx.y & (~WarpMask)) - 1; + ValueJoin::join(functor, base_data + value_count * threadIdx.y, + base_data + value_count * offset_to_previous_warp_total); + } + } +} + +//---------------------------------------------------------------------------- +/**\brief Input value-per-thread starting at 'shared_data'. + * Reduction value at last thread's location. + * + * If 'DoScan' then write blocks' scan values and block-groups' scan values. + * + * Global reduce result is in the last threads' 'shared_data' location. + */ + +template <bool DoScan, class FunctorType, class ArgTag> +__device__ bool hip_single_inter_block_reduce_scan_impl( + FunctorType const& functor, + ::Kokkos::Experimental::HIP::size_type const block_id, + ::Kokkos::Experimental::HIP::size_type const block_count, + ::Kokkos::Experimental::HIP::size_type* const shared_data, + ::Kokkos::Experimental::HIP::size_type* const global_data, + ::Kokkos::Experimental::HIP::size_type* const global_flags) { + using size_type = ::Kokkos::Experimental::HIP::size_type; + using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>; + using ValueJoin = FunctorValueJoin<FunctorType, ArgTag>; + using ValueInit = FunctorValueInit<FunctorType, ArgTag>; + using ValueOps = FunctorValueOps<FunctorType, ArgTag>; + + using pointer_type = typename ValueTraits::pointer_type; + + // '__ffs' = position of the least significant bit set to 1. 
+ // 'blockDim.y' is guaranteed to be a power of two so this + // is the integral shift value that can replace an integral divide. + unsigned int const BlockSizeShift = __ffs(blockDim.y) - 1; + unsigned int const BlockSizeMask = blockDim.y - 1; + + // Must have power of two thread count + if (BlockSizeMask & blockDim.y) { + Kokkos::abort( + "HIP::hip_single_inter_block_reduce_scan requires power-of-two " + "blockDim"); + } + + integral_nonzero_constant<size_type, ValueTraits::StaticValueSize / + sizeof(size_type)> const + word_count(ValueTraits::value_size(functor) / sizeof(size_type)); + + // Reduce the accumulation for the entire block. + hip_intra_block_reduce_scan<false, FunctorType, ArgTag>( + functor, pointer_type(shared_data)); + + { + // Write accumulation total to global scratch space. + // Accumulation total is the last thread's data. + size_type* const shared = shared_data + word_count.value * BlockSizeMask; + size_type* const global = global_data + word_count.value * block_id; + + for (size_t i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + + // Contributing blocks note that their contribution has been completed via an + // atomic-increment flag If this block is not the last block to contribute to + // this group then the block is done. + // FIXME_HIP __syncthreads_or is not supported by HIP yet. + // const bool is_last_block = !__syncthreads_or( + // threadIdx.y + // ? 
0 + // : (1 + atomicInc(global_flags, block_count - 1) < block_count)); + __shared__ int n_done; + n_done = 0; + __syncthreads(); + if (threadIdx.y == 0) { + __threadfence(); + n_done = 1 + atomicInc(global_flags, block_count - 1); + } + __syncthreads(); + bool const is_last_block = (n_done == static_cast<int>(block_count)); + + if (is_last_block) { + size_type const b = (static_cast<long long int>(block_count) * + static_cast<long long int>(threadIdx.y)) >> + BlockSizeShift; + size_type const e = (static_cast<long long int>(block_count) * + static_cast<long long int>(threadIdx.y + 1)) >> + BlockSizeShift; + + { + void* const shared_ptr = shared_data + word_count.value * threadIdx.y; + /* reference_type shared_value = */ ValueInit::init(functor, shared_ptr); + + for (size_type i = b; i < e; ++i) { + ValueJoin::join(functor, shared_ptr, + global_data + word_count.value * i); + } + } + + hip_intra_block_reduce_scan<DoScan, FunctorType, ArgTag>( + functor, pointer_type(shared_data)); + + if (DoScan) { + size_type* const shared_value = + shared_data + + word_count.value * (threadIdx.y ? 
threadIdx.y - 1 : blockDim.y); + + if (!threadIdx.y) { + ValueInit::init(functor, shared_value); + } + + // Join previous inclusive scan value to each member + for (size_type i = b; i < e; ++i) { + size_type* const global_value = global_data + word_count.value * i; + ValueJoin::join(functor, shared_value, global_value); + ValueOps::copy(functor, global_value, shared_value); + } + } + } + + return is_last_block; +} + +template <bool DoScan, typename FunctorType, typename ArgTag> +__device__ bool hip_single_inter_block_reduce_scan( + FunctorType const& functor, + ::Kokkos::Experimental::HIP::size_type const block_id, + ::Kokkos::Experimental::HIP::size_type const block_count, + ::Kokkos::Experimental::HIP::size_type* const shared_data, + ::Kokkos::Experimental::HIP::size_type* const global_data, + ::Kokkos::Experimental::HIP::size_type* const global_flags) { + using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>; + // If we are doing a reduction and StaticValueSize is true, we use the + // reduction-only path. Otherwise, we use the common path between reduction + // and scan. + if (!DoScan && static_cast<bool>(ValueTraits::StaticValueSize)) + // FIXME_HIP_PERFORMANCE I don't know where 16 comes from. 
This inequality + // determines if we use shared memory (false) or shuffle (true) + return Kokkos::Impl::HIPReductionsFunctor< + FunctorType, ArgTag, (ValueTraits::StaticValueSize > 16)>:: + scalar_inter_block_reduction(functor, block_count, shared_data, + global_data, global_flags); + else { + return hip_single_inter_block_reduce_scan_impl<DoScan, FunctorType, ArgTag>( + functor, block_id, block_count, shared_data, global_data, global_flags); + } +} + +// Size in bytes required for inter block reduce or scan +template <bool DoScan, class FunctorType, class ArgTag> +inline unsigned hip_single_inter_block_reduce_scan_shmem( + const FunctorType& functor, const unsigned BlockSize) { + return (BlockSize + 2) * + Impl::FunctorValueTraits<FunctorType, ArgTag>::value_size(functor); +} + +} // namespace Impl +} // namespace Kokkos + +#endif + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fe7c34bb80973a224d1d2ff6d092c4e9bc3e1571 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -0,0 +1,339 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_SHUFFLE_REDUCE_HPP +#define KOKKOS_HIP_SHUFFLE_REDUCE_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(__HIPCC__) + +#include <HIP/Kokkos_HIP_Vectorization.hpp> + +#include <climits> + +namespace Kokkos { +namespace Impl { + +/* Algorithmic constraints: + * (a) threads with the same threadIdx.x have same value + * (b) blockDim.x == power of two + * (x) blockDim.z == 1 + */ +template <typename ValueType, typename JoinOp, + typename std::enable_if<!Kokkos::is_reducer<ValueType>::value, + int>::type = 0> +__device__ inline void hip_intra_warp_shuffle_reduction( + ValueType& result, JoinOp const& join, + uint32_t const max_active_thread = blockDim.y) { + unsigned int shift = 1; + + // Reduce over values from threads with different threadIdx.y + unsigned int constexpr warp_size = + Kokkos::Experimental::Impl::HIPTraits::WarpSize; + while (blockDim.x * shift < warp_size) { + ValueType const tmp = + Kokkos::Experimental::shfl_down(result, blockDim.x * shift, warp_size); + // Only join if upper thread is active (this allows non power of two for + // blockDim.y) + if (threadIdx.y + shift < max_active_thread) { + join(result, tmp); + } + shift *= 2; + } + + // Broadcast the result to all the threads in the warp + result = Kokkos::Experimental::shfl(result, 0, warp_size); +} + +template <typename ValueType, typename JoinOp, + typename std::enable_if<!Kokkos::is_reducer<ValueType>::value, + int>::type = 0> +__device__ inline void hip_inter_warp_shuffle_reduction( + ValueType& value, const JoinOp& join, + const int max_active_thread = blockDim.y) { + unsigned int constexpr warp_size = + Kokkos::Experimental::Impl::HIPTraits::WarpSize; + int constexpr step_width = 8; + // Depending on the ValueType __shared__ memory must be aligned up to 8 byte + // boundaries. 
The reason not to use ValueType directly is that for types with + // constructors it could lead to race conditions. + __shared__ double sh_result[(sizeof(ValueType) + 7) / 8 * step_width]; + ValueType* result = reinterpret_cast<ValueType*>(&sh_result); + int const step = warp_size / blockDim.x; + int shift = step_width; + // Skip the code below if threadIdx.y % step != 0 + int const id = threadIdx.y % step == 0 ? threadIdx.y / step : INT_MAX; + if (id < step_width) { + result[id] = value; + } + __syncthreads(); + while (shift <= max_active_thread / step) { + if (shift <= id && shift + step_width > id && threadIdx.x == 0) { + join(result[id % step_width], value); + } + __syncthreads(); + shift += step_width; + } + + value = result[0]; + for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i) + join(value, result[i]); +} + +template <typename ValueType, typename JoinOp, + typename std::enable_if<!Kokkos::is_reducer<ValueType>::value, + int>::type = 0> +__device__ inline void hip_intra_block_shuffle_reduction( + ValueType& value, JoinOp const& join, + int const max_active_thread = blockDim.y) { + hip_intra_warp_shuffle_reduction(value, join, max_active_thread); + hip_inter_warp_shuffle_reduction(value, join, max_active_thread); +} + +template <class FunctorType, class JoinOp, class ArgTag = void> +__device__ inline bool hip_inter_block_shuffle_reduction( + typename FunctorValueTraits<FunctorType, ArgTag>::reference_type value, + typename FunctorValueTraits<FunctorType, ArgTag>::reference_type neutral, + JoinOp const& join, + Kokkos::Experimental::HIP::size_type* const m_scratch_space, + typename FunctorValueTraits<FunctorType, + ArgTag>::pointer_type const /*result*/, + Kokkos::Experimental::HIP::size_type* const m_scratch_flags, + int const max_active_thread = blockDim.y) { + using pointer_type = + typename FunctorValueTraits<FunctorType, ArgTag>::pointer_type; + using value_type = + typename FunctorValueTraits<FunctorType, ArgTag>::value_type; + + 
// Do the intra-block reduction with shfl operations for the intra warp + // reduction and static shared memory for the inter warp reduction + hip_intra_block_shuffle_reduction(value, join, max_active_thread); + + int const id = threadIdx.y * blockDim.x + threadIdx.x; + + // One thread in the block writes block result to global scratch_memory + if (id == 0) { + pointer_type global = + reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x; + *global = value; + } + + // One warp of last block performs inter block reduction through loading the + // block values from global scratch_memory + bool last_block = false; + __threadfence(); + __syncthreads(); + int constexpr warp_size = Kokkos::Experimental::Impl::HIPTraits::WarpSize; + if (id < warp_size) { + Kokkos::Experimental::HIP::size_type count; + + // Figure out whether this is the last block + if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1); + count = Kokkos::Experimental::shfl(count, 0, warp_size); + + // Last block does the inter block reduction + if (count == gridDim.x - 1) { + // set flag back to zero + if (id == 0) *m_scratch_flags = 0; + last_block = true; + value = neutral; + + pointer_type const volatile global = + reinterpret_cast<pointer_type>(m_scratch_space); + + // Reduce all global values with splitting work over threads in one warp + const int step_size = blockDim.x * blockDim.y < warp_size + ? 
blockDim.x * blockDim.y
+                                : warp_size;
+      for (int i = id; i < static_cast<int>(gridDim.x); i += step_size) {
+        value_type tmp = global[i];
+        join(value, tmp);
+      }
+
+      // Perform shfl reductions within the warp only join if contribution is
+      // valid (allows gridDim.x non power of two and <warp_size)
+      for (unsigned int i = 1; i < warp_size; i *= 2) {
+        if ((blockDim.x * blockDim.y) > i) {
+          value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size);
+          if (id + i < gridDim.x) join(value, tmp);
+        }
+      }
+    }
+  }
+  // The last block has in its thread=0 the global reduction value through
+  // "value"
+  return last_block;
+}
+
+// We implement the same functions as above but the user provides a Reducer
+// instead of JoinOp
+template <typename ReducerType,
+          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
+                                  int>::type = 0>
+__device__ inline void hip_intra_warp_shuffle_reduction(
+    const ReducerType& reducer, typename ReducerType::value_type& result,
+    const uint32_t max_active_thread = blockDim.y) {
+  using ValueType = typename ReducerType::value_type;
+  auto join_op = [&](ValueType& result, ValueType const& tmp) {
+    reducer.join(result, tmp);
+  };
+  hip_intra_warp_shuffle_reduction(result, join_op, max_active_thread);
+
+  reducer.reference() = result;
+}
+
+template <typename ReducerType,
+          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
+                                  int>::type = 0>
+__device__ inline void hip_inter_warp_shuffle_reduction(
+    ReducerType const& reducer, typename ReducerType::value_type value,
+    int const max_active_thread = blockDim.y) {
+  using ValueType = typename ReducerType::value_type;
+  auto join_op = [&](ValueType& a, ValueType& b) { reducer.join(a, b); };
+  hip_inter_warp_shuffle_reduction(value, join_op, max_active_thread);
+
+  reducer.reference() = value;
+}
+
+template <typename ReducerType,
+          typename std::enable_if<Kokkos::is_reducer<ReducerType>::value,
+                                  int>::type = 0>
+__device__ inline void 
hip_intra_block_shuffle_reduction( + ReducerType const& reducer, typename ReducerType::value_type value, + int const max_active_thread = blockDim.y) { + hip_intra_warp_shuffle_reduction(reducer, value, max_active_thread); + hip_inter_warp_shuffle_reduction(reducer, value, max_active_thread); +} + +template <typename ReducerType, + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value, + int>::type = 0> +__device__ inline void hip_intra_block_shuffle_reduction( + ReducerType const& reducer, int const max_active_thread = blockDim.y) { + hip_intra_block_shuffle_reduction(reducer, reducer.reference(), + max_active_thread); +} + +template <typename ReducerType, + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value, + int>::type = 0> +__device__ inline bool hip_inter_block_shuffle_reduction( + ReducerType const& reducer, + Kokkos::Experimental::HIP::size_type* const m_scratch_space, + Kokkos::Experimental::HIP::size_type* const m_scratch_flags, + int const max_active_thread = blockDim.y) { + using pointer_type = typename ReducerType::value_type*; + using value_type = typename ReducerType::value_type; + + // Do the intra-block reduction with shfl operations for the intra warp + // reduction and static shared memory for the inter warp reduction + hip_intra_block_shuffle_reduction(reducer, max_active_thread); + + value_type value = reducer.reference(); + + int const id = threadIdx.y * blockDim.x + threadIdx.x; + + // One thread in the block writes block result to global scratch_memory + if (id == 0) { + pointer_type global = + reinterpret_cast<pointer_type>(m_scratch_space) + blockIdx.x; + *global = value; + } + + // One warp of last block performs inter block reduction through loading the + // block values from global scratch_memory + bool last_block = false; + + __threadfence(); + __syncthreads(); + int constexpr warp_size = Kokkos::Experimental::Impl::HIPTraits::WarpSize; + if (id < warp_size) { + Kokkos::Experimental::HIP::size_type count; + + // 
Figure out whether this is the last block + if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1); + count = Kokkos::Experimental::shfl(count, 0, warp_size); + + // Last block does the inter block reduction + if (count == gridDim.x - 1) { + // Set flag back to zero + if (id == 0) *m_scratch_flags = 0; + last_block = true; + reducer.init(value); + + pointer_type const volatile global = + reinterpret_cast<pointer_type>(m_scratch_space); + + // Reduce all global values with splitting work over threads in one warp + int const step_size = blockDim.x * blockDim.y < warp_size + ? blockDim.x * blockDim.y + : warp_size; + for (int i = id; i < static_cast<int>(gridDim.x); i += step_size) { + value_type tmp = global[i]; + reducer.join(value, tmp); + } + + // Perform shfl reductions within the warp only join if contribution is + // valid (allows gridDim.x non power of two and <warp_size) + for (unsigned int i = 1; i < warp_size; i *= 2) { + if ((blockDim.x * blockDim.y) > i) { + value_type tmp = Kokkos::Experimental::shfl_down(value, i, warp_size); + if (id + i < gridDim.x) reducer.join(value, tmp); + } + __syncthreads(); + } + } + } + + // The last block has in its thread = 0 the global reduction value through + // "value" + return last_block; +} +} // namespace Impl +} // namespace Kokkos + +#endif + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp new file mode 100644 index 0000000000000000000000000000000000000000..15ca089d14740b6a2c42c69945a17a0c7bfa1bcc --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -0,0 +1,535 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <Kokkos_Core.hpp> +#include <Kokkos_HIP.hpp> +#include <Kokkos_HIP_Space.hpp> + +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_MemorySpace.hpp> + +#include <stdlib.h> +#include <iostream> +#include <sstream> +#include <stdexcept> +#include <algorithm> +#include <atomic> + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ +namespace Kokkos { +namespace Impl { + +namespace { +hipStream_t get_deep_copy_stream() { + static hipStream_t s = nullptr; + if (s == nullptr) { + HIP_SAFE_CALL(hipStreamCreate(&s)); + } + return s; +} +} // namespace + +DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src, + size_t n) { + HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); +} + +DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src, + size_t n) { + HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); +} + +DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, + Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src, + size_t n) { + HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); +} + +DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP& + instance, + void* dst, const void* src, + size_t n) { + HIP_SAFE_CALL( + hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); +} + +DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>:: + DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst, + const void* src, size_t n) { + HIP_SAFE_CALL( + hipMemcpyAsync(dst, src, n, hipMemcpyDefault, 
instance.hip_stream())); +} + +DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, Kokkos::Experimental::HIP>:: + DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst, + const void* src, size_t n) { + HIP_SAFE_CALL( + hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); +} + +DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src, + size_t n) { + HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); +} + +DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src, + size_t n) { + HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); +} + +DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace, + Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src, + size_t n) { + HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); +} + +DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPHostPinnedSpace, Kokkos::Experimental::HIP>:: + DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst, + const void* src, size_t n) { + HIP_SAFE_CALL( + hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); +} + +DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP& + instance, + void* dst, const void* src, + size_t n) { + HIP_SAFE_CALL( + hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); +} + +DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace, + Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP& + instance, + void* dst, const void* src, + size_t n) { + HIP_SAFE_CALL( + hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); +} + +void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) { + hipStream_t s = get_deep_copy_stream(); + HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, 
hipMemcpyDefault, s)); + HIP_SAFE_CALL(hipStreamSynchronize(s)); +} + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() { + const std::string msg( + "Kokkos::Experimental::HIPSpace::access_error attempt to execute " + "Experimental::HIP function from non-HIP space"); + Kokkos::Impl::throw_runtime_exception(msg); +} + +KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) { + const std::string msg( + "Kokkos::Experimental::HIPSpace::access_error attempt to execute " + "Experimental::HIP function from non-HIP space"); + Kokkos::Impl::throw_runtime_exception(msg); +} + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Experimental { + +HIPSpace::HIPSpace() : m_device(HIP().hip_device()) {} + +HIPHostPinnedSpace::HIPHostPinnedSpace() {} + +void* HIPSpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} +void* HIPSpace::allocate( + + const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void* HIPSpace::impl_allocate( + + const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + void* ptr = nullptr; + + auto const error_code = hipMalloc(&ptr, arg_alloc_size); + if (error_code != hipSuccess) { + // This is the only way to clear the last error, which we should do here + // since we're turning it into an exception here + (void)hipGetLastError(); + throw HIPRawMemoryAllocationFailure( + arg_alloc_size, 
error_code,
        RawMemoryAllocationFailure::AllocationMechanism::HIPMalloc);
  }
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    // Report the logical (user-requested) size when one was given, otherwise
    // the raw allocation size.
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
  }

  return ptr;
}

// Unlabeled allocation of pinned host memory; forwards to the labeled overload.
void* HIPHostPinnedSpace::allocate(const size_t arg_alloc_size) const {
  return allocate("[unlabeled]", arg_alloc_size);
}
// Labeled allocation entry point; delegates to impl_allocate with the default
// SpaceHandle.
void* HIPHostPinnedSpace::allocate(const char* arg_label,
                                   const size_t arg_alloc_size,
                                   const size_t arg_logical_size) const {
  return impl_allocate(arg_label, arg_alloc_size, arg_logical_size);
}
// Allocate page-locked host memory via hipHostMalloc and notify the profiling
// library (if loaded). Throws HIPRawMemoryAllocationFailure on failure.
void* HIPHostPinnedSpace::impl_allocate(
    const char* arg_label, const size_t arg_alloc_size,
    const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  void* ptr = nullptr;

  auto const error_code = hipHostMalloc(&ptr, arg_alloc_size);
  if (error_code != hipSuccess) {
    // This is the only way to clear the last error, which we should do here
    // since we're turning it into an exception here
    (void)hipGetLastError();
    throw HIPRawMemoryAllocationFailure(
        arg_alloc_size, error_code,
        RawMemoryAllocationFailure::AllocationMechanism::HIPHostMalloc);
  }
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    // Prefer the logical size for reporting when provided.
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size);
  }

  return ptr;
}
// Unlabeled deallocation of device memory; forwards to the labeled overload.
void HIPSpace::deallocate(void* const arg_alloc_ptr,
                          const size_t arg_alloc_size) const {
  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
}
// Labeled deallocation entry point; delegates to impl_deallocate.
void HIPSpace::deallocate(const char* arg_label, void* const arg_alloc_ptr,
                          const size_t arg_alloc_size,
                          const size_t arg_logical_size) const {
  impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
}
// Notify the profiling library (if loaded), then release device memory with
// hipFree.
void HIPSpace::impl_deallocate(
    const char* arg_label, void* const arg_alloc_ptr,
    const size_t arg_alloc_size, const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                      reported_size);
  }
  HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
}

// Unlabeled deallocation of pinned host memory; forwards to the labeled
// overload.
void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr,
                                    const size_t arg_alloc_size) const {
  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
}

// Labeled deallocation entry point; delegates to impl_deallocate.
void HIPHostPinnedSpace::deallocate(const char* arg_label,
                                    void* const arg_alloc_ptr,
                                    const size_t arg_alloc_size,
                                    const size_t arg_logical_size) const {
  impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size);
}
// Notify the profiling library (if loaded), then release pinned host memory.
void HIPHostPinnedSpace::impl_deallocate(
    const char* arg_label, void* const arg_alloc_ptr,
    const size_t arg_alloc_size, const size_t arg_logical_size,
    const Kokkos::Tools::SpaceHandle arg_handle) const {
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    const size_t reported_size =
        (arg_logical_size > 0) ?
arg_logical_size : arg_alloc_size; + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); + } + HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr)); +} + +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +#ifdef KOKKOS_ENABLE_DEBUG +SharedAllocationRecord<void, void> + SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>::s_root_record; + +SharedAllocationRecord<void, void> SharedAllocationRecord< + Kokkos::Experimental::HIPHostPinnedSpace, void>::s_root_record; +#endif + +SharedAllocationRecord<Kokkos::Experimental::HIPSpace, + void>::~SharedAllocationRecord() { + const char* label = nullptr; + if (Kokkos::Profiling::profileLibraryLoaded()) { + SharedAllocationHeader header; + Kokkos::Impl::DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace>( + &header, RecordBase::m_alloc_ptr, sizeof(SharedAllocationHeader)); + label = header.label(); + } + auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; + m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr, + alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); +} + +SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, + void>::~SharedAllocationRecord() { + m_space.deallocate(RecordBase::m_alloc_ptr->m_label, + SharedAllocationRecord<void, void>::m_alloc_ptr, + SharedAllocationRecord<void, void>::m_alloc_size); +} + +SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void>:: + SharedAllocationRecord( + const Kokkos::Experimental::HIPSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, + const SharedAllocationRecord<void, void>::function_type arg_dealloc) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : base_t( +#ifdef KOKKOS_ENABLE_DEBUG + 
&SharedAllocationRecord<Kokkos::Experimental::HIPSpace, + void>::s_root_record, +#endif + Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), + m_space(arg_space) { + + SharedAllocationHeader header; + + this->base_t::_fill_host_accessible_header_info(header, arg_label); + + // Copy to device memory + Kokkos::Impl::DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace>( + RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); +} + +SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void>:: + SharedAllocationRecord( + const Kokkos::Experimental::HIPHostPinnedSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, + const SharedAllocationRecord<void, void>::function_type arg_dealloc) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : base_t( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, + void>::s_root_record, +#endif + Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), + m_space(arg_space) { + // Fill in the Header information, directly accessible via host pinned memory + this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, + arg_label); +} + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ +namespace Kokkos { +namespace Impl { +int get_gpu(const InitArguments& args); +} +namespace Experimental { + +int HIP::concurrency() { + auto const& prop = hip_device_prop(); + return prop.maxThreadsPerMultiProcessor * prop.multiProcessorCount; +} +int HIP::impl_is_initialized() { + return Impl::HIPInternal::singleton().is_initialized(); +} + +void 
HIP::impl_initialize(const HIP::SelectDevice config) {
  // Initialize the singleton backend state for the selected device.
  Impl::HIPInternal::singleton().initialize(config.hip_device_id);
}

// Tear down the singleton backend state.
void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); }

// Default-constructed instance aliases the singleton internal state; the
// no-op deleter ensures the singleton is never destroyed through this handle.
HIP::HIP()
    : m_space_instance(&Impl::HIPInternal::singleton(),
                       [](Impl::HIPInternal*) {}) {
  Impl::HIPInternal::singleton().verify_is_initialized(
      "HIP instance constructor");
}

// Stream-constructed instance owns its own internal state bound to the given
// hipStream_t; the deleter finalizes and frees it when the last copy goes away.
HIP::HIP(hipStream_t const stream)
    : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) {
        ptr->finalize();
        delete ptr;
      }) {
  Impl::HIPInternal::singleton().verify_is_initialized(
      "HIP instance constructor");
  // Reuse the singleton's device id; only the stream differs per instance.
  m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream);
}

void HIP::print_configuration(std::ostream& s, const bool) {
  Impl::HIPInternal::singleton().print_configuration(s);
}

// Device-wide fence (all streams), independent of any instance.
void HIP::impl_static_fence() { HIP_SAFE_CALL(hipDeviceSynchronize()); }

// Instance fence: synchronizes only this instance's internal state.
void HIP::fence() const { m_space_instance->fence(); }

hipStream_t HIP::hip_stream() const { return m_space_instance->m_stream; }

int HIP::hip_device() const { return impl_internal_space_instance()->m_hipDev; }

hipDeviceProp_t const& HIP::hip_device_prop() {
  return Impl::HIPInternal::singleton().m_deviceProp;
}

const char* HIP::name() { return "HIP"; }

}  // namespace Experimental

namespace Impl {

// Self-registration of the HIP backend with the space-factory machinery; the
// "150_" prefix establishes its initialization priority ordering.
int g_hip_space_factory_initialized =
    initialize_space_factory<HIPSpaceInitializer>("150_HIP");

// Initialize the HIP backend when it is the default execution space or when a
// GPU was explicitly requested via the init arguments.
void HIPSpaceInitializer::initialize(const InitArguments& args) {
  int use_gpu = Impl::get_gpu(args);

  if (std::is_same<Kokkos::Experimental::HIP,
                   Kokkos::DefaultExecutionSpace>::value ||
      0 < use_gpu) {
    if (use_gpu > -1) {
      // A specific device id was requested.
      Kokkos::Experimental::HIP::impl_initialize(
          Kokkos::Experimental::HIP::SelectDevice(use_gpu));
    } else {
      Kokkos::Experimental::HIP::impl_initialize();
    }
  }
}

// Finalize the HIP backend when it is the default execution space or when all
// spaces are being finalized; no-op if it was never initialized.
void HIPSpaceInitializer::finalize(const bool all_spaces) {
  if (std::is_same<Kokkos::Experimental::HIP,
                   Kokkos::DefaultExecutionSpace>::value ||
      all_spaces) {
    if (Kokkos::Experimental::HIP::impl_is_initialized())
      Kokkos::Experimental::HIP::impl_finalize();
  }
}

void HIPSpaceInitializer::fence() {
  Kokkos::Experimental::HIP::impl_static_fence();
}

// Emit the compile-time HIP configuration followed by the runtime
// configuration of the active device.
void HIPSpaceInitializer::print_configuration(std::ostream& msg,
                                              const bool detail) {
  msg << "Devices:" << std::endl;
  msg << "  KOKKOS_ENABLE_HIP: ";
  msg << "yes" << std::endl;

  msg << "HIP Options:" << std::endl;
  msg << "  KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE: ";
#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
  msg << "yes" << std::endl;
#else
  msg << "no" << std::endl;
#endif

  msg << "\nRuntime Configuration:" << std::endl;
  Experimental::HIP::print_configuration(msg, detail);
}

}  // namespace Impl
}  // namespace Kokkos

//==============================================================================
// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1

#include <impl/Kokkos_SharedAlloc_timpl.hpp>

namespace Kokkos {
namespace Impl {

// To avoid additional compilation cost for something that's (mostly?) not
// performance sensitive, we explicitly instantiate these CRTP base classes
// here, where we have access to the associated *_timpl.hpp header files.
+template class HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::HIPSpace>; +template class SharedAllocationRecordCommon<Kokkos::Experimental::HIPSpace>; +template class SharedAllocationRecordCommon< + Kokkos::Experimental::HIPHostPinnedSpace>; + +} // end namespace Impl +} // end namespace Kokkos + +// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fe52886ced7c7a72454f9e731b3b5b4778f90073 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -0,0 +1,1125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_TEAM_HPP +#define KOKKOS_HIP_TEAM_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(__HIPCC__) + +#include <utility> +#include <Kokkos_Parallel.hpp> + +#include <HIP/Kokkos_HIP_KernelLaunch.hpp> +#include <HIP/Kokkos_HIP_ReduceScan.hpp> +#include <HIP/Kokkos_HIP_Shuffle_Reduce.hpp> +#include <HIP/Kokkos_HIP_BlockSize_Deduction.hpp> +#include <Kokkos_Vectorization.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <typename Type> +struct HIPJoinFunctor { + using value_type = Type; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + volatile const value_type& input) { + update += input; + } +}; + +/**\brief Team member_type passed to TeamPolicy or TeamTask closures. 
+ * + * HIP thread blocks for team closures are dimensioned as: + * blockDim.x == number of "vector lanes" per "thread" + * blockDim.y == number of "threads" per team + * blockDim.z == number of teams in a block + * where + * A set of teams exactly fill a warp OR a team is the whole block + * ( 0 == WarpSize % ( blockDim.x * blockDim.y ) ) + * OR + * ( 1 == blockDim.z ) + + * Thus when 1 < blockDim.z the team is warp-synchronous + * and __syncthreads should not be called in team collectives. + * + * When multiple teams are mapped onto a single block then the + * total available shared memory must be partitioned among teams. + */ +class HIPTeamMember { + public: + using execution_space = Kokkos::Experimental::HIP; + using scratch_memory_space = execution_space::scratch_memory_space; + + private: + mutable void* m_team_reduce; + scratch_memory_space m_team_shared; + int m_team_reduce_size; + int m_league_rank; + int m_league_size; + + public: + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_shmem() const { + return m_team_shared.set_team_thread_mode(0, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_scratch( + const int& level) const { + return m_team_shared.set_team_thread_mode(level, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& thread_scratch( + const int& level) const { + return m_team_shared.set_team_thread_mode(level, team_size(), team_rank()); + } + + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; } + KOKKOS_INLINE_FUNCTION int team_rank() const { +#ifdef __HIP_DEVICE_COMPILE__ + return threadIdx.y; +#else + return 0; +#endif + } + + KOKKOS_INLINE_FUNCTION int team_size() const { +#ifdef __HIP_DEVICE_COMPILE__ + return blockDim.y; +#else + return 0; +#endif + } + + KOKKOS_INLINE_FUNCTION void team_barrier() const { +#ifdef __HIP_DEVICE_COMPILE__ + if (1 == 
blockDim.z) + __syncthreads(); // team == block + else + __threadfence_block(); // team <= warp +#endif + } + + //-------------------------------------------------------------------------- + + template <class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& val, + const int& thread_id) const { +#ifdef __HIP_DEVICE_COMPILE__ + if (blockDim.z == 1) { // team == block + __syncthreads(); + // Wait for shared data write until all threads arrive here + if (threadIdx.x == 0u && + threadIdx.y == static_cast<uint32_t>(thread_id)) { + *(reinterpret_cast<ValueType*>(m_team_reduce)) = val; + } + __syncthreads(); // Wait for shared data read until root thread writes + val = *(reinterpret_cast<ValueType*>(m_team_reduce)); + } else { // team <= warp + ValueType tmp(val); // input might not be a register variable + ::Kokkos::Experimental::Impl::in_place_shfl( + val, tmp, blockDim.x * thread_id, blockDim.x * blockDim.y); + } +#else + (void)val; + (void)thread_id; +#endif + } + + template <class Closure, class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(Closure const& f, ValueType& val, + const int& thread_id) const { + f(val); + team_broadcast(val, thread_id); + } + + //-------------------------------------------------------------------------- + /**\brief Reduction across a team + * + * Mapping of teams onto blocks: + * blockDim.x is "vector lanes" + * blockDim.y is team "threads" + * blockDim.z is number of teams per block + * + * Requires: + * blockDim.x is power two + * blockDim.x <= HIPTraits::WarpSize + * ( 0 == HIPTraits::WarpSize % ( blockDim.x * blockDim.y ) + * OR + * ( 1 == blockDim.z ) + */ + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<is_reducer<ReducerType>::value>::type + team_reduce(ReducerType const& reducer) const noexcept { + team_reduce(reducer, reducer.reference()); + } + + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<is_reducer<ReducerType>::value>::type + 
      team_reduce(ReducerType const& reducer,
                  typename ReducerType::value_type& value) const noexcept {
#ifdef __HIP_DEVICE_COMPILE__
    // Delegate to the shuffle-based intra-block reduction over the team's
    // blockDim.y "threads".
    hip_intra_block_shuffle_reduction(reducer, value, blockDim.y);
#else
    // Host pass-through: silence unused-parameter warnings.
    (void)reducer;
    (void)value;
#endif
  }

  //--------------------------------------------------------------------------
  /** \brief Intra-team exclusive prefix sum with team_rank() ordering
   *         with intra-team non-deterministic ordering accumulation.
   *
   *  The global inter-team accumulation value will, at the end of the
   *  league's parallel execution, be the scan's total.
   *  Parallel execution ordering of the league's teams is non-deterministic.
   *  As such the base value for each team's scan operation is similarly
   *  non-deterministic.
   */
  template <typename Type>
  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value,
                                        Type* const global_accum) const {
#ifdef __HIP_DEVICE_COMPILE__
    // Team-shared scratch used as the scan workspace.
    Type* const base_data = reinterpret_cast<Type*>(m_team_reduce);

    __syncthreads();  // Don't write into shared data until all threads have
                      // entered this function

    // Exclusive scan: element 0 is the identity.
    if (0 == threadIdx.y) {
      base_data[0] = 0;
    }

    base_data[threadIdx.y + 1] = value;

    Impl::hip_intra_block_reduce_scan<true, Impl::HIPJoinFunctor<Type>, void>(
        Impl::HIPJoinFunctor<Type>(), base_data + 1);

    if (global_accum) {
      // Highest-rank thread atomically folds the team total into the global
      // accumulator and records the prior global value as this team's base.
      if (blockDim.y == threadIdx.y + 1) {
        base_data[blockDim.y] =
            atomic_fetch_add(global_accum, base_data[blockDim.y]);
      }
      __syncthreads();  // Wait for atomic
      base_data[threadIdx.y] += base_data[blockDim.y];
    }

    return base_data[threadIdx.y];
#else
    (void)value;
    (void)global_accum;
    return Type();
#endif
  }

  /** \brief Intra-team exclusive prefix sum with team_rank() ordering.
   *
   * The highest rank thread can compute the reduction total as
   * reduction_total = dev.team_scan( value ) + value ;
   */
  template <typename Type>
  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const {
    // No global accumulator: purely intra-team scan.
    return this->template team_scan<Type>(value, nullptr);
  }

  //----------------------------------------

  // Reduce across the vector lanes (blockDim.x) of the calling "thread",
  // leaving the result in the reducer's reference.
  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION static
      typename std::enable_if<is_reducer<ReducerType>::value>::type
      vector_reduce(ReducerType const& reducer) {
    vector_reduce(reducer, reducer.reference());
  }

  // Reduce 'value' across the vector lanes; on return every lane holds the
  // reduced result in both 'value' and the reducer's reference.
  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION static
      typename std::enable_if<is_reducer<ReducerType>::value>::type
      vector_reduce(ReducerType const& reducer,
                    typename ReducerType::value_type& value) {
#ifdef __HIP_DEVICE_COMPILE__
    // Single-lane vectors need no reduction.
    if (blockDim.x == 1) return;

    // Intra vector lane shuffle reduction:
    typename ReducerType::value_type tmp(value);
    typename ReducerType::value_type tmp2 = tmp;

    for (int i = blockDim.x; (i >>= 1);) {
      ::Kokkos::Experimental::Impl::in_place_shfl_down(tmp2, tmp, i,
                                                       blockDim.x);
      if (static_cast<int>(threadIdx.x) < i) {
        reducer.join(tmp, tmp2);
      }
    }

    // Broadcast from root lane to all other lanes.
    // Cannot use "butterfly" algorithm to avoid the broadcast
    // because floating point summation is not associative
    // and thus different threads could have different results.
+ + ::Kokkos::Experimental::Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x); + value = tmp2; + reducer.reference() = tmp2; +#else + (void)reducer; + (void)value; +#endif + } + + //-------------------------------------------------------------------------- + /**\brief Global reduction across all blocks + * + * Return !0 if reducer contains the final value + */ + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION static + typename std::enable_if<is_reducer<ReducerType>::value, int>::type + global_reduce(ReducerType const& reducer, int* const global_scratch_flags, + void* const global_scratch_space, void* const shmem, + int const shmem_size) { +#ifdef __HIP_DEVICE_COMPILE__ + using value_type = typename ReducerType::value_type; + using pointer_type = value_type volatile*; + + // Number of shared memory entries for the reduction: + const int nsh = shmem_size / sizeof(value_type); + + // Number of HIP threads in the block, rank within the block + const int nid = blockDim.x * blockDim.y * blockDim.z; + const int tid = + threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); + + // Reduces within block using all available shared memory + // Contributes if it is the root "vector lane" + + // wn == number of warps in the block + // wx == which lane within the warp + // wy == which warp within the block + + const int wn = (nid + Experimental::Impl::HIPTraits::WarpIndexMask) >> + Experimental::Impl::HIPTraits::WarpIndexShift; + const int wx = tid & Experimental::Impl::HIPTraits::WarpIndexMask; + const int wy = tid >> Experimental::Impl::HIPTraits::WarpIndexShift; + + //------------------------ + { // Intra warp shuffle reduction from contributing HIP threads + + value_type tmp(reducer.reference()); + + int constexpr warp_size = + ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; + for (int i = warp_size; static_cast<int>(blockDim.x) <= (i >>= 1);) { + Experimental::Impl::in_place_shfl_down(reducer.reference(), tmp, i, + warp_size); + + // Root of each vector 
lane reduces "thread" contribution + if (0 == threadIdx.x && wx < i) { + reducer.join(&tmp, reducer.data()); + } + } + + // Reduce across warps using shared memory. + // Number of warps may not be power of two. + + __syncthreads(); // Wait before shared data write + + // Number of shared memory entries for the reduction + // is at most one per warp + const int nentry = wn < nsh ? wn : nsh; + + if (0 == wx && wy < nentry) { + // Root thread of warp 'wy' has warp's value to contribute + (reinterpret_cast<value_type*>(shmem))[wy] = tmp; + } + + __syncthreads(); // Wait for write to be visible to block + + // When more warps than shared entries + // then warps must take turns joining their contribution + // to the designated shared memory entry. + for (int i = nentry; i < wn; i += nentry) { + const int k = wy - i; + + if (0 == wx && i <= wy && k < nentry) { + // Root thread of warp 'wy' has warp's value to contribute + reducer.join((reinterpret_cast<value_type*>(shmem)) + k, &tmp); + } + + __syncthreads(); // Wait for write to be visible to block + } + + // One warp performs the inter-warp reduction: + + if (0 == wy) { + // Start fan-in at power of two covering nentry + + for (int i = (1 << (warp_size - __clz(nentry - 1))); (i >>= 1);) { + const int k = wx + i; + if (wx < i && k < nentry) { + reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx, + (reinterpret_cast<pointer_type>(shmem)) + k); + __threadfence_block(); // Wait for write to be visible to warp + } + } + } + } + //------------------------ + { // Write block's value to global_scratch_memory + + int last_block = 0; + + if (0 == wx) { + reducer.copy((reinterpret_cast<pointer_type>(global_scratch_space)) + + blockIdx.x * reducer.length(), + reducer.data()); + + __threadfence(); // Wait until global write is visible. 
+ + last_block = static_cast<int>(gridDim.x) == + 1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1); + + // If last block then reset count + if (last_block) *global_scratch_flags = 0; + } + + // FIXME hip does not support __syncthreads_or so we need to do it by hand + // last_block = __syncthreads_or(last_block); + + __shared__ int last_block_shared; + if (last_block) last_block_shared = last_block; + __threadfence_block(); + + if (!last_block_shared) return 0; + } + //------------------------ + // Last block reads global_scratch_memory into shared memory. + + const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh) + : (gridDim.x < nsh ? gridDim.x : nsh); + + // nentry = min( nid , nsh , gridDim.x ) + + // whole block reads global memory into shared memory: + + if (tid < nentry) { + const int offset = tid * reducer.length(); + + reducer.copy( + (reinterpret_cast<pointer_type>(shmem)) + offset, + (reinterpret_cast<pointer_type>(global_scratch_space)) + offset); + + for (int i = nentry + tid; i < static_cast<int>(gridDim.x); i += nentry) { + reducer.join((reinterpret_cast<pointer_type>(shmem)) + offset, + (reinterpret_cast<pointer_type>(global_scratch_space)) + + i * reducer.length()); + } + } + + __syncthreads(); // Wait for writes to be visible to block + + if (0 == wy) { + // Iterate to reduce shared memory to single warp fan-in size + + int constexpr warp_size = + ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; + const int nreduce = warp_size < nentry ? 
warp_size : nentry; + + if (wx < nreduce && nreduce < nentry) { + for (int i = nreduce + wx; i < nentry; i += nreduce) { + reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i); + } + __threadfence_block(); // Wait for writes to be visible to warp + } + + // Start fan-in at power of two covering nentry + + for (int i = (1 << (warp_size - __clz(nreduce - 1))); (i >>= 1);) { + const int k = wx + i; + if (wx < i && k < nreduce) { + reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx, + (reinterpret_cast<pointer_type>(shmem)) + k); + __threadfence_block(); // Wait for writes to be visible to warp + } + } + + if (0 == wx) { + reducer.copy(reducer.data(), reinterpret_cast<pointer_type>(shmem)); + return 1; + } + } + return 0; +#else + (void)reducer; + (void)global_scratch_flags; + (void)global_scratch_space; + (void)shmem; + (void)shmem_size; + return 0; +#endif + } + + //---------------------------------------- + // Private for the driver + + KOKKOS_INLINE_FUNCTION + HIPTeamMember(void* shared, const int shared_begin, const int shared_size, + void* scratch_level_1_ptr, const int scratch_level_1_size, + const int arg_league_rank, const int arg_league_size) + : m_team_reduce(shared), + m_team_shared(((char*)shared) + shared_begin, shared_size, + scratch_level_1_ptr, scratch_level_1_size), + m_team_reduce_size(shared_begin), + m_league_rank(arg_league_rank), + m_league_size(arg_league_size) {} + + public: + // Declare to avoid unused private member warnings which are trigger + // when SFINAE excludes the member function which uses these variables + // Making another class a friend also surpresses these warnings + bool impl_avoid_sfinae_warning() const noexcept { + return m_team_reduce_size > 0 && m_team_reduce != nullptr; + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos 
{ +namespace Impl { + +template <typename iType> +struct TeamThreadRangeBoundariesStruct<iType, HIPTeamMember> { + using index_type = iType; + const HIPTeamMember& member; + const iType start; + const iType end; + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct(const HIPTeamMember& thread_, iType count) + : member(thread_), start(0), end(count) {} + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct(const HIPTeamMember& thread_, iType begin_, + iType end_) + : member(thread_), start(begin_), end(end_) {} +}; + +template <typename iType> +struct TeamVectorRangeBoundariesStruct<iType, HIPTeamMember> { + using index_type = iType; + const HIPTeamMember& member; + const iType start; + const iType end; + + KOKKOS_INLINE_FUNCTION + TeamVectorRangeBoundariesStruct(const HIPTeamMember& thread_, + const iType& count) + : member(thread_), start(0), end(count) {} + + KOKKOS_INLINE_FUNCTION + TeamVectorRangeBoundariesStruct(const HIPTeamMember& thread_, + const iType& begin_, const iType& end_) + : member(thread_), start(begin_), end(end_) {} +}; + +template <typename iType> +struct ThreadVectorRangeBoundariesStruct<iType, HIPTeamMember> { + using index_type = iType; + const index_type start; + const index_type end; + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct(const HIPTeamMember, index_type count) + : start(static_cast<index_type>(0)), end(count) {} + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct(index_type count) + : start(static_cast<index_type>(0)), end(count) {} + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct(const HIPTeamMember, index_type arg_begin, + index_type arg_end) + : start(arg_begin), end(arg_end) {} + + KOKKOS_INLINE_FUNCTION + ThreadVectorRangeBoundariesStruct(index_type arg_begin, index_type arg_end) + : start(arg_begin), end(arg_end) {} +}; + +} // namespace Impl + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HIPTeamMember> + 
TeamThreadRange(const Impl::HIPTeamMember& thread, iType count) { + return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HIPTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, Impl::HIPTeamMember> +TeamThreadRange(const Impl::HIPTeamMember& thread, iType1 begin, iType2 end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HIPTeamMember>( + thread, iType(begin), iType(end)); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::TeamVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember> + TeamVectorRange(const Impl::HIPTeamMember& thread, const iType& count) { + return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, Impl::HIPTeamMember> +TeamVectorRange(const Impl::HIPTeamMember& thread, const iType1& begin, + const iType2& end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>( + thread, iType(begin), iType(end)); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember> + ThreadVectorRange(const Impl::HIPTeamMember& thread, iType count) { + return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, Impl::HIPTeamMember> +ThreadVectorRange(const Impl::HIPTeamMember& thread, iType1 arg_begin, + iType2 arg_end) { + using iType = typename std::common_type<iType1, iType2>::type; + return 
Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>( + thread, iType(arg_begin), iType(arg_end)); +} + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct<Impl::HIPTeamMember> PerTeam( + const Impl::HIPTeamMember& thread) { + return Impl::ThreadSingleStruct<Impl::HIPTeamMember>(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct<Impl::HIPTeamMember> PerThread( + const Impl::HIPTeamMember& thread) { + return Impl::VectorSingleStruct<Impl::HIPTeamMember>(thread); +} + +//---------------------------------------------------------------------------- + +/** \brief Inter-thread parallel_for. + * + * Executes closure(iType i) for each i=[0..N). + * + * The range [0..N) is mapped to all threads of the the calling thread team. + */ +template <typename iType, class Closure> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HIPTeamMember>& + loop_boundaries, + const Closure& closure) { +#ifdef __HIP_DEVICE_COMPILE__ + for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; + i += blockDim.y) + closure(i); +#else + (void)loop_boundaries; + (void)closure; +#endif +} + +//---------------------------------------------------------------------------- + +/** \brief Inter-thread parallel_reduce with a reducer. + * + * Executes closure(iType i, ValueType & val) for each i=[0..N) + * + * The range [0..N) is mapped to all threads of the + * calling thread team and a summation of val is + * performed and put into result. 
+ */ +template <typename iType, class Closure, class ReducerType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type + parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::HIPTeamMember>& loop_boundaries, + const Closure& closure, const ReducerType& reducer) { +#ifdef __HIP_DEVICE_COMPILE__ + typename ReducerType::value_type value; + reducer.init(value); + + for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; + i += blockDim.y) { + closure(i, value); + } + + loop_boundaries.member.team_reduce(reducer, value); +#else + (void)loop_boundaries; + (void)closure; + (void)reducer; +#endif +} + +/** \brief Inter-thread parallel_reduce assuming summation. + * + * Executes closure(iType i, ValueType & val) for each i=[0..N) + * + * The range [0..N) is mapped to all threads of the + * calling thread team and a summation of val is + * performed and put into result. + */ +template <typename iType, class Closure, typename ValueType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type + parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::HIPTeamMember>& loop_boundaries, + const Closure& closure, ValueType& result) { +#ifdef __HIP_DEVICE_COMPILE__ + ValueType val; + Kokkos::Sum<ValueType> reducer(val); + + reducer.init(reducer.reference()); + + for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; + i += blockDim.y) { + closure(i, val); + } + + loop_boundaries.member.team_reduce(reducer, val); + result = reducer.reference(); +#else + (void)loop_boundaries; + (void)closure; + (void)result; +#endif +} + +/** \brief Inter-thread parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to each rank in the team (whose global rank is + * less than N) and a scan operation is performed. 
The last call to closure has
 * final == true.
 */
// This is the same code as in CUDA and largely the same as in OpenMPTarget
template <typename iType, typename FunctorType>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HIPTeamMember>&
        loop_bounds,
    const FunctorType& lambda) {
  // Extract value_type from lambda
  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
      FunctorType>::value_type;

  const auto start     = loop_bounds.start;
  const auto end       = loop_bounds.end;
  auto& member         = loop_bounds.member;
  const auto team_size = member.team_size();
  const auto team_rank = member.team_rank();
  // Number of team-size-wide chunks needed to cover [start, end)
  const auto nchunk = (end - start + team_size - 1) / team_size;
  // Running exclusive-prefix offset carried from one chunk into the next
  value_type accum = 0;
  // each team has to process one or more chunks of the prefix scan
  for (iType i = 0; i < nchunk; ++i) {
    auto ii = start + i * team_size + team_rank;
    // local accumulation for this chunk
    value_type local_accum = 0;
    // user updates value with prefix value (first, non-final pass)
    if (ii < loop_bounds.end) lambda(ii, local_accum, false);
    // perform team scan
    local_accum = member.team_scan(local_accum);
    // add this blocks accum to total accumulation
    auto val = accum + local_accum;
    // user updates their data with total accumulation (second, final pass)
    if (ii < loop_bounds.end) lambda(ii, val, true);
    // the last value needs to be propagated to next chunk
    if (team_rank == team_size - 1) accum = val;
    // broadcast last value to rest of the team
    member.team_broadcast(accum, team_size - 1);
  }
}

// Team-vector parallel_for: the range [start, end) is distributed over all
// threads (threadIdx.y) and all vector lanes (threadIdx.x) of the team.
template <typename iType, class Closure>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>&
        loop_boundaries,
    const Closure& closure) {
#ifdef __HIP_DEVICE_COMPILE__
  for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x;
       i < loop_boundaries.end; i += blockDim.y * blockDim.x)
    closure(i);
#else
  (void)loop_boundaries;
  (void)closure;
#endif
}

// Team-vector parallel_reduce with a reducer: per-lane partials are reduced
// first across vector lanes, then across the team's threads.
template <typename iType, class Closure, class ReducerType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                        iType, Impl::HIPTeamMember>& loop_boundaries,
                    const Closure& closure, const ReducerType& reducer) {
#ifdef __HIP_DEVICE_COMPILE__
  typename ReducerType::value_type value;
  reducer.init(value);

  for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x;
       i < loop_boundaries.end; i += blockDim.y * blockDim.x) {
    closure(i, value);
  }

  loop_boundaries.member.vector_reduce(reducer, value);
  loop_boundaries.member.team_reduce(reducer, value);
#else
  (void)loop_boundaries;
  (void)closure;
  (void)reducer;
#endif
}

// Team-vector parallel_reduce assuming summation into `result`.
template <typename iType, class Closure, typename ValueType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                        iType, Impl::HIPTeamMember>& loop_boundaries,
                    const Closure& closure, ValueType& result) {
#ifdef __HIP_DEVICE_COMPILE__
  ValueType val;
  Kokkos::Sum<ValueType> reducer(val);

  reducer.init(reducer.reference());

  for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x;
       i < loop_boundaries.end; i += blockDim.y * blockDim.x) {
    closure(i, val);
  }

  // NOTE(review): unlike the reducer overload above, these calls do not pass
  // the partial value explicitly — presumably the single-argument overloads
  // operate on the reference bound inside `reducer`; confirm against
  // HIPTeamMember::vector_reduce/team_reduce.
  loop_boundaries.member.vector_reduce(reducer);
  loop_boundaries.member.team_reduce(reducer);
  result = reducer.reference();
#else
  (void)loop_boundaries;
  (void)closure;
  (void)result;
#endif
}

//----------------------------------------------------------------------------

/** \brief Intra-thread vector parallel_for.
 *
 * Executes closure(iType i) for each i=[0..N)
 *
 * The range [0..N) is mapped to all vector lanes of the calling thread.
 */
template <typename iType, class Closure>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>&
        loop_boundaries,
    const Closure& closure) {
#ifdef __HIP_DEVICE_COMPILE__
  // Each vector lane starts at its own offset and strides by the vector
  // length (blockDim.x).
  for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end;
       i += blockDim.x) {
    closure(i);
  }
#else
  (void)loop_boundaries;
  (void)closure;
#endif
}

//----------------------------------------------------------------------------

/** \brief Intra-thread vector parallel_reduce.
 *
 * Calls closure(iType i, ValueType & val) for each i=[0..N).
 *
 * The range [0..N) is mapped to all vector lanes of
 * the calling thread and a reduction of val is performed using +=
 * and output into result.
 *
 * The identity value for the += operator is assumed to be the default
 * constructed value.
 */
template <typename iType, class Closure, class ReducerType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<is_reducer<ReducerType>::value>::type
    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                        iType, Impl::HIPTeamMember> const& loop_boundaries,
                    Closure const& closure, ReducerType const& reducer) {
#ifdef __HIP_DEVICE_COMPILE__
  reducer.init(reducer.reference());

  for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end;
       i += blockDim.x) {
    closure(i, reducer.reference());
  }

  // Combine the per-lane partials across the vector lanes of this thread.
  Impl::HIPTeamMember::vector_reduce(reducer);
#else
  (void)loop_boundaries;
  (void)closure;
  (void)reducer;
#endif
}

/** \brief Intra-thread vector parallel_reduce.
 *
 * Calls closure(iType i, ValueType & val) for each i=[0..N).
 *
 * The range [0..N) is mapped to all vector lanes of
 * the calling thread and a reduction of val is performed using +=
 * and output into result.
 *
 * The identity value for the += operator is assumed to be the default
 * constructed value.
 */
template <typename iType, class Closure, typename ValueType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<!is_reducer<ValueType>::value>::type
    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                        iType, Impl::HIPTeamMember> const& loop_boundaries,
                    Closure const& closure, ValueType& result) {
#ifdef __HIP_DEVICE_COMPILE__
  result = ValueType();

  for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end;
       i += blockDim.x) {
    closure(i, result);
  }

  // Sum the per-lane partials; the temporary reducer binds `result`.
  Impl::HIPTeamMember::vector_reduce(Kokkos::Sum<ValueType>(result));
#else
  (void)loop_boundaries;
  (void)closure;
  (void)result;
#endif
}

//----------------------------------------------------------------------------

/** \brief Intra-thread vector parallel scan with reducer.
 *
 * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
 *
 * The range [0..N) is mapped to all vector lanes in the
 * thread and a scan operation is performed.
 * The last call to closure has final == true.
 */
template <typename iType, class Closure, typename ReducerType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
                      iType, Impl::HIPTeamMember>& loop_boundaries,
                  const Closure& closure, const ReducerType& reducer) {
#ifdef __HIP_DEVICE_COMPILE__
  using value_type = typename ReducerType::value_type;
  value_type accum;
  reducer.init(accum);
  // Keep a copy of the reducer's identity for re-initializing lane values.
  const value_type identity = accum;

  // Loop through boundaries by vector-length chunks
  // must scan at each iteration

  // All thread "lanes" must loop the same number of times.
  // Determine a loop end for all thread "lanes."
  // Requires:
  //   blockDim.x is power of two and thus
  //     ( end % blockDim.x ) == ( end & ( blockDim.x - 1 ) )
  //   1 <= blockDim.x <= HIPTraits::WarpSize

  const int mask = blockDim.x - 1;
  const int rem  = loop_boundaries.end & mask;  // == end % blockDim.x
  const int end  = loop_boundaries.end + (rem ? blockDim.x - rem : 0);

  for (int i = threadIdx.x; i < end; i += blockDim.x) {
    value_type val = identity;

    // First acquire per-lane contributions.
    // This sets i's val to i-1's contribution
    // to make the latter in_place_shfl_up an
    // exclusive scan -- the final accumulation
    // of i's val will be included in the second
    // closure call later.
    if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false);

    // Bottom up exclusive scan in triangular pattern
    // where each HIP thread is the root of a reduction tree
    // from the zeroth "lane" to itself.
    //  [t] += [t-1] if t >= 1
    //  [t] += [t-2] if t >= 2
    //  [t] += [t-4] if t >= 4
    //  ...
    // This differs from the non-reducer overload, where an inclusive scan was
    // implemented, because in general the binary operator cannot be inverted
    // and we would not be able to remove the inclusive contribution by
    // inversion.
    for (int j = 1; j < static_cast<int>(blockDim.x); j <<= 1) {
      value_type tmp = identity;
      ::Kokkos::Experimental::Impl::in_place_shfl_up(tmp, val, j, blockDim.x);
      if (j <= static_cast<int>(threadIdx.x)) {
        reducer.join(val, tmp);
      }
    }

    // Include accumulation
    reducer.join(val, accum);

    // Update i's contribution into the val
    // and add it to accum for next round
    if (i < loop_boundaries.end) closure(i, val, true);
    // Highest lane now holds the inclusive total of this chunk; broadcast it
    // so every lane carries it into the next chunk.
    ::Kokkos::Experimental::Impl::in_place_shfl(accum, val, blockDim.x - 1,
                                                blockDim.x);
  }
#else
  (void)loop_boundaries;
  (void)closure;
  (void)reducer;
#endif
}

//----------------------------------------------------------------------------

/** \brief Intra-thread vector parallel exclusive prefix sum.
+ * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. + */ +template <typename iType, class Closure> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HIPTeamMember>& + loop_boundaries, + const Closure& closure) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + value_type dummy; + parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>(dummy)); +} + +} // namespace Kokkos + +namespace Kokkos { + +template <class FunctorType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct<Impl::HIPTeamMember>&, + const FunctorType& lambda) { +#ifdef __HIP_DEVICE_COMPILE__ + if (threadIdx.x == 0) lambda(); +#else + (void)lambda; +#endif +} + +template <class FunctorType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct<Impl::HIPTeamMember>&, + const FunctorType& lambda) { +#ifdef __HIP_DEVICE_COMPILE__ + if (threadIdx.x == 0 && threadIdx.y == 0) lambda(); +#else + (void)lambda; +#endif +} + +template <class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct<Impl::HIPTeamMember>&, + const FunctorType& lambda, ValueType& val) { +#ifdef __HIP_DEVICE_COMPILE__ + if (threadIdx.x == 0) lambda(val); + ::Kokkos::Experimental::Impl::in_place_shfl(val, val, 0, blockDim.x); +#else + (void)lambda; + (void)val; +#endif +} + +template <class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct<Impl::HIPTeamMember>& single_struct, + const FunctorType& lambda, ValueType& val) { +#ifdef __HIP_DEVICE_COMPILE__ + if (threadIdx.x == 0 && threadIdx.y == 0) { + lambda(val); + } + single_struct.team_member.team_broadcast(val, 0); +#else + 
(void)single_struct; + (void)lambda; + (void)val; +#endif +} + +} // namespace Kokkos + +#endif /* defined( __HIPCC__ ) */ + +#endif /* #ifndef KOKKOS_HIP_TEAM_HPP */ diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f7e38a508b1696fe09701bc5b01de4cecd2d1344 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp @@ -0,0 +1,129 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_HIP_UNIQUE_TOKEN_HPP
#define KOKKOS_HIP_UNIQUE_TOKEN_HPP

#include <impl/Kokkos_ConcurrentBitset.hpp>
#include <Kokkos_HIP_Space.hpp>
#include <Kokkos_UniqueToken.hpp>

namespace Kokkos {
namespace Experimental {

// both global and instance Unique Tokens are implemented in the same way:
// a concurrent bitset in device-accessible memory hands out token ids.
template <>
class UniqueToken<HIP, UniqueTokenScope::Global> {
 protected:
  // Backing storage of the concurrent bitset (one bit per token).
  uint32_t volatile* m_buffer;
  // Number of distinct token values, i.e. the exclusive upper bound of ids.
  uint32_t m_count;

 public:
  using execution_space = HIP;
  using size_type       = int32_t;

  // Global scope: reuse the HIP backend's pre-allocated scratch bitset and
  // size the token pool by the device's concurrency.
  explicit UniqueToken(execution_space const& = execution_space())
      : m_buffer(Impl::HIPInternal::singleton().m_scratchConcurrentBitset),
        m_count(HIP::concurrency()) {}

  KOKKOS_DEFAULTED_FUNCTION
  UniqueToken(const UniqueToken&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  UniqueToken(UniqueToken&&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  UniqueToken& operator=(const UniqueToken&) = default;

  KOKKOS_DEFAULTED_FUNCTION
  UniqueToken& operator=(UniqueToken&&) = default;

  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
  KOKKOS_INLINE_FUNCTION
  size_type size() const noexcept { return m_count; }

  /// \brief acquire value such that 0 <= value < size()
  ///
  /// Aborts if no token is available. The clock-based start index spreads
  /// concurrent callers across the bitset to reduce contention.
  KOKKOS_INLINE_FUNCTION
  size_type acquire() const {
    const Kokkos::pair<int, int> result =
        Kokkos::Impl::concurrent_bitset::acquire_bounded(
            m_buffer, m_count, Kokkos::Impl::clock_tic() % m_count);

    if (result.first < 0) {
      Kokkos::abort(
          "UniqueToken<HIP> failure to acquire tokens, no tokens available");
    }

    return result.first;
  }

  /// \brief release an acquired value
  KOKKOS_INLINE_FUNCTION
  void release(size_type i) const noexcept {
    Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
  }
};

// Instance scope: owns its own bitset allocation instead of sharing the
// backend-wide scratch bitset.
template <>
class UniqueToken<HIP, UniqueTokenScope::Instance>
    : public UniqueToken<HIP, UniqueTokenScope::Global> {
  // Owning allocation; the base-class m_buffer is re-pointed at its data.
  View<uint32_t*, HIPSpace> m_buffer_view;

 public:
  explicit UniqueToken(execution_space const& arg = execution_space())
      : UniqueToken<HIP, UniqueTokenScope::Global>(arg) {}

  // NOTE(review): the base subobject is default-constructed here (binding to
  // the global scratch bitset) and then its members are overwritten below —
  // intentional, but worth confirming against the base constructor.
  UniqueToken(size_type max_size, execution_space const& = execution_space())
      : m_buffer_view(
            "UniqueToken::m_buffer_view",
            ::Kokkos::Impl::concurrent_bitset::buffer_bound(max_size)) {
    m_buffer = m_buffer_view.data();
    m_count  = max_size;
  }
};

}  // namespace Experimental
}  // namespace Kokkos

#endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c5ca89a9fdeb61bd7965df1b706443b96b294180
--- /dev/null
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp
@@ -0,0 +1,215 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_HIP_VECTORIZATION_HPP
#define KOKKOS_HIP_VECTORIZATION_HPP

#include <Kokkos_Macros.hpp>

namespace Kokkos {
namespace Experimental {
namespace Impl {

//----------------------------------------------------------------------------
// Shuffle operations require input to be a register (stack) variable

// Derived implements do_shfl_op( T& in, int lane, int width),
// which turns in to one of __shfl_XXX
// Since the logic with respect to value sizes, etc., is the same everywhere,
// put it all in one place.
template <class Derived>
struct in_place_shfl_op {
  // CRTP boilerplate
  __device__ KOKKOS_IMPL_FORCEINLINE const Derived& self() const noexcept {
    return *static_cast<Derived const*>(this);
  }

  // FIXME_HIP depends on UB
  // sizeof(Scalar) < sizeof(int) case: pun the value through a union so it
  // can travel over the int-sized shuffle intrinsic.
  template <class Scalar>
  // requires _assignable_from_bits<Scalar>
  __device__ inline typename std::enable_if<sizeof(Scalar) < sizeof(int)>::type
  operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const
      noexcept {
    using shfl_type = int;
    union conv_type {
      Scalar orig;
      shfl_type conv;
    };
    conv_type tmp_in;
    tmp_in.orig = in;
    conv_type tmp_out;
    tmp_out.conv = tmp_in.conv;
    conv_type res;
    //------------------------------------------------
    res.conv = self().do_shfl_op(
        reinterpret_cast<shfl_type const&>(tmp_out.conv), lane_or_delta, width);
    //------------------------------------------------
    out = res.orig;
  }

  // sizeof(Scalar) == sizeof(int) case: shuffle directly as int bits.
  template <class Scalar>
  // requires _assignable_from_bits<Scalar>
  __device__ inline typename std::enable_if<sizeof(Scalar) == sizeof(int)>::type
  operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const
      noexcept {
    reinterpret_cast<int&>(out) = self().do_shfl_op(
        reinterpret_cast<int const&>(in), lane_or_delta, width);
  }

  // sizeof(Scalar) == sizeof(double) case: shuffle as double bits.
  template <class Scalar>
  __device__ inline
      typename std::enable_if<sizeof(Scalar) == sizeof(double)>::type
      operator()(Scalar& out, Scalar const& in, int lane_or_delta,
                 int width) const noexcept {
    reinterpret_cast<double&>(out) = self().do_shfl_op(
        *reinterpret_cast<double const*>(&in), lane_or_delta, width);
  }

  // sizeof(Scalar) > sizeof(double) case: shuffle the value int-by-int.
  template <typename Scalar>
  __device__ inline
      typename std::enable_if<(sizeof(Scalar) > sizeof(double))>::type
      operator()(Scalar& out, const Scalar& val, int lane_or_delta,
                 int width) const noexcept {
    using shuffle_as_t = int;
    int constexpr N    = sizeof(Scalar) / sizeof(shuffle_as_t);

    for (int i = 0; i < N; ++i) {
      reinterpret_cast<shuffle_as_t*>(&out)[i] = self().do_shfl_op(
          reinterpret_cast<shuffle_as_t const*>(&val)[i], lane_or_delta, width);
    }
  }
};

// Broadcast-style shuffle: read the value held by the given lane.
struct in_place_shfl_fn : in_place_shfl_op<in_place_shfl_fn> {
  template <class T>
  __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(T& val, int lane,
                                                  int width) const noexcept {
    // FIXME_HIP Not sure why there is a race condition here. Note that the
    // problem was also found in the CUDA backend with CUDA clang
    // (https://github.com/kokkos/kokkos/issues/941) but it seems more limited
    // in CUDA clang.
    auto return_val = __shfl(val, lane, width);
    __threadfence();
    return return_val;
  }
};

template <class... Args>
__device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept {
  in_place_shfl_fn{}((Args &&) args...);
}

// Upward shuffle: read the value held by the lane `delta` positions below.
struct in_place_shfl_up_fn : in_place_shfl_op<in_place_shfl_up_fn> {
  template <class T>
  __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(T& val, int lane,
                                                  int width) const noexcept {
    // FIXME_HIP Not sure why there is a race condition here. Note that the
    // problem was also found in the CUDA backend with CUDA clang
    // (https://github.com/kokkos/kokkos/issues/941) but it seems more limited
    // in CUDA clang.
    auto return_val = __shfl_up(val, lane, width);
    __threadfence();
    return return_val;
  }
};

template <class... Args>
__device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up(
    Args&&... args) noexcept {
  in_place_shfl_up_fn{}((Args &&) args...);
}

// Downward shuffle: read the value held by the lane `delta` positions above.
struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> {
  template <class T>
  __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(T& val, int lane,
                                                  int width) const noexcept {
    // FIXME_HIP Not sure why there is a race condition here. Note that the
    // problem was also found in the CUDA backend with CUDA clang
    // (https://github.com/kokkos/kokkos/issues/941) but it seems more limited
    // in CUDA clang.
    auto return_val = __shfl_down(val, lane, width);
    __threadfence();
    return return_val;
  }
};

template <class... Args>
__device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down(
    Args&&... args) noexcept {
  in_place_shfl_down_fn{}((Args &&) args...);
}

}  // namespace Impl

// Value-returning wrappers around the in-place shuffles.
template <class T>
// requires default_constructible<T> && _assignable_from_bits<T>
__device__ inline T shfl(const T& val, const int& srcLane, const int& width) {
  T rv = {};
  Impl::in_place_shfl(rv, val, srcLane, width);
  return rv;
}

template <class T>
// requires default_constructible<T> && _assignable_from_bits<T>
__device__ inline T shfl_down(const T& val, int delta, int width) {
  T rv = {};
  Impl::in_place_shfl_down(rv, val, delta, width);
  return rv;
}

template <class T>
// requires default_constructible<T> && _assignable_from_bits<T>
__device__ inline T shfl_up(const T& val, int delta, int width) {
  T rv = {};
  Impl::in_place_shfl_up(rv, val, delta, width);
  return rv;
}

}  // namespace Experimental
}  // namespace Kokkos

#endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_WorkGraphPolicy.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_WorkGraphPolicy.hpp
new file mode 100644
index
0000000000000000000000000000000000000000..3e053d8f14a5ce5211ac6687851c6dd807c56d94 --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_WorkGraphPolicy.hpp @@ -0,0 +1,112 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef KOKKOS_HIP_WORKGRAPHPOLICY_HPP
#define KOKKOS_HIP_WORKGRAPHPOLICY_HPP

#include <Kokkos_HIP.hpp>

namespace Kokkos {
namespace Impl {

// ParallelFor specialization executing a WorkGraphPolicy on the HIP backend:
// every thread repeatedly pops ready work items from the shared work graph
// and marks them completed until the policy reports completion.
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
                  Kokkos::Experimental::HIP> {
 public:
  using Policy = Kokkos::WorkGraphPolicy<Traits...>;
  using Self = ParallelFor<FunctorType, Policy, Kokkos::Experimental::HIP>;

 private:
  Policy m_policy;
  FunctorType m_functor;

  // Invoke the functor for work item w (untagged policy).
  template <class TagType>
  __device__ inline
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_one(const std::int32_t w) const noexcept {
    m_functor(w);
  }

  // Invoke the functor for work item w, passing the policy's work tag.
  template <class TagType>
  __device__ inline
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_one(const std::int32_t w) const noexcept {
    const TagType t{};
    m_functor(t, w);
  }

 public:
  __device__ inline void operator()() const noexcept {
    // Spin until COMPLETED_TOKEN.
    // END_TOKEN indicates no work is currently available.
    for (std::int32_t w = Policy::END_TOKEN;
         Policy::COMPLETED_TOKEN != (w = m_policy.pop_work());) {
      if (Policy::END_TOKEN != w) {
        exec_one<typename Policy::work_tag>(w);
        m_policy.completed_work(w);
      }
    }
  }

  inline void execute() {
    // Launch one block per multiprocessor; block dims are
    // 1 x WarpSize x warps_per_block.
    const int warps_per_block = 4;
    const dim3 grid(
        Kokkos::Experimental::Impl::hip_internal_multiprocessor_count(), 1, 1);
    const dim3 block(1, Kokkos::Experimental::Impl::HIPTraits::WarpSize,
                     warps_per_block);
    const int shared = 0;  // no dynamic shared memory requested

    Kokkos::Experimental::Impl::HIPParallelLaunch<Self>(
        *this, grid, block, shared,
        Experimental::HIP().impl_internal_space_instance(), false);
  }

  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_policy(arg_policy), m_functor(arg_functor) {}
};

}  // namespace Impl
}  // namespace Kokkos

#endif /* #define KOKKOS_HIP_WORKGRAPHPOLICY_HPP */
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#ifdef KOKKOS_ENABLE_HPX +#include <Kokkos_HPX.hpp> + +#include <hpx/util/yield_while.hpp> + +namespace Kokkos { +namespace Experimental { + +bool HPX::m_hpx_initialized = false; +std::atomic<uint32_t> HPX::m_next_instance_id{1}; +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) +std::atomic<uint32_t> HPX::m_active_parallel_region_count{0}; +HPX::instance_data HPX::m_global_instance_data; +#else +Kokkos::Impl::thread_buffer HPX::m_global_buffer; +#endif + +int HPX::concurrency() { + hpx::runtime *rt = hpx::get_runtime_ptr(); + if (rt == nullptr) { + return hpx::threads::hardware_concurrency(); + } else { + if (hpx::threads::get_self_ptr() == nullptr) { + return hpx::resource::get_thread_pool(0).get_os_thread_count(); + } else { + return hpx::this_thread::get_pool()->get_os_thread_count(); + } + } +} + +void HPX::impl_initialize(int thread_count) { + hpx::runtime *rt = hpx::get_runtime_ptr(); + if (rt == nullptr) { + std::vector<std::string> config = { + "hpx.os_threads=" + std::to_string(thread_count), +#ifdef KOKKOS_ENABLE_DEBUG + "--hpx:attach-debugger=exception", +#endif + }; + int argc_hpx = 1; + char name[] = "kokkos_hpx"; + char *argv_hpx[] = {name, nullptr}; + hpx::start(nullptr, argc_hpx, argv_hpx, config); + +#if HPX_VERSION_FULL < 0x010400 + // This has been fixed in HPX 1.4.0. + // + // NOTE: Wait for runtime to start. hpx::start returns as soon as + // possible, meaning some operations are not allowed immediately + // after hpx::start. Notably, hpx::stop needs state_running. This + // needs to be fixed in HPX itself. + + // Get runtime pointer again after it has been started. 
+ rt = hpx::get_runtime_ptr(); + hpx::util::yield_while( + [rt]() { return rt->get_state() < hpx::state_running; }); +#endif + + m_hpx_initialized = true; + } +} + +void HPX::impl_initialize() { + hpx::runtime *rt = hpx::get_runtime_ptr(); + if (rt == nullptr) { + std::vector<std::string> config = { +#ifdef KOKKOS_ENABLE_DEBUG + "--hpx:attach-debugger=exception", +#endif + }; + int argc_hpx = 1; + char name[] = "kokkos_hpx"; + char *argv_hpx[] = {name, nullptr}; + hpx::start(nullptr, argc_hpx, argv_hpx, config); + + // NOTE: Wait for runtime to start. hpx::start returns as soon as + // possible, meaning some operations are not allowed immediately + // after hpx::start. Notably, hpx::stop needs state_running. This + // needs to be fixed in HPX itself. + + // Get runtime pointer again after it has been started. + rt = hpx::get_runtime_ptr(); + hpx::util::yield_while( + [rt]() { return rt->get_state() < hpx::state_running; }); + + m_hpx_initialized = true; + } +} + +bool HPX::impl_is_initialized() noexcept { + hpx::runtime *rt = hpx::get_runtime_ptr(); + return rt != nullptr; +} + +void HPX::impl_finalize() { + if (m_hpx_initialized) { + hpx::runtime *rt = hpx::get_runtime_ptr(); + if (rt != nullptr) { + hpx::apply([]() { hpx::finalize(); }); + hpx::stop(); + } else { + Kokkos::abort( + "Kokkos::Experimental::HPX::impl_finalize: Kokkos started " + "HPX but something else already stopped HPX\n"); + } + } +} + +} // namespace Experimental + +namespace Impl { + +int g_hpx_space_factory_initialized = + initialize_space_factory<HPXSpaceInitializer>("060_HPX"); + +void HPXSpaceInitializer::initialize(const InitArguments &args) { + const int num_threads = args.num_threads; + + if (std::is_same<Kokkos::Experimental::HPX, + Kokkos::DefaultExecutionSpace>::value || + std::is_same<Kokkos::Experimental::HPX, + Kokkos::HostSpace::execution_space>::value) { + if (num_threads > 0) { + Kokkos::Experimental::HPX::impl_initialize(num_threads); + } else { + 
Kokkos::Experimental::HPX::impl_initialize(); + } + // std::cout << "Kokkos::initialize() fyi: HPX enabled and initialized" << + // std::endl ; + } else { + // std::cout << "Kokkos::initialize() fyi: HPX enabled but not initialized" + // << std::endl ; + } +} + +void HPXSpaceInitializer::finalize(const bool all_spaces) { + if (std::is_same<Kokkos::Experimental::HPX, + Kokkos::DefaultExecutionSpace>::value || + std::is_same<Kokkos::Experimental::HPX, + Kokkos::HostSpace::execution_space>::value || + all_spaces) { + if (Kokkos::Experimental::HPX::impl_is_initialized()) + Kokkos::Experimental::HPX::impl_finalize(); + } +} + +void HPXSpaceInitializer::fence() { Kokkos::Experimental::HPX().fence(); } + +void HPXSpaceInitializer::print_configuration(std::ostream &msg, + const bool detail) { + msg << "HPX Execution Space:" << std::endl; + msg << " KOKKOS_ENABLE_HPX: "; + msg << "yes" << std::endl; + + msg << "\nHPX Runtime Configuration:" << std::endl; + Kokkos::Experimental::HPX::print_configuration(msg, detail); +} + +} // namespace Impl +} // namespace Kokkos + +#else +void KOKKOS_CORE_SRC_IMPL_HPX_PREVENT_LINK_ERROR() {} +#endif //#ifdef KOKKOS_ENABLE_HPX diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_ChunkedRoundRobinExecutor.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_ChunkedRoundRobinExecutor.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b364b4a6eb2f93a424bd8b4904d743456174425f --- /dev/null +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_ChunkedRoundRobinExecutor.hpp @@ -0,0 +1,208 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HPX_CHUNKEDROUNDROBINEXECUTOR_HPP +#define KOKKOS_HPX_CHUNKEDROUNDROBINEXECUTOR_HPP + +#include <hpx/config.hpp> +#include <hpx/async_launch_policy_dispatch.hpp> +#include <hpx/lcos/local/latch.hpp> +#include <hpx/parallel/executors/execution.hpp> +#include <hpx/parallel/executors/post_policy_dispatch.hpp> +#include <hpx/runtime/get_os_thread_count.hpp> +#include <hpx/runtime/threads/thread_helpers.hpp> +#include <hpx/traits/is_executor.hpp> +#include <hpx/traits/is_launch_policy.hpp> +#include <hpx/util/deferred_call.hpp> + +#include <cstddef> +#include <type_traits> +#include <utility> +#include <vector> + +namespace Kokkos { +namespace Impl { + +/////////////////////////////////////////////////////////////////////////// +/// A \a ChunkedRoundRobinExecutor creates groups of parallel execution +/// agents which execute in threads implicitly created by the executor. This +/// executor uses the scheduling hint to spawn threads with the first grouped on +/// the first core, the second group getting the next consecutive threads, etc. 
+/// For example, if 10 tasks are spawned (num_tasks is set to 10) and num_cores +/// is set to 2 the executor will schedule the tasks in the following order: +/// +/// worker thread | 1 | 2 +/// --------------+---+--- +/// tasks | 1 | 6 +/// | 2 | 7 +/// | 3 | 8 +/// | 4 | 9 +/// | 5 | 10 +/// +/// rather than the typical round robin: +/// +/// worker thread | 1 | 2 +/// --------------+---+--- +/// tasks | 1 | 2 +/// | 3 | 4 +/// | 5 | 6 +/// | 7 | 8 +/// | 9 | 10 +struct ChunkedRoundRobinExecutor { + using execution_category = hpx::parallel::execution::parallel_execution_tag; + + HPX_CONSTEXPR explicit ChunkedRoundRobinExecutor( + std::size_t num_tasks = std::size_t(-1), std::size_t core_offset = 0, + std::size_t num_cores = hpx::get_os_thread_count()) + : num_tasks_(num_tasks), + core_offset_(core_offset), + num_cores_(num_cores), + num_tasks_per_core_(double(num_tasks_) / num_cores_), + num_tasks_spawned_(0) {} + + bool operator==(ChunkedRoundRobinExecutor const &rhs) const noexcept { + return num_cores_ == rhs.num_cores_ && num_tasks_ == rhs.num_tasks_; + } + + bool operator!=(ChunkedRoundRobinExecutor const &rhs) const noexcept { + return !(*this == rhs); + } + + ChunkedRoundRobinExecutor const &context() const noexcept { return *this; } + + template <typename F, typename... Ts> + hpx::future< + typename hpx::util::detail::invoke_deferred_result<F, Ts...>::type> + async_execute(F &&f, Ts &&... ts) const { + return hpx::detail::async_launch_policy_dispatch<hpx::launch>::call( + hpx::launch::async_policy{}, std::forward<F>(f), + std::forward<Ts>(ts)...); + } + + template <typename F, typename... Ts> + void post(F &&f, Ts &&... 
ts) const { + hpx::util::thread_description const desc( + f, "Kokkos::Impl::ChunkedRoundRobinExecutor::async_execute"); + hpx::threads::thread_schedule_hint const hint( + hpx::threads::thread_schedule_hint_mode_thread, + core_offset_ + std::floor(double(num_tasks_spawned_ % num_tasks_) / + num_tasks_per_core_)); + + hpx::threads::register_thread_nullary( + hpx::util::deferred_call(std::forward<F>(f), std::forward<Ts>(ts)...), + desc, hpx::threads::pending, false, + hpx::threads::thread_priority_normal, hint, + hpx::threads::thread_stacksize_default); + + ++num_tasks_spawned_; + } + + template <typename F, typename Shape, typename... Ts> + std::vector<hpx::future<typename hpx::parallel::execution::detail:: + bulk_function_result<F, Shape, Ts...>::type>> + bulk_async_execute(F &&f, Shape const &shape, Ts &&... ts) { + hpx::util::thread_description desc( + f, "Kokkos::Impl::ChunkedRoundRobinExecutor::bulk_sync_execute"); + + hpx::lcos::local::latch l(hpx::util::size(shape)); + // Keep a separate counter for bulk launch + std::size_t num_tasks_spawned = 0; + + for (auto const &s : shape) { + hpx::threads::thread_schedule_hint const hint( + hpx::threads::thread_schedule_hint_mode_thread, + core_offset_ + std::floor(double(num_tasks_spawned % num_tasks_) / + num_tasks_per_core_)); + + hpx::threads::register_thread_nullary( + [&, s]() { + hpx::util::invoke(f, s, ts...); + l.count_down(1); + }, + desc, hpx::threads::pending, false, + hpx::threads::thread_priority_normal, hint, + hpx::threads::thread_stacksize_default); + + ++num_tasks_spawned; + } + + // NOTE: We block here to avoid extra synchronization. Since this executor + // is only used in the HPX backend we get away with this. 
+ l.wait(); + + return {}; + } + + private: + std::size_t num_tasks_; + std::size_t core_offset_; + std::size_t num_cores_; + double num_tasks_per_core_; + mutable std::size_t num_tasks_spawned_; +}; + +} // namespace Impl +} // namespace Kokkos + +namespace hpx { +namespace parallel { +namespace execution { + +template <> +struct is_one_way_executor<Kokkos::Impl::ChunkedRoundRobinExecutor> + : std::true_type {}; + +template <> +struct is_two_way_executor<Kokkos::Impl::ChunkedRoundRobinExecutor> + : std::true_type {}; + +template <> +struct is_bulk_two_way_executor<Kokkos::Impl::ChunkedRoundRobinExecutor> + : std::true_type {}; + +} // namespace execution +} // namespace parallel +} // namespace hpx + +#endif diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.cpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8d42589bdf37b0b5557aed6631de851276a5f5c4 --- /dev/null +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.cpp @@ -0,0 +1,66 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)

#include <Kokkos_Core.hpp>

#include <impl/Kokkos_TaskQueue_impl.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// Explicit instantiation of the task queue for the HPX backend so its
// definition is emitted in this translation unit (see the matching
// extern template declaration in the HPX task header).
template class TaskQueue<Kokkos::Experimental::HPX,
                         Kokkos::Experimental::HPX::memory_space>;

}  // namespace Impl
}  // namespace Kokkos

#else
// Avoid an empty translation unit when the HPX task backend is disabled.
void KOKKOS_CORE_SRC_IMPL_HPX_TASK_PREVENT_LINK_ERROR() {}
#endif  // #if defined( KOKKOS_ENABLE_HPX ) && defined( KOKKOS_ENABLE_TASKDAG )
0000000000000000000000000000000000000000..df09e026fd9b45bc1c4f7d0c55e5ae10d336ad72 --- /dev/null +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp @@ -0,0 +1,313 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef KOKKOS_HPX_TASK_HPP
#define KOKKOS_HPX_TASK_HPP

#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_TASKDAG)

#include <Kokkos_TaskScheduler_fwd.hpp>

#include <HPX/Kokkos_HPX_ChunkedRoundRobinExecutor.hpp>
#include <Kokkos_HPX.hpp>

#include <hpx/apply.hpp>
#include <hpx/lcos/local/latch.hpp>

#include <type_traits>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// Executes a SimpleTaskScheduler task DAG on the HPX backend: one worker is
// spawned per HPX worker thread (placed via ChunkedRoundRobinExecutor) and
// each worker pops and runs ready tasks until the queue reports completion.
template <class QueueType>
class TaskQueueSpecialization<
    SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>> {
 public:
  using execution_space = Kokkos::Experimental::HPX;
  using scheduler_type =
      SimpleTaskScheduler<Kokkos::Experimental::HPX, QueueType>;
  using member_type =
      TaskTeamMemberAdapter<Kokkos::Impl::HPXTeamMember, scheduler_type>;
  using memory_space = Kokkos::HostSpace;

  // Entry point: runs the whole task graph to completion and fences.
  static void execute(scheduler_type const &scheduler) {
    // NOTE: We create an instance so that we can use dispatch_execute_task.
    // This is not necessarily the most efficient, but can be improved later.
    TaskQueueSpecialization<scheduler_type> task_queue;
    task_queue.scheduler = &scheduler;
    Kokkos::Impl::dispatch_execute_task(&task_queue,
                                        Kokkos::Experimental::HPX());
    Kokkos::Experimental::HPX().fence();
  }

  // Must provide task queue execution function
  void execute_task() const {
    using hpx::apply;
    using hpx::lcos::local::latch;
    using task_base_type = typename scheduler_type::task_base_type;

    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();

    // Per-thread scratch used to construct the (team size 1) team members.
    thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer();
    buffer.resize(num_worker_threads, 512);

    auto &queue = scheduler->queue();

    // Counted down once per worker; wait() below blocks until all are done.
    latch num_tasks_remaining(num_worker_threads);
    ChunkedRoundRobinExecutor exec(num_worker_threads);

    for (int thread = 0; thread < num_worker_threads; ++thread) {
      apply(exec, [this, &num_tasks_remaining, &queue, &buffer,
                   num_worker_threads]() {
        // NOTE: This implementation has been simplified based on the
        // assumption that team_size = 1. The HPX backend currently only
        // supports a team size of 1.
        std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();

        // NOTE(review): return value unused -- looks redundant with the
        // buffer.get(t) call below; confirm before removing.
        buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
        HPXTeamMember member(
            TeamPolicyInternal<Kokkos::Experimental::HPX>(
                Kokkos::Experimental::HPX(), num_worker_threads, 1),
            0, t, buffer.get(t), 512);

        member_type single_exec(*scheduler, member);
        member_type &team_exec = single_exec;

        auto &team_scheduler = team_exec.scheduler();
        auto current_task = OptionalRef<task_base_type>(nullptr);

        // Pop and run ready tasks until the whole queue reports done.
        while (!queue.is_done()) {
          current_task =
              queue.pop_ready_task(team_scheduler.team_scheduler_info());

          if (current_task) {
            KOKKOS_ASSERT(current_task->is_single_runnable() ||
                          current_task->is_team_runnable());
            current_task->as_runnable_task().run(single_exec);
            queue.complete((*std::move(current_task)).as_runnable_task(),
                           team_scheduler.team_scheduler_info());
          }
        }

        num_tasks_remaining.count_down(1);
      });
    }

    num_tasks_remaining.wait();

#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
    Kokkos::Experimental::HPX::impl_decrement_active_parallel_region_count();
#endif
  }

  static uint32_t get_max_team_count(execution_space const &espace) {
    return static_cast<uint32_t>(espace.concurrency());
  }

  // Fetches the apply/destroy function pointers for the given task type.
  template <typename TaskType>
  static void get_function_pointer(typename TaskType::function_type &ptr,
                                   typename TaskType::destroy_type &dtor) {
    ptr = TaskType::apply;
    dtor = TaskType::destroy;
  }

 private:
  const scheduler_type *scheduler;
};

// Task queue execution for the constrained (non-simple) schedulers on the
// HPX backend; workers drain their own per-team queues and steal from
// siblings when their own queue is empty.
template <class Scheduler>
class TaskQueueSpecializationConstrained<
    Scheduler, typename std::enable_if<
                   std::is_same<typename Scheduler::execution_space,
                                Kokkos::Experimental::HPX>::value>::type> {
 public:
  using execution_space = Kokkos::Experimental::HPX;
  using scheduler_type = Scheduler;
  using member_type =
      TaskTeamMemberAdapter<Kokkos::Impl::HPXTeamMember, scheduler_type>;
  using memory_space = Kokkos::HostSpace;

  // Serial fallback: when only one worker thread exists, drain the queue on
  // the calling thread instead of spawning HPX tasks.
  static void iff_single_thread_recursive_execute(
      scheduler_type const &scheduler) {
    using task_base_type = typename scheduler_type::task_base;
    using queue_type = typename scheduler_type::queue_type;

    if (1 == Kokkos::Experimental::HPX::concurrency()) {
      task_base_type *const end = (task_base_type *)task_base_type::EndTag;
      task_base_type *task = end;

      HPXTeamMember member(TeamPolicyInternal<Kokkos::Experimental::HPX>(
                               Kokkos::Experimental::HPX(), 1, 1),
                           0, 0, nullptr, 0);
      member_type single_exec(scheduler, member);

      do {
        task = end;

        // Loop by priority and then type
        for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
          for (int j = 0; j < 2 && end == task; ++j) {
            task =
                queue_type::pop_ready_task(&scheduler.m_queue->m_ready[i][j]);
          }
        }

        if (end == task) break;

        (*task->m_apply)(task, &single_exec);

        scheduler.m_queue->complete(task);

      } while (true);
    }
  }

  // Entry point: runs the whole task graph to completion and fences.
  static void execute(scheduler_type const &scheduler) {
    // NOTE: We create an instance so that we can use dispatch_execute_task.
    // This is not necessarily the most efficient, but can be improved later.
    TaskQueueSpecializationConstrained<scheduler_type> task_queue;
    task_queue.scheduler = &scheduler;
    Kokkos::Impl::dispatch_execute_task(&task_queue,
                                        Kokkos::Experimental::HPX());
    Kokkos::Experimental::HPX().fence();
  }

  // Must provide task queue execution function
  void execute_task() const {
    using hpx::apply;
    using hpx::lcos::local::latch;
    using task_base_type = typename scheduler_type::task_base;
    using queue_type = typename scheduler_type::queue_type;

    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
    // EndTag marks "queue scanned, nothing ready"; the null sentinel marks
    // "no more tasks anywhere" and terminates the worker loop.
    static task_base_type *const end = (task_base_type *)task_base_type::EndTag;
    constexpr task_base_type *no_more_tasks_sentinel = nullptr;

    thread_buffer &buffer = Kokkos::Experimental::HPX().impl_get_buffer();
    buffer.resize(num_worker_threads, 512);

    auto &queue = scheduler->queue();
    queue.initialize_team_queues(num_worker_threads);

    latch num_tasks_remaining(num_worker_threads);
    ChunkedRoundRobinExecutor exec(num_worker_threads);

    for (int thread = 0; thread < num_worker_threads; ++thread) {
      apply(exec, [this, &num_tasks_remaining, &buffer, num_worker_threads]() {
        // NOTE: This implementation has been simplified based on the assumption
        // that team_size = 1. The HPX backend currently only supports a team
        // size of 1.
        std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();

        // NOTE(review): return value unused -- looks redundant with the
        // buffer.get(t) call below; confirm before removing.
        buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id());
        HPXTeamMember member(
            TeamPolicyInternal<Kokkos::Experimental::HPX>(
                Kokkos::Experimental::HPX(), num_worker_threads, 1),
            0, t, buffer.get(t), 512);

        member_type single_exec(*scheduler, member);
        member_type &team_exec = single_exec;

        auto &team_queue = team_exec.scheduler().queue();
        task_base_type *task = no_more_tasks_sentinel;

        do {
          // Complete the previously executed task before fetching a new one.
          if (task != no_more_tasks_sentinel && task != end) {
            team_queue.complete(task);
          }

          // Volatile read: m_ready_count is mutated concurrently by other
          // workers completing tasks.
          if (*((volatile int *)&team_queue.m_ready_count) > 0) {
            task = end;
            // Scan own queues by priority and then type.
            for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
              for (int j = 0; j < 2 && end == task; ++j) {
                task = queue_type::pop_ready_task(&team_queue.m_ready[i][j]);
              }
            }
          } else {
            // Own queue empty: try to steal work from a sibling queue.
            task = team_queue.attempt_to_steal_task();
          }

          if (task != no_more_tasks_sentinel && task != end) {
            (*task->m_apply)(task, &single_exec);
          }
        } while (task != no_more_tasks_sentinel);

        num_tasks_remaining.count_down(1);
      });
    }

    num_tasks_remaining.wait();

#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
    Kokkos::Experimental::HPX::impl_decrement_active_parallel_region_count();
#endif
  }

  // Fetches the apply/destroy function pointers for the given task type.
  template <typename TaskType>
  static void get_function_pointer(typename TaskType::function_type &ptr,
                                   typename TaskType::destroy_type &dtor) {
    ptr = TaskType::apply;
    dtor = TaskType::destroy;
  }

 private:
  const scheduler_type *scheduler;
};

// Instantiated explicitly in Kokkos_HPX_Task.cpp.
extern template class TaskQueue<
    Kokkos::Experimental::HPX,
    typename Kokkos::Experimental::HPX::memory_space>;

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_HPX_TASK_HPP */
a/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..527fe12ad937f9b89029f12d5c64044f40671572 --- /dev/null +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp @@ -0,0 +1,120 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HPX_WORKGRAPHPOLICY_HPP +#define KOKKOS_HPX_WORKGRAPHPOLICY_HPP + +#include <Kokkos_HPX.hpp> + +#include <hpx/apply.hpp> +#include <hpx/lcos/local/latch.hpp> + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... Traits> +class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>, + Kokkos::Experimental::HPX> { + private: + using Policy = Kokkos::WorkGraphPolicy<Traits...>; + using WorkTag = typename Policy::work_tag; + + Policy m_policy; + FunctorType m_functor; + + template <class TagType> + typename std::enable_if<std::is_same<TagType, void>::value>::type + execute_functor(const std::int32_t w) const noexcept { + m_functor(w); + } + + template <class TagType> + typename std::enable_if<!std::is_same<TagType, void>::value>::type + execute_functor(const std::int32_t w) const noexcept { + const TagType t{}; + m_functor(t, w); + } + + public: + void execute() const { + dispatch_execute_task(this, m_policy.space()); + m_policy.space().fence(); + } + + void execute_task() const { + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + + using hpx::apply; + using hpx::lcos::local::latch; + + latch num_tasks_remaining(num_worker_threads); + ChunkedRoundRobinExecutor exec(num_worker_threads); + + for (int thread = 0; 
thread < num_worker_threads; ++thread) { + apply(exec, [this, &num_tasks_remaining]() { + std::int32_t w = m_policy.pop_work(); + while (w != Policy::COMPLETED_TOKEN) { + if (w != Policy::END_TOKEN) { + execute_functor<WorkTag>(w); + m_policy.completed_work(w); + } + + w = m_policy.pop_work(); + } + + num_tasks_remaining.count_down(1); + }); + } + + num_tasks_remaining.wait(); + } + + inline ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) + : m_policy(arg_policy), m_functor(arg_functor) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_HPX_WORKGRAPHPOLICY_HPP */ diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b7d8e62f696073bfa4794b362401aaca288de021 --- /dev/null +++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -0,0 +1,409 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP +#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP + +#include <initializer_list> + +#include <Kokkos_Layout.hpp> +#include <Kokkos_Array.hpp> +#include <impl/KokkosExp_Host_IterateTile.hpp> +#include <Kokkos_ExecPolicy.hpp> +#include <type_traits> + +namespace Kokkos { + +// ------------------------------------------------------------------ // +// Moved to Kokkos_Layout.hpp for more general accessibility +/* +enum class Iterate +{ + Default, // Default for the device + Left, // Left indices stride fastest + Right, // Right indices stride fastest +}; +*/ + +template <typename ExecSpace> +struct default_outer_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Right; +}; + +template <typename ExecSpace> +struct default_inner_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Right; +}; + +// Iteration Pattern +template <unsigned N, Iterate OuterDir = Iterate::Default, + Iterate InnerDir = Iterate::Default> +struct Rank { + static_assert(N != 0u, "Kokkos Error: rank 
0 undefined"); + static_assert(N != 1u, + "Kokkos Error: rank 1 is not a multi-dimensional range"); + static_assert(N < 7u, "Kokkos Error: Unsupported rank..."); + + using iteration_pattern = Rank<N, OuterDir, InnerDir>; + + static constexpr int rank = N; + static constexpr Iterate outer_direction = OuterDir; + static constexpr Iterate inner_direction = InnerDir; +}; + +namespace Impl { +// NOTE the comparison below is encapsulated to silent warnings about pointless +// comparison of unsigned integer with zero +template <class T> +constexpr std::enable_if_t<!std::is_signed<T>::value, bool> +is_less_than_value_initialized_variable(T) { + return false; +} + +template <class T> +constexpr std::enable_if_t<std::is_signed<T>::value, bool> +is_less_than_value_initialized_variable(T arg) { + return arg < T{}; +} + +// Checked narrowing conversion that calls abort if the cast changes the value +template <class To, class From> +constexpr To checked_narrow_cast(From arg) { + constexpr const bool is_different_signedness = + (std::is_signed<To>::value != std::is_signed<From>::value); + auto const ret = static_cast<To>(arg); + if (static_cast<From>(ret) != arg || + (is_different_signedness && + is_less_than_value_initialized_variable(arg) != + is_less_than_value_initialized_variable(ret))) { + Kokkos::abort("unsafe narrowing conversion"); + } + return ret; +} +// NOTE prefer C array U[M] to std::initalizer_list<U> so that the number of +// elements can be deduced (https://stackoverflow.com/q/40241370) +// NOTE for some unfortunate reason the policy bounds are stored as signed +// integer arrays (point_type which is Kokkos::Array<std::int64_t>) so we +// specify the index type (actual policy index_type from the traits) and check +// ahead of time that narrowing conversions will be safe. 
+template <class IndexType, class Array, class U, std::size_t M> +constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { + using T = typename Array::value_type; + Array a{}; + constexpr std::size_t N = a.size(); + static_assert(M <= N, ""); + auto* ptr = a.data(); + // NOTE equivalent to + // std::transform(std::begin(init), std::end(init), a.data(), + // [](U x) { return static_cast<T>(x); }); + // except that std::transform is not constexpr. + for (auto x : init) { + *ptr++ = checked_narrow_cast<T>(x); + (void)checked_narrow_cast<IndexType>(x); // see note above + } + return a; +} + +// NOTE Making a copy even when std::is_same<Array, Kokkos::Array<U, M>>::value +// is true to reduce code complexity. You may change this if you have a good +// reason to. Intentionally not enabling std::array at this time but this may +// change too. +template <class IndexType, class NVCC_WONT_LET_ME_CALL_YOU_Array, class U, + std::size_t M> +constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( + Kokkos::Array<U, M> const& other) { + using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; + NVCC_WONT_LET_ME_CALL_YOU_Array a{}; + constexpr std::size_t N = a.size(); + static_assert(M <= N, ""); + for (std::size_t i = 0; i < M; ++i) { + a[i] = checked_narrow_cast<T>(other[i]); + (void)checked_narrow_cast<IndexType>(other[i]); // see note above + } + return a; +} + +struct TileSizeProperties { + int max_threads; + int default_largest_tile_size; + int default_tile_size; + int max_total_tile_size; +}; + +template <typename ExecutionSpace> +TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { + // Host settings + TileSizeProperties properties; + properties.max_threads = std::numeric_limits<int>::max(); + properties.default_largest_tile_size = 0; + properties.default_tile_size = 2; + properties.max_total_tile_size = std::numeric_limits<int>::max(); + return properties; +} + +} // namespace Impl + +// multi-dimensional iteration 
pattern +template <typename... Properties> +struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> { + using traits = Kokkos::Impl::PolicyTraits<Properties...>; + using range_policy = RangePolicy<Properties...>; + + typename traits::execution_space m_space; + + using impl_range_policy = + RangePolicy<typename traits::execution_space, + typename traits::schedule_type, typename traits::index_type>; + + using execution_policy = + MDRangePolicy<Properties...>; // needed for is_execution_space + // interrogation + + template <class... OtherProperties> + friend struct MDRangePolicy; + + static_assert(!std::is_same<typename traits::iteration_pattern, void>::value, + "Kokkos Error: MD iteration pattern not defined"); + + using iteration_pattern = typename traits::iteration_pattern; + using work_tag = typename traits::work_tag; + using launch_bounds = typename traits::launch_bounds; + using member_type = typename range_policy::member_type; + + static constexpr int rank = iteration_pattern::rank; + + using index_type = typename traits::index_type; + using array_index_type = std::int64_t; + using point_type = Kokkos::Array<array_index_type, rank>; // was index_type + using tile_type = Kokkos::Array<array_index_type, rank>; + // If point_type or tile_type is not templated on a signed integral type (if + // it is unsigned), then if user passes in intializer_list of + // runtime-determined values of signed integral type that are not const will + // receive a compiler error due to an invalid case for implicit conversion - + // "conversion from integer or unscoped enumeration type to integer type that + // cannot represent all values of the original, except where source is a + // constant expression whose value can be stored exactly in the target type" + // This would require the user to either pass a matching index_type parameter + // as template parameter to the MDRangePolicy or static_cast the individual + // values + + point_type m_lower = {}; + point_type 
m_upper = {}; + tile_type m_tile = {}; + point_type m_tile_end = {}; + index_type m_num_tiles = 1; + index_type m_prod_tile_dims = 1; + bool m_tune_tile_size = false; + + static constexpr auto outer_direction = + (iteration_pattern::outer_direction != Iterate::Default) + ? iteration_pattern::outer_direction + : default_outer_direction<typename traits::execution_space>::value; + + static constexpr auto inner_direction = + iteration_pattern::inner_direction != Iterate::Default + ? iteration_pattern::inner_direction + : default_inner_direction<typename traits::execution_space>::value; + + static constexpr auto Right = Iterate::Right; + static constexpr auto Left = Iterate::Left; + + KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const { + return m_space; + } + + MDRangePolicy() = default; + + template <typename LT, std::size_t LN, typename UT, std::size_t UN, + typename TT = array_index_type, std::size_t TN = rank, + typename = std::enable_if_t<std::is_integral<LT>::value && + std::is_integral<UT>::value && + std::is_integral<TT>::value>> + MDRangePolicy(const LT (&lower)[LN], const UT (&upper)[UN], + const TT (&tile)[TN] = {}) + : MDRangePolicy( + Impl::to_array_potentially_narrowing<index_type, decltype(m_lower)>( + lower), + Impl::to_array_potentially_narrowing<index_type, decltype(m_upper)>( + upper), + Impl::to_array_potentially_narrowing<index_type, decltype(m_tile)>( + tile)) { + static_assert( + LN == rank && UN == rank && TN <= rank, + "MDRangePolicy: Constructor initializer lists have wrong size"); + } + + template <typename LT, std::size_t LN, typename UT, std::size_t UN, + typename TT = array_index_type, std::size_t TN = rank, + typename = std::enable_if_t<std::is_integral<LT>::value && + std::is_integral<UT>::value && + std::is_integral<TT>::value>> + MDRangePolicy(const typename traits::execution_space& work_space, + const LT (&lower)[LN], const UT (&upper)[UN], + const TT (&tile)[TN] = {}) + : MDRangePolicy( + work_space, + 
Impl::to_array_potentially_narrowing<index_type, decltype(m_lower)>( + lower), + Impl::to_array_potentially_narrowing<index_type, decltype(m_upper)>( + upper), + Impl::to_array_potentially_narrowing<index_type, decltype(m_tile)>( + tile)) { + static_assert( + LN == rank && UN == rank && TN <= rank, + "MDRangePolicy: Constructor initializer lists have wrong size"); + } + + // NOTE: Keeping these two constructor despite the templated constructors + // from Kokkos arrays for backwards compability to allow construction from + // double-braced initializer lists. + MDRangePolicy(point_type const& lower, point_type const& upper, + tile_type const& tile = tile_type{}) + : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} + + MDRangePolicy(const typename traits::execution_space& work_space, + point_type const& lower, point_type const& upper, + tile_type const& tile = tile_type{}) + : m_space(work_space), m_lower(lower), m_upper(upper), m_tile(tile) { + init_helper(Impl::get_tile_size_properties(work_space)); + } + + template <typename T, std::size_t NT = rank, + typename = std::enable_if_t<std::is_integral<T>::value>> + MDRangePolicy(Kokkos::Array<T, rank> const& lower, + Kokkos::Array<T, rank> const& upper, + Kokkos::Array<T, NT> const& tile = Kokkos::Array<T, NT>{}) + : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} + + template <typename T, std::size_t NT = rank, + typename = std::enable_if_t<std::is_integral<T>::value>> + MDRangePolicy(const typename traits::execution_space& work_space, + Kokkos::Array<T, rank> const& lower, + Kokkos::Array<T, rank> const& upper, + Kokkos::Array<T, NT> const& tile = Kokkos::Array<T, NT>{}) + : MDRangePolicy( + work_space, + Impl::to_array_potentially_narrowing<index_type, decltype(m_lower)>( + lower), + Impl::to_array_potentially_narrowing<index_type, decltype(m_upper)>( + upper), + Impl::to_array_potentially_narrowing<index_type, decltype(m_tile)>( + tile)) {} + + template <class... 
OtherProperties> + MDRangePolicy(const MDRangePolicy<OtherProperties...> p) + : traits(p), // base class may contain data such as desired occupancy + m_space(p.m_space), + m_lower(p.m_lower), + m_upper(p.m_upper), + m_tile(p.m_tile), + m_tile_end(p.m_tile_end), + m_num_tiles(p.m_num_tiles), + m_prod_tile_dims(p.m_prod_tile_dims), + m_tune_tile_size(p.m_tune_tile_size) {} + + void impl_change_tile_size(const point_type& tile) { + m_tile = tile; + init_helper(Impl::get_tile_size_properties(m_space)); + } + bool impl_tune_tile_size() const { return m_tune_tile_size; } + + private: + void init_helper(Impl::TileSizeProperties properties) { + m_prod_tile_dims = 1; + int increment = 1; + int rank_start = 0; + int rank_end = rank; + if (inner_direction == Iterate::Right) { + increment = -1; + rank_start = rank - 1; + rank_end = -1; + } + for (int i = rank_start; i != rank_end; i += increment) { + const index_type length = m_upper[i] - m_lower[i]; + if (m_tile[i] <= 0) { + m_tune_tile_size = true; + if ((inner_direction == Iterate::Right && (i < rank - 1)) || + (inner_direction == Iterate::Left && (i > 0))) { + if (m_prod_tile_dims * properties.default_tile_size < + static_cast<index_type>(properties.max_total_tile_size)) { + m_tile[i] = properties.default_tile_size; + } else { + m_tile[i] = 1; + } + } else { + m_tile[i] = properties.default_largest_tile_size == 0 + ? 
std::max<int>(length, 1) + : properties.default_largest_tile_size; + } + } + m_tile_end[i] = + static_cast<index_type>((length + m_tile[i] - 1) / m_tile[i]); + m_num_tiles *= m_tile_end[i]; + m_prod_tile_dims *= m_tile[i]; + } + if (m_prod_tile_dims > static_cast<index_type>(properties.max_threads)) { + printf(" Product of tile dimensions exceed maximum limit: %d\n", + static_cast<int>(properties.max_threads)); + Kokkos::abort( + "ExecSpace Error: MDRange tile dims exceed maximum number " + "of threads per block - choose smaller tile dims"); + } + } +}; + +} // namespace Kokkos + +// For backward compatibility +namespace Kokkos { +namespace Experimental { +using Kokkos::Iterate; +using Kokkos::MDRangePolicy; +using Kokkos::Rank; +} // namespace Experimental +} // namespace Kokkos + +#endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/packages/kokkos/core/src/Kokkos_AcquireUniqueTokenImpl.hpp b/packages/kokkos/core/src/Kokkos_AcquireUniqueTokenImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d6227b7bcf8c8b91516d169cc90ca5c3cf87539a --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_AcquireUniqueTokenImpl.hpp @@ -0,0 +1,75 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ACQUIRE_UNIQUE_TOKEN_IMPL_HPP +#define KOKKOS_ACQUIRE_UNIQUE_TOKEN_IMPL_HPP + +#include <Kokkos_Core.hpp> +#include <Kokkos_UniqueToken.hpp> +namespace Kokkos { +namespace Experimental { + +template <typename TeamPolicy> +KOKKOS_FUNCTION AcquireTeamUniqueToken<TeamPolicy>::AcquireTeamUniqueToken( + AcquireTeamUniqueToken<TeamPolicy>::token_type t, team_member_type team) + : my_token(t), my_team_acquired_val(team.team_scratch(0)), my_team(team) { + Kokkos::single(Kokkos::PerTeam(my_team), + [&]() { my_team_acquired_val() = my_token.acquire(); }); + my_team.team_barrier(); + + my_acquired_val = my_team_acquired_val(); +} + +template <typename TeamPolicy> +KOKKOS_FUNCTION AcquireTeamUniqueToken<TeamPolicy>::~AcquireTeamUniqueToken() { + my_team.team_barrier(); + Kokkos::single(Kokkos::PerTeam(my_team), + [&]() { my_token.release(my_acquired_val); }); + my_team.team_barrier(); +} + +} // namespace Experimental +} // namespace Kokkos + +#endif // KOKKOS_UNIQUE_TOKEN_HPP diff --git a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fb94049d7ad7ed588b00cc1f9351162de32f08e5 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -0,0 +1,111 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ANONYMOUSSPACE_HPP +#define KOKKOS_ANONYMOUSSPACE_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Concepts.hpp> +#include <cstddef> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +class AnonymousSpace { + public: + //! Tag this class as a kokkos memory space + using memory_space = AnonymousSpace; + using execution_space = Kokkos::DefaultExecutionSpace; + using size_type = size_t; + + //! 
This memory space preferred device_type + using device_type = Kokkos::Device<execution_space, memory_space>; + + /**\brief Default memory space instance */ + AnonymousSpace() = default; + AnonymousSpace(AnonymousSpace &&rhs) = default; + AnonymousSpace(const AnonymousSpace &rhs) = default; + AnonymousSpace &operator=(AnonymousSpace &&) = default; + AnonymousSpace &operator=(const AnonymousSpace &) = default; + ~AnonymousSpace() = default; + + /**\brief Return Name of the MemorySpace */ + static constexpr const char *name() { return "Anonymous"; } +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +template <typename OtherSpace> +struct MemorySpaceAccess<Kokkos::AnonymousSpace, OtherSpace> { + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +template <typename OtherSpace> +struct MemorySpaceAccess<OtherSpace, Kokkos::AnonymousSpace> { + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::AnonymousSpace, Kokkos::AnonymousSpace> { + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +} // namespace Impl + +} // namespace Kokkos + +#endif // #define KOKKOS_ANONYMOUSSPACE_HPP diff --git a/packages/kokkos/core/src/Kokkos_Array.hpp b/packages/kokkos/core/src/Kokkos_Array.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0d1408df1d2486f00a947255fae54497020d2fa6 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Array.hpp @@ -0,0 +1,355 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ARRAY_HPP +#define KOKKOS_ARRAY_HPP + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_Error.hpp> + +#include <type_traits> +#include <algorithm> +#include <limits> +#include <cstddef> +#include <string> + +namespace Kokkos { + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK +namespace Impl { +template <typename Integral, bool Signed = std::is_signed<Integral>::value> +struct ArrayBoundsCheck; + +template <typename Integral> +struct ArrayBoundsCheck<Integral, true> { + KOKKOS_INLINE_FUNCTION + ArrayBoundsCheck(Integral i, size_t N) { + if (i < 0) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + std::string s = "Kokkos::Array: index "; + s += std::to_string(i); + s += " < 0"; + Kokkos::Impl::throw_runtime_exception(s); +#else + Kokkos::abort("Kokkos::Array: negative index in device code"); +#endif + } + ArrayBoundsCheck<Integral, false>(i, N); + } +}; + +template <typename Integral> +struct ArrayBoundsCheck<Integral, false> { + KOKKOS_INLINE_FUNCTION + ArrayBoundsCheck(Integral i, size_t N) { + if (size_t(i) >= N) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + std::string s = "Kokkos::Array: index "; + s += std::to_string(i); + s += " >= "; + s += std::to_string(N); + Kokkos::Impl::throw_runtime_exception(s); +#else + Kokkos::abort("Kokkos::Array: index >= size"); +#endif + } + } +}; +} // end namespace Impl + +#define KOKKOS_ARRAY_BOUNDS_CHECK(i, N) \ + Kokkos::Impl::ArrayBoundsCheck<decltype(i)>(i, N) + +#else // !defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) + +#define KOKKOS_ARRAY_BOUNDS_CHECK(i, N) (void)0 + +#endif // !defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) + +/**\brief Derived from the C++17 'std::array'. + * Dropping the iterator interface. 
+ */ +template <class T = void, size_t N = KOKKOS_INVALID_INDEX, class Proxy = void> +struct Array { + public: + /** + * The elements of this C array shall not be accessed directly. The data + * member has to be declared public to enable aggregate initialization as for + * std::array. We mark it as private in the documentation. + * @private + */ + T m_internal_implementation_private_member_data[N]; + + public: + using reference = T&; + using const_reference = typename std::add_const<T>::type&; + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using pointer = T*; + using const_pointer = typename std::add_const<T>::type*; + + KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return N; } + KOKKOS_INLINE_FUNCTION static constexpr bool empty() { return false; } + KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return N; } + + template <typename iType> + KOKKOS_INLINE_FUNCTION reference operator[](const iType& i) { + static_assert( + (std::is_integral<iType>::value || std::is_enum<iType>::value), + "Must be integral argument"); + KOKKOS_ARRAY_BOUNDS_CHECK(i, N); + return m_internal_implementation_private_member_data[i]; + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION const_reference operator[](const iType& i) const { + static_assert( + (std::is_integral<iType>::value || std::is_enum<iType>::value), + "Must be integral argument"); + KOKKOS_ARRAY_BOUNDS_CHECK(i, N); + return m_internal_implementation_private_member_data[i]; + } + + KOKKOS_INLINE_FUNCTION pointer data() { + return &m_internal_implementation_private_member_data[0]; + } + KOKKOS_INLINE_FUNCTION const_pointer data() const { + return &m_internal_implementation_private_member_data[0]; + } +}; + +template <class T, class Proxy> +struct Array<T, 0, Proxy> { + public: + using reference = T&; + using const_reference = typename std::add_const<T>::type&; + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + 
using pointer = T*; + using const_pointer = typename std::add_const<T>::type*; + + KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return 0; } + KOKKOS_INLINE_FUNCTION static constexpr bool empty() { return true; } + KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return 0; } + + template <typename iType> + KOKKOS_INLINE_FUNCTION reference operator[](const iType&) { + static_assert( + (std::is_integral<iType>::value || std::is_enum<iType>::value), + "Must be integer argument"); + Kokkos::abort("Unreachable code"); + return *reinterpret_cast<pointer>(-1); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION const_reference operator[](const iType&) const { + static_assert( + (std::is_integral<iType>::value || std::is_enum<iType>::value), + "Must be integer argument"); + Kokkos::abort("Unreachable code"); + return *reinterpret_cast<const_pointer>(-1); + } + + KOKKOS_INLINE_FUNCTION pointer data() { return pointer(0); } + KOKKOS_INLINE_FUNCTION const_pointer data() const { return const_pointer(0); } + + KOKKOS_DEFAULTED_FUNCTION ~Array() = default; + KOKKOS_DEFAULTED_FUNCTION Array() = default; + KOKKOS_DEFAULTED_FUNCTION Array(const Array&) = default; + KOKKOS_DEFAULTED_FUNCTION Array& operator=(const Array&) = default; + + // Some supported compilers are not sufficiently C++11 compliant + // for default move constructor and move assignment operator. 
+ // Array( Array && ) = default ; + // Array & operator = ( Array && ) = default ; +}; + +template <> +struct Array<void, KOKKOS_INVALID_INDEX, void> { + struct contiguous {}; + struct strided {}; +}; + +template <class T> +struct Array<T, KOKKOS_INVALID_INDEX, Array<>::contiguous> { + private: + T* m_elem; + size_t m_size; + + public: + using reference = T&; + using const_reference = typename std::add_const<T>::type&; + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using pointer = T*; + using const_pointer = typename std::add_const<T>::type*; + + KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size; } + KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size; } + KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size; } + + template <typename iType> + KOKKOS_INLINE_FUNCTION reference operator[](const iType& i) { + static_assert( + (std::is_integral<iType>::value || std::is_enum<iType>::value), + "Must be integral argument"); + KOKKOS_ARRAY_BOUNDS_CHECK(i, m_size); + return m_elem[i]; + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION const_reference operator[](const iType& i) const { + static_assert( + (std::is_integral<iType>::value || std::is_enum<iType>::value), + "Must be integral argument"); + KOKKOS_ARRAY_BOUNDS_CHECK(i, m_size); + return m_elem[i]; + } + + KOKKOS_INLINE_FUNCTION pointer data() { return m_elem; } + KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem; } + + KOKKOS_DEFAULTED_FUNCTION ~Array() = default; + KOKKOS_INLINE_FUNCTION_DELETED Array() = delete; + KOKKOS_INLINE_FUNCTION_DELETED Array(const Array& rhs) = delete; + + // Some supported compilers are not sufficiently C++11 compliant + // for default move constructor and move assignment operator. 
+ // Array( Array && rhs ) = default ; + // Array & operator = ( Array && rhs ) = delete ; + + KOKKOS_INLINE_FUNCTION + Array& operator=(const Array& rhs) { + const size_t n = std::min(m_size, rhs.size()); + for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i]; + return *this; + } + + template <size_t N, class P> + KOKKOS_INLINE_FUNCTION Array& operator=(const Array<T, N, P>& rhs) { + const size_t n = std::min(m_size, rhs.size()); + for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i]; + return *this; + } + + KOKKOS_INLINE_FUNCTION constexpr Array(pointer arg_ptr, size_type arg_size, + size_type = 0) + : m_elem(arg_ptr), m_size(arg_size) {} +}; + +template <class T> +struct Array<T, KOKKOS_INVALID_INDEX, Array<>::strided> { + private: + T* m_elem; + size_t m_size; + size_t m_stride; + + public: + using reference = T&; + using const_reference = typename std::add_const<T>::type&; + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using pointer = T*; + using const_pointer = typename std::add_const<T>::type*; + + KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size; } + KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size; } + KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size; } + + template <typename iType> + KOKKOS_INLINE_FUNCTION reference operator[](const iType& i) { + static_assert( + (std::is_integral<iType>::value || std::is_enum<iType>::value), + "Must be integral argument"); + KOKKOS_ARRAY_BOUNDS_CHECK(i, m_size); + return m_elem[i * m_stride]; + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION const_reference operator[](const iType& i) const { + static_assert( + (std::is_integral<iType>::value || std::is_enum<iType>::value), + "Must be integral argument"); + KOKKOS_ARRAY_BOUNDS_CHECK(i, m_size); + return m_elem[i * m_stride]; + } + + KOKKOS_INLINE_FUNCTION pointer data() { return m_elem; } + KOKKOS_INLINE_FUNCTION const_pointer data() const { return 
m_elem; } + + KOKKOS_DEFAULTED_FUNCTION ~Array() = default; + KOKKOS_INLINE_FUNCTION_DELETED Array() = delete; + KOKKOS_INLINE_FUNCTION_DELETED Array(const Array&) = delete; + + // Some supported compilers are not sufficiently C++11 compliant + // for default move constructor and move assignment operator. + // Array( Array && rhs ) = default ; + // Array & operator = ( Array && rhs ) = delete ; + + KOKKOS_INLINE_FUNCTION + Array& operator=(const Array& rhs) { + const size_t n = std::min(m_size, rhs.size()); + for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i]; + return *this; + } + + template <size_t N, class P> + KOKKOS_INLINE_FUNCTION Array& operator=(const Array<T, N, P>& rhs) { + const size_t n = std::min(m_size, rhs.size()); + for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i]; + return *this; + } + + KOKKOS_INLINE_FUNCTION constexpr Array(pointer arg_ptr, size_type arg_size, + size_type arg_stride) + : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} +}; + +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_ARRAY_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Atomic.hpp b/packages/kokkos/core/src/Kokkos_Atomic.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8cd60fa6bae993895ac901fbbab8eb532a6a0ded --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Atomic.hpp @@ -0,0 +1,329 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Atomic.hpp +/// \brief Atomic functions +/// +/// This header file defines prototypes for the following atomic functions: +/// - exchange +/// - compare and exchange +/// - add +/// +/// Supported types include: +/// - signed and unsigned 4 and 8 byte integers +/// - float +/// - double +/// +/// They are implemented through GCC compatible intrinsics, OpenMP +/// directives and native CUDA intrinsics. 
+/// +/// Including this header file requires one of the following +/// compilers: +/// - NVCC (for CUDA device code only) +/// - GCC (for host code only) +/// - Intel (for host code only) +/// - A compiler that supports OpenMP 3.1 (for host code only) + +#ifndef KOKKOS_ATOMIC_HPP +#define KOKKOS_ATOMIC_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_HostSpace.hpp> +#include <impl/Kokkos_Traits.hpp> + +//---------------------------------------------------------------------------- + +// Need to fix this for pure clang on windows +#if defined(_WIN32) +#define KOKKOS_ENABLE_WINDOWS_ATOMICS + +#if defined(KOKKOS_ENABLE_CUDA) +#define KOKKOS_ENABLE_CUDA_ATOMICS +#if defined(KOKKOS_COMPILER_CLANG) +#define KOKKOS_ENABLE_GNU_ATOMICS +#endif +#endif + +#else // _WIN32 +#if defined(KOKKOS_ENABLE_CUDA) + +// Compiling NVIDIA device code, must use Cuda atomics: + +#define KOKKOS_ENABLE_CUDA_ATOMICS + +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU) || \ + defined(KOKKOS_IMPL_ENABLE_OVERLOAD_HOST_DEVICE) + +#define KOKKOS_ENABLE_HIP_ATOMICS + +#endif + +#if !defined(KOKKOS_ENABLE_GNU_ATOMICS) && \ + !defined(KOKKOS_ENABLE_INTEL_ATOMICS) && \ + !defined(KOKKOS_ENABLE_OPENMP_ATOMICS) && \ + !defined(KOKKOS_ENABLE_STD_ATOMICS) && \ + !defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + +// Compiling for non-Cuda atomic implementation has not been pre-selected. +// Choose the best implementation for the detected compiler. 
// Preference: GCC, INTEL, OMP31

#if defined(KOKKOS_INTERNAL_NOT_PARALLEL)

#define KOKKOS_ENABLE_SERIAL_ATOMICS

#elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \
    (defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_IBM))

#define KOKKOS_ENABLE_GNU_ATOMICS

#elif defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_CRAYC)

#define KOKKOS_ENABLE_INTEL_ATOMICS

#elif defined(_OPENMP) && (201107 <= _OPENMP)

#define KOKKOS_ENABLE_OPENMP_ATOMICS

#else

#error "KOKKOS_ATOMICS_USE : Unsupported compiler"

#endif

#endif /* Not pre-selected atomic implementation */
#endif

#ifdef KOKKOS_ENABLE_CUDA
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#endif

// Forward declarations only; the definitions live in the impl/ headers
// included further down in this file.
namespace Kokkos {
template <typename T>
KOKKOS_INLINE_FUNCTION void atomic_add(volatile T* const dest, const T src);

// Atomic increment
template <typename T>
KOKKOS_INLINE_FUNCTION void atomic_increment(volatile T* a);

template <typename T>
KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile T* a);
}  // namespace Kokkos

namespace Kokkos {

/// \brief Name of the atomic backend selected by the preprocessor logic above.
///
/// Returns the literal macro name of the chosen KOKKOS_ENABLE_*_ATOMICS
/// implementation; intended for configuration reporting / diagnostics.
inline const char* atomic_query_version() {
#if defined(KOKKOS_ENABLE_CUDA_ATOMICS)
  return "KOKKOS_ENABLE_CUDA_ATOMICS";
#elif defined(KOKKOS_ENABLE_GNU_ATOMICS)
  return "KOKKOS_ENABLE_GNU_ATOMICS";
#elif defined(KOKKOS_ENABLE_INTEL_ATOMICS)
  return "KOKKOS_ENABLE_INTEL_ATOMICS";
#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
  return "KOKKOS_ENABLE_OPENMP_ATOMICS";
#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS)
  return "KOKKOS_ENABLE_WINDOWS_ATOMICS";
#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
  return "KOKKOS_ENABLE_SERIAL_ATOMICS";
#else
// A configuration error at compile time is preferable to returning a
// misleading string at run time.
#error "No valid response for atomic_query_version!"
#endif
}

}  // namespace Kokkos

//----------------------------------------------------------------------------
// Atomic Memory Orders
//
// Implements Strongly-typed analogs of C++ standard memory orders
#include "impl/Kokkos_Atomic_Memory_Order.hpp"

#if defined(KOKKOS_ENABLE_HIP)
#include <HIP/Kokkos_HIP_Atomic.hpp>
#endif

#if defined(KOKKOS_ENABLE_WINDOWS_ATOMICS)
#include "impl/Kokkos_Atomic_Windows.hpp"
#endif
//----------------------------------------------------------------------------
// Atomic Assembly
//
// Implements CAS128-bit in assembly

#include "impl/Kokkos_Atomic_Assembly.hpp"

//----------------------------------------------------------------------------
// Memory fence
//
// All loads and stores from this thread will be globally consistent before
// continuing
//
// void memory_fence() {...};
#include "impl/Kokkos_Memory_Fence.hpp"

//----------------------------------------------------------------------------
// Atomic exchange
//
// template< typename T >
// T atomic_exchange( volatile T* const dest , const T val )
// { T tmp = *dest ; *dest = val ; return tmp ; }

#include "impl/Kokkos_Atomic_Exchange.hpp"

//----------------------------------------------------------------------------
// Atomic compare-and-exchange
//
// template<class T>
// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare,
// const T val) { bool equal = compare == *dest ; if ( equal ) { *dest = val ; }
// return equal ; }

#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"

#include "impl/Kokkos_Atomic_Generic.hpp"

//----------------------------------------------------------------------------
// Atomic fetch and add
//
// template<class T>
// T atomic_fetch_add(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest += val ; return tmp ; }

#include "impl/Kokkos_Atomic_Fetch_Add.hpp"

//----------------------------------------------------------------------------
// 
Atomic increment +// +// template<class T> +// T atomic_increment(volatile T* const dest) +// { dest++; } + +#include "impl/Kokkos_Atomic_Increment.hpp" + +//---------------------------------------------------------------------------- +// Atomic Decrement +// +// template<class T> +// T atomic_decrement(volatile T* const dest) +// { dest--; } + +#include "impl/Kokkos_Atomic_Decrement.hpp" + +//---------------------------------------------------------------------------- +// Atomic fetch and sub +// +// template<class T> +// T atomic_fetch_sub(volatile T* const dest, const T val) +// { T tmp = *dest ; *dest -= val ; return tmp ; } + +#include "impl/Kokkos_Atomic_Fetch_Sub.hpp" + +//---------------------------------------------------------------------------- +// Atomic fetch and or +// +// template<class T> +// T atomic_fetch_or(volatile T* const dest, const T val) +// { T tmp = *dest ; *dest = tmp | val ; return tmp ; } + +#include "impl/Kokkos_Atomic_Fetch_Or.hpp" + +//---------------------------------------------------------------------------- +// Atomic fetch and and +// +// template<class T> +// T atomic_fetch_and(volatile T* const dest, const T val) +// { T tmp = *dest ; *dest = tmp & val ; return tmp ; } + +#include "impl/Kokkos_Atomic_Fetch_And.hpp" + +//---------------------------------------------------------------------------- +// Atomic MinMax +// +// template<class T> +// T atomic_min(volatile T* const dest, const T val) +// { T tmp = *dest ; *dest = min(*dest, val); return tmp ; } +// template<class T> +// T atomic_max(volatile T* const dest, const T val) +// { T tmp = *dest ; *dest = max(*dest, val); return tmp ; } + +#include "impl/Kokkos_Atomic_MinMax.hpp" + +//---------------------------------------------------------------------------- +// Provide volatile_load and safe_load +// +// T volatile_load(T const volatile * const ptr); +// +// T const& safe_load(T const * const ptr); +// XEON PHI +// T safe_load(T const * const ptr + +#include 
"impl/Kokkos_Volatile_Load.hpp" + +//---------------------------------------------------------------------------- +// Provide atomic loads and stores with memory order semantics + +#include "impl/Kokkos_Atomic_Load.hpp" +#include "impl/Kokkos_Atomic_Store.hpp" + +// Generic functions using the above defined functions +#include "impl/Kokkos_Atomic_Generic_Secondary.hpp" +//---------------------------------------------------------------------------- +// This atomic-style macro should be an inlined function, not a macro + +#if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \ + !defined(__CUDA_ARCH__) + +#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr, 0, 0) +#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr, 1, 0) + +#else + +#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0) +#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0) + +#endif + +//---------------------------------------------------------------------------- + +#endif /* KOKKOS_ATOMIC_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6578723fc8e5dab1e605b1a5dc80f1daf4b2ebfb --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Complex.hpp @@ -0,0 +1,1067 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_COMPLEX_HPP +#define KOKKOS_COMPLEX_HPP + +#include <Kokkos_Atomic.hpp> +#include <Kokkos_MathematicalFunctions.hpp> +#include <Kokkos_NumericTraits.hpp> +#include <impl/Kokkos_Error.hpp> +#include <complex> +#include <type_traits> +#include <iosfwd> + +namespace Kokkos { + +/// \class complex +/// \brief Partial reimplementation of std::complex that works as the +/// result of a Kokkos::parallel_reduce. +/// \tparam RealType The type of the real and imaginary parts of the +/// complex number. As with std::complex, this is only defined for +/// \c float, \c double, and <tt>long double</tt>. 
The latter is
/// currently forbidden in CUDA device kernels.
template <class RealType>
class
#ifdef KOKKOS_ENABLE_COMPLEX_ALIGN
    alignas(2 * sizeof(RealType))
#endif
    complex {
 private:
  RealType re_{};  // real part (value-initialized to zero)
  RealType im_{};  // imaginary part (value-initialized to zero)

 public:
  //! The type of the real or imaginary parts of this complex number.
  using value_type = RealType;

  //! Default constructor (initializes both real and imaginary parts to zero).
  KOKKOS_DEFAULTED_FUNCTION
  complex() noexcept = default;

  //! Copy constructor.
  KOKKOS_DEFAULTED_FUNCTION
  complex(const complex&) noexcept = default;

  //! Copy assignment.
  KOKKOS_DEFAULTED_FUNCTION
  complex& operator=(const complex&) noexcept = default;

  /// \brief Conversion constructor from compatible RType
  template <class RType,
            typename std::enable_if<std::is_convertible<RType, RealType>::value,
                                    int>::type = 0>
  KOKKOS_INLINE_FUNCTION complex(const complex<RType>& other) noexcept
      // Intentionally do the conversions implicitly here so that users don't
      // get any warnings about narrowing, etc., that they would expect to get
      // otherwise.
      : re_(other.real()), im_(other.imag()) {}

  /// \brief Conversion constructor from std::complex.
  ///
  /// This constructor cannot be called in a CUDA device function,
  /// because std::complex's methods and nonmember functions are not
  /// marked as CUDA device functions.
  KOKKOS_INLINE_FUNCTION
  complex(const std::complex<RealType>& src) noexcept
      // We can use this aspect of the standard to avoid calling
      // non-device-marked functions `std::real` and `std::imag`: "For any
      // object z of type complex<T>, reinterpret_cast<T(&)[2]>(z)[0] is the
      // real part of z and reinterpret_cast<T(&)[2]>(z)[1] is the imaginary
      // part of z."
 Now we don't have to provide a whole bunch of the overloads
      // of things taking either Kokkos::complex or std::complex
      : re_(reinterpret_cast<const RealType (&)[2]>(src)[0]),
        im_(reinterpret_cast<const RealType (&)[2]>(src)[1]) {}

  /// \brief Conversion operator to std::complex.
  ///
  /// This operator cannot be called in a CUDA device function,
  /// because std::complex's methods and nonmember functions are not
  /// marked as CUDA device functions.
  // TODO: make explicit. DJS 2019-08-28
  operator std::complex<RealType>() const noexcept {
    return std::complex<RealType>(re_, im_);
  }

  /// \brief Constructor that takes just the real part, and sets the
  ///   imaginary part to zero.
  KOKKOS_INLINE_FUNCTION complex(const RealType& val) noexcept
      : re_(val), im_(static_cast<RealType>(0)) {}

  //! Constructor that takes the real and imaginary parts.
  KOKKOS_INLINE_FUNCTION
  complex(const RealType& re, const RealType& im) noexcept : re_(re), im_(im) {}

  //! Assignment operator (from a real number); imaginary part becomes zero.
  KOKKOS_INLINE_FUNCTION complex& operator=(const RealType& val) noexcept {
    re_ = val;
    im_ = RealType(0);
    return *this;
  }

  /// \brief Assignment operator from std::complex.
  ///
  /// This constructor cannot be called in a CUDA device function,
  /// because std::complex's methods and nonmember functions are not
  /// marked as CUDA device functions.
  complex& operator=(const std::complex<RealType>& src) noexcept {
    *this = complex(src);
    return *this;
  }

  //! The imaginary part of this complex number.
  KOKKOS_INLINE_FUNCTION
  KOKKOS_CONSTEXPR_14 RealType& imag() noexcept { return im_; }

  //! The real part of this complex number.
  KOKKOS_INLINE_FUNCTION
  KOKKOS_CONSTEXPR_14 RealType& real() noexcept { return re_; }

  //! The imaginary part of this complex number.
  KOKKOS_INLINE_FUNCTION
  constexpr RealType imag() const noexcept { return im_; }

  //! The real part of this complex number.
  KOKKOS_INLINE_FUNCTION
  constexpr RealType real() const noexcept { return re_; }

  //! Set the imaginary part of this complex number.
  KOKKOS_INLINE_FUNCTION
  KOKKOS_CONSTEXPR_14
  void imag(RealType v) noexcept { im_ = v; }

  //! Set the real part of this complex number.
  KOKKOS_INLINE_FUNCTION
  KOKKOS_CONSTEXPR_14
  void real(RealType v) noexcept { re_ = v; }

  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator+=(
      const complex<RealType>& src) noexcept {
    re_ += src.re_;
    im_ += src.im_;
    return *this;
  }

  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator+=(
      const RealType& src) noexcept {
    re_ += src;
    return *this;
  }

  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator-=(
      const complex<RealType>& src) noexcept {
    re_ -= src.re_;
    im_ -= src.im_;
    return *this;
  }

  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator-=(
      const RealType& src) noexcept {
    re_ -= src;
    return *this;
  }

  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator*=(
      const complex<RealType>& src) noexcept {
    // Compute into temporaries before writing back so the second product
    // still reads the original re_/im_ when src aliases *this (z *= z).
    const RealType realPart = re_ * src.re_ - im_ * src.im_;
    const RealType imagPart = re_ * src.im_ + im_ * src.re_;
    re_ = realPart;
    im_ = imagPart;
    return *this;
  }

  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator*=(
      const RealType& src) noexcept {
    re_ *= src;
    im_ *= src;
    return *this;
  }

  // Conditional noexcept, just in case RType throws on divide-by-zero
  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=(
      const complex<RealType>& y) noexcept(noexcept(RealType{} / RealType{})) {
    using Kokkos::Experimental::fabs;
    // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
    // If the real part is +/-Inf and the imaginary part is -/+Inf,
    // this won't change the result.
    const RealType s = fabs(y.real()) + fabs(y.imag());

    // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
+ // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, + // because y/s is NaN. + // TODO mark this branch unlikely + if (s == RealType(0)) { + this->re_ /= s; + this->im_ /= s; + } else { + const complex x_scaled(this->re_ / s, this->im_ / s); + const complex y_conj_scaled(y.re_ / s, -(y.im_) / s); + const RealType y_scaled_abs = + y_conj_scaled.re_ * y_conj_scaled.re_ + + y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y)) + *this = x_scaled * y_conj_scaled; + *this /= y_scaled_abs; + } + return *this; + } + + KOKKOS_CONSTEXPR_14 + KOKKOS_INLINE_FUNCTION complex& operator/=( + const std::complex<RealType>& y) noexcept(noexcept(RealType{} / + RealType{})) { + using Kokkos::Experimental::fabs; + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. + // If the real part is +/-Inf and the imaginary part is -/+Inf, + // this won't change the result. + const RealType s = fabs(y.real()) + fabs(y.imag()); + + // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. + // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, + // because y/s is NaN. 
+ if (s == RealType(0)) { + this->re_ /= s; + this->im_ /= s; + } else { + const complex x_scaled(this->re_ / s, this->im_ / s); + const complex y_conj_scaled(y.re_ / s, -(y.im_) / s); + const RealType y_scaled_abs = + y_conj_scaled.re_ * y_conj_scaled.re_ + + y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y)) + *this = x_scaled * y_conj_scaled; + *this /= y_scaled_abs; + } + return *this; + } + + KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=( + const RealType& src) noexcept(noexcept(RealType{} / RealType{})) { + re_ /= src; + im_ /= src; + return *this; + } + + //--------------------------------------------------------------------------- + // TODO: refactor Kokkos reductions to remove dependency on + // volatile member overloads since they are being deprecated in c++20 + //--------------------------------------------------------------------------- + + //! Copy constructor from volatile. + template <class RType, + typename std::enable_if<std::is_convertible<RType, RealType>::value, + int>::type = 0> + KOKKOS_INLINE_FUNCTION complex(const volatile complex<RType>& src) noexcept + // Intentionally do the conversions implicitly here so that users don't + // get any warnings about narrowing, etc., that they would expect to get + // otherwise. + : re_(src.re_), im_(src.im_) {} + + /// \brief Assignment operator, for volatile <tt>*this</tt> and + /// nonvolatile input. + /// + /// \param src [in] Input; right-hand side of the assignment. + /// + /// This operator returns \c void instead of <tt>volatile + /// complex& </tt>. See Kokkos Issue #177 for the + /// explanation. In practice, this means that you should not chain + /// assignments with volatile lvalues. 
  //
  // Templated, so as not to be a copy assignment operator (Kokkos issue #2577)
  // Intended to behave as
  //    void operator=(const complex&) volatile noexcept
  //
  // Use cases:
  //    complex r;
  //    const complex cr;
  //    volatile complex vl;
  //    vl = r;
  //    vl = cr;
  template <class Complex,
            typename std::enable_if<std::is_same<Complex, complex>::value,
                                    int>::type = 0>
  KOKKOS_INLINE_FUNCTION void operator=(const Complex& src) volatile noexcept {
    re_ = src.re_;
    im_ = src.im_;
    // We deliberately do not return anything here. See explanation
    // in public documentation above.
  }

  //! Assignment operator, volatile LHS and volatile RHS
  // TODO Should this return void like the other volatile assignment operators?
  //
  // Templated, so as not to be a copy assignment operator (Kokkos issue #2577)
  // Intended to behave as
  //    volatile complex& operator=(const volatile complex&) volatile noexcept
  //
  // Use cases:
  //    volatile complex vr;
  //    const volatile complex cvr;
  //    volatile complex vl;
  //    vl = vr;
  //    vl = cvr;
  template <class Complex,
            typename std::enable_if<std::is_same<Complex, complex>::value,
                                    int>::type = 0>
  KOKKOS_INLINE_FUNCTION volatile complex& operator=(
      const volatile Complex& src) volatile noexcept {
    re_ = src.re_;
    im_ = src.im_;
    return *this;
  }

  //!
 Assignment operator, volatile RHS and non-volatile LHS
  //
  // Templated, so as not to be a copy assignment operator (Kokkos issue #2577)
  // Intended to behave as
  //    complex& operator=(const volatile complex&) noexcept
  //
  // Use cases:
  //    volatile complex vr;
  //    const volatile complex cvr;
  //    complex l;
  //    l = vr;
  //    l = cvr;
  //
  template <class Complex,
            typename std::enable_if<std::is_same<Complex, complex>::value,
                                    int>::type = 0>
  KOKKOS_INLINE_FUNCTION complex& operator=(
      const volatile Complex& src) noexcept {
    re_ = src.re_;
    im_ = src.im_;
    return *this;
  }

  // Mirroring the behavior of the assignment operators from complex RHS in the
  // RealType RHS versions.

  //! Assignment operator (from a volatile real number).
  KOKKOS_INLINE_FUNCTION void operator=(const volatile RealType& val) noexcept {
    re_ = val;
    im_ = RealType(0);
    // We deliberately do not return anything here. See explanation
    // in public documentation above.
  }

  //! Assignment operator volatile LHS and non-volatile RHS
  KOKKOS_INLINE_FUNCTION complex& operator=(
      const RealType& val) volatile noexcept {
    re_ = val;
    im_ = RealType(0);
    return *this;
  }

  //! Assignment operator volatile LHS and volatile RHS
  // TODO Should this return void like the other volatile assignment operators?
  KOKKOS_INLINE_FUNCTION complex& operator=(
      const volatile RealType& val) volatile noexcept {
    re_ = val;
    im_ = RealType(0);
    return *this;
  }

  //! The imaginary part of this complex number (volatile overload).
  KOKKOS_INLINE_FUNCTION
  volatile RealType& imag() volatile noexcept { return im_; }

  //! The real part of this complex number (volatile overload).
  KOKKOS_INLINE_FUNCTION
  volatile RealType& real() volatile noexcept { return re_; }

  //! The imaginary part of this complex number (volatile overload).
  KOKKOS_INLINE_FUNCTION
  RealType imag() const volatile noexcept { return im_; }

  //!
 The real part of this complex number (volatile overload).
  KOKKOS_INLINE_FUNCTION
  RealType real() const volatile noexcept { return re_; }

  KOKKOS_INLINE_FUNCTION void operator+=(
      const volatile complex<RealType>& src) volatile noexcept {
    re_ += src.re_;
    im_ += src.im_;
  }

  //! Add a real number: the imaginary part is unchanged.
  KOKKOS_INLINE_FUNCTION void operator+=(
      const volatile RealType& src) volatile noexcept {
    re_ += src;
  }

  KOKKOS_INLINE_FUNCTION void operator*=(
      const volatile complex<RealType>& src) volatile noexcept {
    // Temporaries guard against src aliasing *this (z *= z).
    const RealType realPart = re_ * src.re_ - im_ * src.im_;
    const RealType imagPart = re_ * src.im_ + im_ * src.re_;

    re_ = realPart;
    im_ = imagPart;
  }

  KOKKOS_INLINE_FUNCTION void operator*=(
      const volatile RealType& src) volatile noexcept {
    re_ *= src;
    im_ *= src;
  }

  // TODO DSH 2019-10-7 why are there no volatile /= and friends?
};

//==============================================================================
// <editor-fold desc="Equality and inequality"> {{{1

// Note that this is not the same behavior as std::complex, which doesn't allow
// implicit conversions, but since this is the way we had it before, we have
// to do it this way now.

//! Binary == operator for complex complex.
template <class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION bool operator==(complex<RealType1> const& x,
                                       complex<RealType2> const& y) noexcept {
  using common_type = typename std::common_type<RealType1, RealType2>::type;
  return common_type(x.real()) == common_type(y.real()) &&
         common_type(x.imag()) == common_type(y.imag());
}

// TODO (here and elsewhere) decide if we should convert to a Kokkos::complex
// and do the comparison in a device-marked function
//! Binary == operator for std::complex complex.
+template <class RealType1, class RealType2> +inline bool operator==(std::complex<RealType1> const& x, + complex<RealType2> const& y) noexcept { + using common_type = typename std::common_type<RealType1, RealType2>::type; + return common_type(x.real()) == common_type(y.real()) && + common_type(x.imag()) == common_type(y.imag()); +} + +//! Binary == operator for complex std::complex. +template <class RealType1, class RealType2> +inline bool operator==(complex<RealType1> const& x, + std::complex<RealType2> const& y) noexcept { + using common_type = typename std::common_type<RealType1, RealType2>::type; + return common_type(x.real()) == common_type(y.real()) && + common_type(x.imag()) == common_type(y.imag()); +} + +//! Binary == operator for complex real. +template < + class RealType1, class RealType2, + // Constraints to avoid participation in oparator==() for every possible RHS + typename std::enable_if<std::is_convertible<RealType2, RealType1>::value, + int>::type = 0> +KOKKOS_INLINE_FUNCTION bool operator==(complex<RealType1> const& x, + RealType2 const& y) noexcept { + using common_type = typename std::common_type<RealType1, RealType2>::type; + return common_type(x.real()) == common_type(y) && + common_type(x.imag()) == common_type(0); +} + +//! Binary == operator for real complex. +template < + class RealType1, class RealType2, + // Constraints to avoid participation in oparator==() for every possible RHS + typename std::enable_if<std::is_convertible<RealType1, RealType2>::value, + int>::type = 0> +KOKKOS_INLINE_FUNCTION bool operator==(RealType1 const& x, + complex<RealType2> const& y) noexcept { + using common_type = typename std::common_type<RealType1, RealType2>::type; + return common_type(x) == common_type(y.real()) && + common_type(0) == common_type(y.imag()); +} + +//! Binary != operator for complex complex. 
+template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION bool operator!=(complex<RealType1> const& x, + complex<RealType2> const& y) noexcept { + using common_type = typename std::common_type<RealType1, RealType2>::type; + return common_type(x.real()) != common_type(y.real()) || + common_type(x.imag()) != common_type(y.imag()); +} + +//! Binary != operator for std::complex complex. +template <class RealType1, class RealType2> +inline bool operator!=(std::complex<RealType1> const& x, + complex<RealType2> const& y) noexcept { + using common_type = typename std::common_type<RealType1, RealType2>::type; + return common_type(x.real()) != common_type(y.real()) || + common_type(x.imag()) != common_type(y.imag()); +} + +//! Binary != operator for complex std::complex. +template <class RealType1, class RealType2> +inline bool operator!=(complex<RealType1> const& x, + std::complex<RealType2> const& y) noexcept { + using common_type = typename std::common_type<RealType1, RealType2>::type; + return common_type(x.real()) != common_type(y.real()) || + common_type(x.imag()) != common_type(y.imag()); +} + +//! Binary != operator for complex real. +template < + class RealType1, class RealType2, + // Constraints to avoid participation in oparator==() for every possible RHS + typename std::enable_if<std::is_convertible<RealType2, RealType1>::value, + int>::type = 0> +KOKKOS_INLINE_FUNCTION bool operator!=(complex<RealType1> const& x, + RealType2 const& y) noexcept { + using common_type = typename std::common_type<RealType1, RealType2>::type; + return common_type(x.real()) != common_type(y) || + common_type(x.imag()) != common_type(0); +} + +//! Binary != operator for real complex. 
+template < + class RealType1, class RealType2, + // Constraints to avoid participation in oparator==() for every possible RHS + typename std::enable_if<std::is_convertible<RealType1, RealType2>::value, + int>::type = 0> +KOKKOS_INLINE_FUNCTION bool operator!=(RealType1 const& x, + complex<RealType2> const& y) noexcept { + using common_type = typename std::common_type<RealType1, RealType2>::type; + return common_type(x) != common_type(y.real()) || + common_type(0) != common_type(y.imag()); +} + +// </editor-fold> end Equality and inequality }}}1 +//============================================================================== + +//! Binary + operator for complex complex. +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator+(const complex<RealType1>& x, + const complex<RealType2>& y) noexcept { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x.real() + y.real(), x.imag() + y.imag()); +} + +//! Binary + operator for complex scalar. +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator+(const complex<RealType1>& x, const RealType2& y) noexcept { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x.real() + y, x.imag()); +} + +//! Binary + operator for scalar complex. +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator+(const RealType1& x, const complex<RealType2>& y) noexcept { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x + y.real(), y.imag()); +} + +//! Unary + operator for complex. +template <class RealType> +KOKKOS_INLINE_FUNCTION complex<RealType> operator+( + const complex<RealType>& x) noexcept { + return complex<RealType>{+x.real(), +x.imag()}; +} + +//! Binary - operator for complex. 
+template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator-(const complex<RealType1>& x, + const complex<RealType2>& y) noexcept { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x.real() - y.real(), x.imag() - y.imag()); +} + +//! Binary - operator for complex scalar. +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator-(const complex<RealType1>& x, const RealType2& y) noexcept { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x.real() - y, x.imag()); +} + +//! Binary - operator for scalar complex. +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator-(const RealType1& x, const complex<RealType2>& y) noexcept { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x - y.real(), -y.imag()); +} + +//! Unary - operator for complex. +template <class RealType> +KOKKOS_INLINE_FUNCTION complex<RealType> operator-( + const complex<RealType>& x) noexcept { + return complex<RealType>(-x.real(), -x.imag()); +} + +//! Binary * operator for complex. +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator*(const complex<RealType1>& x, + const complex<RealType2>& y) noexcept { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x.real() * y.real() - x.imag() * y.imag(), + x.real() * y.imag() + x.imag() * y.real()); +} + +/// \brief Binary * operator for std::complex and complex. +/// +/// This needs to exist because template parameters can't be deduced when +/// conversions occur. 
We could probably fix this using hidden friends patterns +/// +/// This function cannot be called in a CUDA device function, because +/// std::complex's methods and nonmember functions are not marked as +/// CUDA device functions. +template <class RealType1, class RealType2> +inline complex<typename std::common_type<RealType1, RealType2>::type> operator*( + const std::complex<RealType1>& x, const complex<RealType2>& y) { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x.real() * y.real() - x.imag() * y.imag(), + x.real() * y.imag() + x.imag() * y.real()); +} + +/// \brief Binary * operator for RealType times complex. +/// +/// This function exists because the compiler doesn't know that +/// RealType and complex<RealType> commute with respect to operator*. +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator*(const RealType1& x, const complex<RealType2>& y) noexcept { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x * y.real(), x * y.imag()); +} + +/// \brief Binary * operator for RealType times complex. +/// +/// This function exists because the compiler doesn't know that +/// RealType and complex<RealType> commute with respect to operator*. +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator*(const complex<RealType1>& y, const RealType2& x) noexcept { + return complex<typename std::common_type<RealType1, RealType2>::type>( + x * y.real(), x * y.imag()); +} + +//! Imaginary part of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION RealType imag(const complex<RealType>& x) noexcept { + return x.imag(); +} + +//! Real part of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION RealType real(const complex<RealType>& x) noexcept { + return x.real(); +} + +//! 
Constructs a complex number from magnitude and phase angle +template <class T> +KOKKOS_INLINE_FUNCTION complex<T> polar(const T& r, const T& theta = T()) { + using Kokkos::Experimental::cos; + using Kokkos::Experimental::sin; + KOKKOS_EXPECTS(r >= 0); + return complex<T>(r * cos(theta), r * sin(theta)); +} + +//! Absolute value (magnitude) of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION RealType abs(const complex<RealType>& x) { + using Kokkos::Experimental::hypot; + return hypot(x.real(), x.imag()); +} + +//! Power of a complex number +template <class T> +KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x, const T& y) { + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::pow; + T r = abs(x); + T theta = atan2(x.imag(), x.real()); + return polar(pow(r, y), y * theta); +} + +template <class T> +KOKKOS_INLINE_FUNCTION complex<T> pow(const T& x, const complex<T>& y) { + return pow(complex<T>(x), y); +} + +template <class T> +KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x, + const complex<T>& y) { + using Kokkos::Experimental::log; + + return x == T() ? 
T() : exp(y * log(x)); +} + +namespace Impl { +// NOTE promote would also be useful for math functions +template <class T, bool = std::is_integral<T>::value> +struct promote { + using type = double; +}; +template <class T> +struct promote<T, false> {}; +template <> +struct promote<long double> { + using type = long double; +}; +template <> +struct promote<double> { + using type = double; +}; +template <> +struct promote<float> { + using type = float; +}; +template <class T> +using promote_t = typename promote<T>::type; +template <class T, class U> +struct promote_2 { + using type = decltype(promote_t<T>() + promote_t<U>()); +}; +template <class T, class U> +using promote_2_t = typename promote_2<T, U>::type; +} // namespace Impl + +template <class T, class U, + class = std::enable_if_t<std::is_arithmetic<T>::value>> +KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow( + const T& x, const complex<U>& y) { + using type = Impl::promote_2_t<T, U>; + return pow(type(x), complex<type>(y)); +} + +template <class T, class U, + class = std::enable_if_t<std::is_arithmetic<U>::value>> +KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow(const complex<T>& x, + const U& y) { + using type = Impl::promote_2_t<T, U>; + return pow(complex<type>(x), type(y)); +} + +template <class T, class U> +KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow( + const complex<T>& x, const complex<U>& y) { + using type = Impl::promote_2_t<T, U>; + return pow(complex<type>(x), complex<type>(y)); +} + +//! Square root of a complex number. This is intended to match the stdc++ +//! implementation, which returns sqrt(z*z) = z; where z is complex number. 
+template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sqrt( + const complex<RealType>& x) { + using Kokkos::Experimental::fabs; + using Kokkos::Experimental::sqrt; + + RealType r = x.real(); + RealType i = x.imag(); + + if (r == RealType()) { + RealType t = sqrt(fabs(i) / 2); + return Kokkos::complex<RealType>(t, i < RealType() ? -t : t); + } else { + RealType t = sqrt(2 * (abs(x) + fabs(r))); + RealType u = t / 2; + return r > RealType() ? Kokkos::complex<RealType>(u, i / t) + : Kokkos::complex<RealType>(fabs(i) / t, + i < RealType() ? -u : u); + } +} + +//! Conjugate of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION complex<RealType> conj( + const complex<RealType>& x) noexcept { + return complex<RealType>(real(x), -imag(x)); +} + +//! Exponential of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION complex<RealType> exp(const complex<RealType>& x) { + using Kokkos::Experimental::cos; + using Kokkos::Experimental::exp; + using Kokkos::Experimental::sin; + return exp(x.real()) * complex<RealType>(cos(x.imag()), sin(x.imag())); +} + +//! natural log of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> log( + const complex<RealType>& x) { + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; + RealType phi = atan2(x.imag(), x.real()); + return Kokkos::complex<RealType>(log(abs(x)), phi); +} + +//! sine of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sin( + const complex<RealType>& x) { + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; + return Kokkos::complex<RealType>(sin(x.real()) * cosh(x.imag()), + cos(x.real()) * sinh(x.imag())); +} + +//! cosine of a complex number. 
+template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> cos( + const complex<RealType>& x) { + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; + return Kokkos::complex<RealType>(cos(x.real()) * cosh(x.imag()), + -sin(x.real()) * sinh(x.imag())); +} + +//! tangent of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> tan( + const complex<RealType>& x) { + return sin(x) / cos(x); +} + +//! hyperbolic sine of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> sinh( + const complex<RealType>& x) { + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; + return Kokkos::complex<RealType>(sinh(x.real()) * cos(x.imag()), + cosh(x.real()) * sin(x.imag())); +} + +//! hyperbolic cosine of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> cosh( + const complex<RealType>& x) { + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; + return Kokkos::complex<RealType>(cosh(x.real()) * cos(x.imag()), + sinh(x.real()) * sin(x.imag())); +} + +//! hyperbolic tangent of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> tanh( + const complex<RealType>& x) { + return sinh(x) / cosh(x); +} + +//! inverse hyperbolic sine of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> asinh( + const complex<RealType>& x) { + return log(x + sqrt(x * x + RealType(1.0))); +} + +//! inverse hyperbolic cosine of a complex number. 
+template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> acosh( + const complex<RealType>& x) { + return RealType(2.0) * log(sqrt(RealType(0.5) * (x + RealType(1.0))) + + sqrt(RealType(0.5) * (x - RealType(1.0)))); +} + +//! inverse hyperbolic tangent of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> atanh( + const complex<RealType>& x) { + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; + + const RealType i2 = x.imag() * x.imag(); + const RealType r = RealType(1.0) - i2 - x.real() * x.real(); + + RealType p = RealType(1.0) + x.real(); + RealType m = RealType(1.0) - x.real(); + + p = i2 + p * p; + m = i2 + m * m; + + RealType phi = atan2(RealType(2.0) * x.imag(), r); + return Kokkos::complex<RealType>(RealType(0.25) * (log(p) - log(m)), + RealType(0.5) * phi); +} + +//! inverse sine of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> asin( + const complex<RealType>& x) { + Kokkos::complex<RealType> t = + asinh(Kokkos::complex<RealType>(-x.imag(), x.real())); + return Kokkos::complex<RealType>(t.imag(), -t.real()); +} + +//! inverse cosine of a complex number. +template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> acos( + const complex<RealType>& x) { + using Kokkos::Experimental::acos; + Kokkos::complex<RealType> t = asin(x); + RealType pi_2 = acos(RealType(0.0)); + return Kokkos::complex<RealType>(pi_2 - t.real(), -t.imag()); +} + +//! inverse tangent of a complex number. 
+template <class RealType> +KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> atan( + const complex<RealType>& x) { + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; + const RealType r2 = x.real() * x.real(); + const RealType i = RealType(1.0) - r2 - x.imag() * x.imag(); + + RealType p = x.imag() + RealType(1.0); + RealType m = x.imag() - RealType(1.0); + + p = r2 + p * p; + m = r2 + m * m; + + return Kokkos::complex<RealType>( + RealType(0.5) * atan2(RealType(2.0) * x.real(), i), + RealType(0.25) * log(p / m)); +} + +/// This function cannot be called in a CUDA device function, +/// because std::complex's methods and nonmember functions are not +/// marked as CUDA device functions. +template <class RealType> +inline complex<RealType> exp(const std::complex<RealType>& c) { + return complex<RealType>(std::exp(c.real()) * std::cos(c.imag()), + std::exp(c.real()) * std::sin(c.imag())); +} + +//! Binary operator / for complex and real numbers +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator/(const complex<RealType1>& x, + const RealType2& y) noexcept(noexcept(RealType1{} / + RealType2{})) { + return complex<typename std::common_type<RealType1, RealType2>::type>( + real(x) / y, imag(x) / y); +} + +//! Binary operator / for complex. +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator/(const complex<RealType1>& x, + const complex<RealType2>& y) noexcept(noexcept(RealType1{} / + RealType2{})) { + using Kokkos::Experimental::fabs; + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. + // If the real part is +/-Inf and the imaginary part is -/+Inf, + // this won't change the result. 
+ using common_real_type = + typename std::common_type<RealType1, RealType2>::type; + const common_real_type s = fabs(real(y)) + fabs(imag(y)); + + // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. + // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, + // because y/s is NaN. + if (s == 0.0) { + return complex<common_real_type>(real(x) / s, imag(x) / s); + } else { + const complex<common_real_type> x_scaled(real(x) / s, imag(x) / s); + const complex<common_real_type> y_conj_scaled(real(y) / s, -imag(y) / s); + const RealType1 y_scaled_abs = + real(y_conj_scaled) * real(y_conj_scaled) + + imag(y_conj_scaled) * imag(y_conj_scaled); // abs(y) == abs(conj(y)) + complex<common_real_type> result = x_scaled * y_conj_scaled; + result /= y_scaled_abs; + return result; + } +} + +//! Binary operator / for complex and real numbers +template <class RealType1, class RealType2> +KOKKOS_INLINE_FUNCTION + complex<typename std::common_type<RealType1, RealType2>::type> + operator/(const RealType1& x, + const complex<RealType2>& y) noexcept(noexcept(RealType1{} / + RealType2{})) { + return complex<typename std::common_type<RealType1, RealType2>::type>(x) / y; +} + +template <class RealType> +std::ostream& operator<<(std::ostream& os, const complex<RealType>& x) { + const std::complex<RealType> x_std(Kokkos::real(x), Kokkos::imag(x)); + os << x_std; + return os; +} + +template <class RealType> +std::istream& operator>>(std::istream& is, complex<RealType>& x) { + std::complex<RealType> x_std; + is >> x_std; + x = x_std; // only assigns on success of above + return is; +} + +template <class T> +struct reduction_identity<Kokkos::complex<T>> { + using t_red_ident = reduction_identity<T>; + KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> + sum() noexcept { + return Kokkos::complex<T>(t_red_ident::sum(), t_red_ident::sum()); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> + prod() noexcept { + return 
Kokkos::complex<T>(t_red_ident::prod(), t_red_ident::sum()); + } +}; + +} // namespace Kokkos + +#endif // KOKKOS_COMPLEX_HPP diff --git a/packages/kokkos/core/src/Kokkos_Concepts.hpp b/packages/kokkos/core/src/Kokkos_Concepts.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2aba189487490d4f870cec407ec1d1f3b9ed001e --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Concepts.hpp @@ -0,0 +1,498 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_CONCEPTS_HPP +#define KOKKOS_CORE_CONCEPTS_HPP + +#include <type_traits> + +// Needed for 'is_space<S>::host_mirror_space +#include <Kokkos_Core_fwd.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +// Schedules for Execution Policies +struct Static {}; +struct Dynamic {}; + +// Schedule Wrapper Type +template <class T> +struct Schedule { + static_assert(std::is_same<T, Static>::value || + std::is_same<T, Dynamic>::value, + "Kokkos: Invalid Schedule<> type."); + using schedule_type = Schedule; + using type = T; +}; + +// Specify Iteration Index Type +template <typename T> +struct IndexType { + static_assert(std::is_integral<T>::value, "Kokkos: Invalid IndexType<>."); + using index_type = IndexType; + using type = T; +}; + +namespace Experimental { +struct WorkItemProperty { + template <unsigned long Property> + struct ImplWorkItemProperty { + static const unsigned value = Property; + using work_item_property = ImplWorkItemProperty<Property>; + }; + + constexpr static const ImplWorkItemProperty<0> None = + ImplWorkItemProperty<0>(); + constexpr static const ImplWorkItemProperty<1> HintLightWeight = + 
ImplWorkItemProperty<1>(); + constexpr static const ImplWorkItemProperty<2> HintHeavyWeight = + ImplWorkItemProperty<2>(); + constexpr static const ImplWorkItemProperty<4> HintRegular = + ImplWorkItemProperty<4>(); + constexpr static const ImplWorkItemProperty<8> HintIrregular = + ImplWorkItemProperty<8>(); + using None_t = ImplWorkItemProperty<0>; + using HintLightWeight_t = ImplWorkItemProperty<1>; + using HintHeavyWeight_t = ImplWorkItemProperty<2>; + using HintRegular_t = ImplWorkItemProperty<4>; + using HintIrregular_t = ImplWorkItemProperty<8>; +}; + +template <unsigned long pv1, unsigned long pv2> +inline constexpr WorkItemProperty::ImplWorkItemProperty<pv1 | pv2> operator|( + WorkItemProperty::ImplWorkItemProperty<pv1>, + WorkItemProperty::ImplWorkItemProperty<pv2>) { + return WorkItemProperty::ImplWorkItemProperty<pv1 | pv2>(); +} + +template <unsigned long pv1, unsigned long pv2> +inline constexpr WorkItemProperty::ImplWorkItemProperty<pv1 & pv2> operator&( + WorkItemProperty::ImplWorkItemProperty<pv1>, + WorkItemProperty::ImplWorkItemProperty<pv2>) { + return WorkItemProperty::ImplWorkItemProperty<pv1 & pv2>(); +} + +template <unsigned long pv1, unsigned long pv2> +inline constexpr bool operator==(WorkItemProperty::ImplWorkItemProperty<pv1>, + WorkItemProperty::ImplWorkItemProperty<pv2>) { + return pv1 == pv2; +} + +} // namespace Experimental + +/**\brief Specify Launch Bounds for CUDA execution. + * + * If no launch bounds specified then do not set launch bounds. 
+ */ +template <unsigned int maxT = 0 /* Max threads per block */ + , + unsigned int minB = 0 /* Min blocks per SM */ + > +struct LaunchBounds { + using launch_bounds = LaunchBounds; + using type = LaunchBounds<maxT, minB>; + static unsigned int constexpr maxTperB{maxT}; + static unsigned int constexpr minBperSM{minB}; +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +#define KOKKOS_IMPL_IS_CONCEPT(CONCEPT) \ + template <typename T> \ + struct is_##CONCEPT { \ + private: \ + template <typename, typename = std::true_type> \ + struct have : std::false_type {}; \ + template <typename U> \ + struct have<U, typename std::is_base_of<typename U::CONCEPT, U>::type> \ + : std::true_type {}; \ + template <typename U> \ + struct have<U, \ + typename std::is_base_of<typename U::CONCEPT##_type, U>::type> \ + : std::true_type {}; \ + \ + public: \ + static constexpr bool value = \ + is_##CONCEPT::template have<typename std::remove_cv<T>::type>::value; \ + constexpr operator bool() const noexcept { return value; } \ + }; + +// Public concept: + +KOKKOS_IMPL_IS_CONCEPT(memory_space) +KOKKOS_IMPL_IS_CONCEPT(memory_traits) +KOKKOS_IMPL_IS_CONCEPT(execution_space) +KOKKOS_IMPL_IS_CONCEPT(execution_policy) +KOKKOS_IMPL_IS_CONCEPT(array_layout) +KOKKOS_IMPL_IS_CONCEPT(reducer) +namespace Experimental { +KOKKOS_IMPL_IS_CONCEPT(work_item_property) +} + +namespace Impl { + +// For backward compatibility: + +using Kokkos::is_array_layout; +using Kokkos::is_execution_policy; +using Kokkos::is_execution_space; +using Kokkos::is_memory_space; +using Kokkos::is_memory_traits; + +// Implementation concept: + +KOKKOS_IMPL_IS_CONCEPT(iteration_pattern) +KOKKOS_IMPL_IS_CONCEPT(schedule_type) +KOKKOS_IMPL_IS_CONCEPT(index_type) +KOKKOS_IMPL_IS_CONCEPT(launch_bounds) +KOKKOS_IMPL_IS_CONCEPT(thread_team_member) 
+KOKKOS_IMPL_IS_CONCEPT(host_thread_team_member) +KOKKOS_IMPL_IS_CONCEPT(graph_kernel) + +} // namespace Impl + +#undef KOKKOS_IMPL_IS_CONCEPT + +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +template <class Object> +class has_member_team_shmem_size { + template <typename T> + static int32_t test_for_member(decltype(&T::team_shmem_size)) { + return int32_t(0); + } + template <typename T> + static int64_t test_for_member(...) { + return int64_t(0); + } + + public: + constexpr static bool value = + sizeof(test_for_member<Object>(nullptr)) == sizeof(int32_t); +}; + +template <class Object> +class has_member_shmem_size { + template <typename T> + static int32_t test_for_member(decltype(&T::shmem_size_me)) { + return int32_t(0); + } + template <typename T> + static int64_t test_for_member(...) { + return int64_t(0); + } + + public: + constexpr static bool value = + sizeof(test_for_member<Object>(0)) == sizeof(int32_t); +}; + +} // namespace Impl +} // namespace Kokkos +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <class ExecutionSpace, class MemorySpace> +struct Device { + static_assert(Kokkos::is_execution_space<ExecutionSpace>::value, + "Execution space is not valid"); + static_assert(Kokkos::is_memory_space<MemorySpace>::value, + "Memory space is not valid"); + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + using device_type = Device<execution_space, memory_space>; +}; + +namespace Impl { + +template <typename T> +struct is_device_helper : std::false_type {}; + +template <typename ExecutionSpace, typename MemorySpace> +struct is_device_helper<Device<ExecutionSpace, MemorySpace>> : std::true_type { +}; + +} // namespace Impl + +template <typename T> +using is_device = + typename Impl::is_device_helper<typename std::remove_cv<T>::type>::type; + +//---------------------------------------------------------------------------- + +template <typename T> 
+struct is_space { + private: + template <typename, typename = void> + struct exe : std::false_type { + using space = void; + }; + + template <typename, typename = void> + struct mem : std::false_type { + using space = void; + }; + + template <typename, typename = void> + struct dev : std::false_type { + using space = void; + }; + + template <typename U> + struct exe<U, typename std::conditional<true, void, + typename U::execution_space>::type> + : std::is_same<U, typename U::execution_space>::type { + using space = typename U::execution_space; + }; + + template <typename U> + struct mem< + U, typename std::conditional<true, void, typename U::memory_space>::type> + : std::is_same<U, typename U::memory_space>::type { + using space = typename U::memory_space; + }; + + template <typename U> + struct dev< + U, typename std::conditional<true, void, typename U::device_type>::type> + : std::is_same<U, typename U::device_type>::type { + using space = typename U::device_type; + }; + + using is_exe = + typename is_space<T>::template exe<typename std::remove_cv<T>::type>; + using is_mem = + typename is_space<T>::template mem<typename std::remove_cv<T>::type>; + using is_dev = + typename is_space<T>::template dev<typename std::remove_cv<T>::type>; + + public: + static constexpr bool value = is_exe::value || is_mem::value || is_dev::value; + + constexpr operator bool() const noexcept { return value; } + + using execution_space = typename is_exe::space; + using memory_space = typename is_mem::space; + + // For backward compatibility, deprecated in favor of + // Kokkos::Impl::HostMirror<S>::host_mirror_space + + using host_memory_space = typename std::conditional< + std::is_same<memory_space, Kokkos::HostSpace>::value +#if defined(KOKKOS_ENABLE_CUDA) + || std::is_same<memory_space, Kokkos::CudaUVMSpace>::value || + std::is_same<memory_space, Kokkos::CudaHostPinnedSpace>::value +#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ + , + memory_space, Kokkos::HostSpace>::type; + +#if 
defined(KOKKOS_ENABLE_CUDA) + using host_execution_space = typename std::conditional< + std::is_same<execution_space, Kokkos::Cuda>::value, + Kokkos::DefaultHostExecutionSpace, execution_space>::type; +#else +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + using host_execution_space = typename std::conditional< + std::is_same<execution_space, Kokkos::Experimental::OpenMPTarget>::value, + Kokkos::DefaultHostExecutionSpace, execution_space>::type; +#else + using host_execution_space = execution_space; +#endif +#endif + + using host_mirror_space = typename std::conditional< + std::is_same<execution_space, host_execution_space>::value && + std::is_same<memory_space, host_memory_space>::value, + T, Kokkos::Device<host_execution_space, host_memory_space>>::type; +}; + +// For backward compatibility + +namespace Impl { + +using Kokkos::is_space; + +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/**\brief Access relationship between DstMemorySpace and SrcMemorySpace + * + * The default case can assume accessibility for the same space. + * Specializations must be defined for different memory spaces. + */ +template <typename DstMemorySpace, typename SrcMemorySpace> +struct MemorySpaceAccess { + static_assert(Kokkos::is_memory_space<DstMemorySpace>::value && + Kokkos::is_memory_space<SrcMemorySpace>::value, + "template arguments must be memory spaces"); + + /**\brief Can a View (or pointer) to memory in SrcMemorySpace + * be assigned to a View (or pointer) to memory marked DstMemorySpace. + * + * 1. DstMemorySpace::execution_space == SrcMemorySpace::execution_space + * 2. All execution spaces that can access DstMemorySpace can also access + * SrcMemorySpace. + */ + enum { assignable = std::is_same<DstMemorySpace, SrcMemorySpace>::value }; + + /**\brief For all DstExecSpace::memory_space == DstMemorySpace + * DstExecSpace can access SrcMemorySpace. 
+ */ + enum { accessible = assignable }; + + /**\brief Does a DeepCopy capability exist + * to DstMemorySpace from SrcMemorySpace + */ + enum { deepcopy = assignable }; +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +/**\brief Can AccessSpace access MemorySpace ? + * + * Requires: + * Kokkos::is_space< AccessSpace >::value + * Kokkos::is_memory_space< MemorySpace >::value + * + * Can AccessSpace::execution_space access MemorySpace ? + * enum : bool { accessible }; + * + * Is View<AccessSpace::memory_space> assignable from View<MemorySpace> ? + * enum : bool { assignable }; + * + * If ! accessible then through which intercessory memory space + * should a be used to deep copy memory for + * AccessSpace::execution_space + * to get access. + * When AccessSpace::memory_space == Kokkos::HostSpace + * then space is the View host mirror space. + */ +template <typename AccessSpace, typename MemorySpace> +struct SpaceAccessibility { + private: + static_assert(Kokkos::is_space<AccessSpace>::value, + "template argument #1 must be a Kokkos space"); + + static_assert(Kokkos::is_memory_space<MemorySpace>::value, + "template argument #2 must be a Kokkos memory space"); + + // The input AccessSpace may be a Device<ExecSpace,MemSpace> + // verify that it is a valid combination of spaces. + static_assert(Kokkos::Impl::MemorySpaceAccess< + typename AccessSpace::execution_space::memory_space, + typename AccessSpace::memory_space>::accessible, + "template argument #1 is an invalid space"); + + using exe_access = Kokkos::Impl::MemorySpaceAccess< + typename AccessSpace::execution_space::memory_space, MemorySpace>; + + using mem_access = + Kokkos::Impl::MemorySpaceAccess<typename AccessSpace::memory_space, + MemorySpace>; + + public: + /**\brief Can AccessSpace::execution_space access MemorySpace ? + * + * Default based upon memory space accessibility. + * Specialization required for other relationships. 
+ */ + enum { accessible = exe_access::accessible }; + + /**\brief Can assign to AccessSpace from MemorySpace ? + * + * Default based upon memory space accessibility. + * Specialization required for other relationships. + */ + enum { + assignable = is_memory_space<AccessSpace>::value && mem_access::assignable + }; + + /**\brief Can deep copy to AccessSpace::memory_Space from MemorySpace ? */ + enum { deepcopy = mem_access::deepcopy }; + + // What intercessory space for AccessSpace::execution_space + // to be able to access MemorySpace? + // If same memory space or not accessible use the AccessSpace + // else construct a device with execution space and memory space. + using space = typename std::conditional< + std::is_same<typename AccessSpace::memory_space, MemorySpace>::value || + !exe_access::accessible, + AccessSpace, + Kokkos::Device<typename AccessSpace::execution_space, MemorySpace>>::type; +}; + +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +using Kokkos::SpaceAccessibility; // For backward compatibility + +} +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +#endif // KOKKOS_CORE_CONCEPTS_HPP diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a27d5f0e47284f7d06b3d9218d1f02bfb679468e --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp @@ -0,0 +1,3264 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_COPYVIEWS_HPP_ +#define KOKKOS_COPYVIEWS_HPP_ +#include <string> +#include <Kokkos_Parallel.hpp> +#include <KokkosExp_MDRangePolicy.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +template <class Layout> +struct ViewFillLayoutSelector {}; + +template <> +struct ViewFillLayoutSelector<Kokkos::LayoutLeft> { + static const Kokkos::Iterate iterate = Kokkos::Iterate::Left; +}; + +template <> +struct ViewFillLayoutSelector<Kokkos::LayoutRight> { + static const Kokkos::Iterate iterate = Kokkos::Iterate::Right; +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +template <class ViewType, class Layout, class ExecSpace, typename iType> +struct ViewFill<ViewType, Layout, ExecSpace, 0, iType> { + using ST = typename ViewType::non_const_value_type; + ViewFill(const ViewType& a, const ST& val, const ExecSpace& space) { + Kokkos::Impl::DeepCopy<typename ViewType::memory_space, Kokkos::HostSpace, + ExecSpace>(space, a.data(), &val, sizeof(ST)); + } +}; + +template <class ViewType, class Layout, class ExecSpace, typename iType> +struct ViewFill<ViewType, Layout, ExecSpace, 1, iType> { + ViewType a; + typename ViewType::const_value_type val; + using policy_type = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>>; + + ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, + const ExecSpace& space) + : a(a_), val(val_) { + Kokkos::parallel_for("Kokkos::ViewFill-1D", + policy_type(space, 0, a.extent(0)), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i) const { a(i) = val; }; +}; + +template <class ViewType, class Layout, class ExecSpace, typename iType> +struct ViewFill<ViewType, Layout, ExecSpace, 
2, iType> { + ViewType a; + typename ViewType::const_value_type val; + + using iterate_type = Kokkos::Rank<2, ViewFillLayoutSelector<Layout>::iterate, + ViewFillLayoutSelector<Layout>::iterate>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, + const ExecSpace& space) + : a(a_), val(val_) { + Kokkos::parallel_for("Kokkos::ViewFill-2D", + policy_type(space, {0, 0}, {a.extent(0), a.extent(1)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1) const { a(i0, i1) = val; }; +}; + +template <class ViewType, class Layout, class ExecSpace, typename iType> +struct ViewFill<ViewType, Layout, ExecSpace, 3, iType> { + ViewType a; + typename ViewType::const_value_type val; + + using iterate_type = Kokkos::Rank<3, ViewFillLayoutSelector<Layout>::iterate, + ViewFillLayoutSelector<Layout>::iterate>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, + const ExecSpace& space) + : a(a_), val(val_) { + Kokkos::parallel_for( + "Kokkos::ViewFill-3D", + policy_type(space, {0, 0, 0}, {a.extent(0), a.extent(1), a.extent(2)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1, const iType& i2) const { + a(i0, i1, i2) = val; + }; +}; + +template <class ViewType, class Layout, class ExecSpace, typename iType> +struct ViewFill<ViewType, Layout, ExecSpace, 4, iType> { + ViewType a; + typename ViewType::const_value_type val; + + using iterate_type = Kokkos::Rank<4, ViewFillLayoutSelector<Layout>::iterate, + ViewFillLayoutSelector<Layout>::iterate>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, + const ExecSpace& space) + : a(a_), val(val_) { 
+ Kokkos::parallel_for( + "Kokkos::ViewFill-4D", + policy_type(space, {0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1, const iType& i2, + const iType& i3) const { + a(i0, i1, i2, i3) = val; + }; +}; + +template <class ViewType, class Layout, class ExecSpace, typename iType> +struct ViewFill<ViewType, Layout, ExecSpace, 5, iType> { + ViewType a; + typename ViewType::const_value_type val; + + using iterate_type = Kokkos::Rank<5, ViewFillLayoutSelector<Layout>::iterate, + ViewFillLayoutSelector<Layout>::iterate>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, + const ExecSpace& space) + : a(a_), val(val_) { + Kokkos::parallel_for("Kokkos::ViewFill-5D", + policy_type(space, {0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), + a.extent(3), a.extent(4)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1, const iType& i2, + const iType& i3, const iType& i4) const { + a(i0, i1, i2, i3, i4) = val; + }; +}; + +template <class ViewType, class Layout, class ExecSpace, typename iType> +struct ViewFill<ViewType, Layout, ExecSpace, 6, iType> { + ViewType a; + typename ViewType::const_value_type val; + + using iterate_type = Kokkos::Rank<6, ViewFillLayoutSelector<Layout>::iterate, + ViewFillLayoutSelector<Layout>::iterate>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, + const ExecSpace& space) + : a(a_), val(val_) { + Kokkos::parallel_for("Kokkos::ViewFill-6D", + policy_type(space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), + a.extent(3), a.extent(4), a.extent(5)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const 
iType& i1, const iType& i2,
+                  const iType& i3, const iType& i4, const iType& i5) const {
+    a(i0, i1, i2, i3, i4, i5) = val;
+  };
+};
+
+template <class ViewType, class Layout, class ExecSpace, typename iType>
+struct ViewFill<ViewType, Layout, ExecSpace, 7, iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+
+  using iterate_type = Kokkos::Rank<6, ViewFillLayoutSelector<Layout>::iterate,
+                                    ViewFillLayoutSelector<Layout>::iterate>;
+  using policy_type =
+      Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_,
+           const ExecSpace& space)
+      : a(a_), val(val_) {
+    Kokkos::parallel_for("Kokkos::ViewFill-7D",
+                         policy_type(space, {0, 0, 0, 0, 0, 0},
+                                     {a.extent(0), a.extent(1), a.extent(3),
+                                      a.extent(4), a.extent(5), a.extent(6)}),
+                         *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const iType& i0, const iType& i1, const iType& i3,
+                  const iType& i4, const iType& i5, const iType& i6) const {
+    for (iType i2 = 0; i2 < iType(a.extent(2)); i2++)
+      a(i0, i1, i2, i3, i4, i5, i6) = val;
+  };
+};
+
+template <class ViewType, class Layout, class ExecSpace, typename iType>
+struct ViewFill<ViewType, Layout, ExecSpace, 8, iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+
+  using iterate_type = Kokkos::Rank<6, ViewFillLayoutSelector<Layout>::iterate,
+                                    ViewFillLayoutSelector<Layout>::iterate>;
+  using policy_type =
+      Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_,
+           const ExecSpace& space)
+      : a(a_), val(val_) {
+    Kokkos::parallel_for("Kokkos::ViewFill-8D",
+                         policy_type(space, {0, 0, 0, 0, 0, 0},
+                                     {a.extent(0), a.extent(1), a.extent(3),
+                                      a.extent(5), a.extent(6), a.extent(7)}),
+                         *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const iType& i0, const iType& i1, const iType& i3,
+                  const iType& i5, const iType& i6, const iType& i7) const {
+ for (iType i2 = 0; i2 < iType(a.extent(2)); i2++) + for (iType i4 = 0; i4 < iType(a.extent(4)); i4++) + a(i0, i1, i2, i3, i4, i5, i6, i7) = val; + }; +}; + +template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace, + typename iType> +struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 1, iType> { + ViewTypeA a; + ViewTypeB b; + + using policy_type = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>>; + using value_type = typename ViewTypeA::value_type; + + ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, + const ExecSpace space = ExecSpace()) + : a(a_), b(b_) { + Kokkos::parallel_for("Kokkos::ViewCopy-1D", + policy_type(space, 0, a.extent(0)), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0) const { + a(i0) = static_cast<value_type>(b(i0)); + }; +}; + +template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace, + typename iType> +struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 2, iType> { + ViewTypeA a; + ViewTypeB b; + static const Kokkos::Iterate outer_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + static const Kokkos::Iterate inner_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + using iterate_type = + Kokkos::Rank<2, outer_iteration_pattern, inner_iteration_pattern>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + using value_type = typename ViewTypeA::value_type; + + ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, + const ExecSpace space = ExecSpace()) + : a(a_), b(b_) { + Kokkos::parallel_for("Kokkos::ViewCopy-2D", + policy_type(space, {0, 0}, {a.extent(0), a.extent(1)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1) const { + a(i0, i1) = static_cast<value_type>(b(i0, i1)); + }; +}; + +template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace, + typename iType> +struct 
ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 3, iType> { + ViewTypeA a; + ViewTypeB b; + + static const Kokkos::Iterate outer_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + static const Kokkos::Iterate inner_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + using iterate_type = + Kokkos::Rank<3, outer_iteration_pattern, inner_iteration_pattern>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + using value_type = typename ViewTypeA::value_type; + + ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, + const ExecSpace space = ExecSpace()) + : a(a_), b(b_) { + Kokkos::parallel_for( + "Kokkos::ViewCopy-3D", + policy_type(space, {0, 0, 0}, {a.extent(0), a.extent(1), a.extent(2)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1, const iType& i2) const { + a(i0, i1, i2) = static_cast<value_type>(b(i0, i1, i2)); + }; +}; + +template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace, + typename iType> +struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 4, iType> { + ViewTypeA a; + ViewTypeB b; + + static const Kokkos::Iterate outer_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + static const Kokkos::Iterate inner_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + using iterate_type = + Kokkos::Rank<4, outer_iteration_pattern, inner_iteration_pattern>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, + const ExecSpace space = ExecSpace()) + : a(a_), b(b_) { + Kokkos::parallel_for( + "Kokkos::ViewCopy-4D", + policy_type(space, {0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), a.extent(3)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, 
const iType& i1, const iType& i2, + const iType& i3) const { + a(i0, i1, i2, i3) = b(i0, i1, i2, i3); + }; +}; + +template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace, + typename iType> +struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 5, iType> { + ViewTypeA a; + ViewTypeB b; + + static const Kokkos::Iterate outer_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + static const Kokkos::Iterate inner_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + using iterate_type = + Kokkos::Rank<5, outer_iteration_pattern, inner_iteration_pattern>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, + const ExecSpace space = ExecSpace()) + : a(a_), b(b_) { + Kokkos::parallel_for("Kokkos::ViewCopy-5D", + policy_type(space, {0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), + a.extent(3), a.extent(4)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1, const iType& i2, + const iType& i3, const iType& i4) const { + a(i0, i1, i2, i3, i4) = b(i0, i1, i2, i3, i4); + }; +}; + +template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace, + typename iType> +struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 6, iType> { + ViewTypeA a; + ViewTypeB b; + + static const Kokkos::Iterate outer_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + static const Kokkos::Iterate inner_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + using iterate_type = + Kokkos::Rank<6, outer_iteration_pattern, inner_iteration_pattern>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, + const ExecSpace space = ExecSpace()) + : a(a_), b(b_) 
{ + Kokkos::parallel_for("Kokkos::ViewCopy-6D", + policy_type(space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(2), + a.extent(3), a.extent(4), a.extent(5)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1, const iType& i2, + const iType& i3, const iType& i4, const iType& i5) const { + a(i0, i1, i2, i3, i4, i5) = b(i0, i1, i2, i3, i4, i5); + }; +}; + +template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace, + typename iType> +struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 7, iType> { + ViewTypeA a; + ViewTypeB b; + + static const Kokkos::Iterate outer_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + static const Kokkos::Iterate inner_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + using iterate_type = + Kokkos::Rank<6, outer_iteration_pattern, inner_iteration_pattern>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, + const ExecSpace space = ExecSpace()) + : a(a_), b(b_) { + Kokkos::parallel_for("Kokkos::ViewCopy-7D", + policy_type(space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(3), + a.extent(4), a.extent(5), a.extent(6)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1, const iType& i3, + const iType& i4, const iType& i5, const iType& i6) const { + for (iType i2 = 0; i2 < iType(a.extent(2)); i2++) + a(i0, i1, i2, i3, i4, i5, i6) = b(i0, i1, i2, i3, i4, i5, i6); + }; +}; + +template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace, + typename iType> +struct ViewCopy<ViewTypeA, ViewTypeB, Layout, ExecSpace, 8, iType> { + ViewTypeA a; + ViewTypeB b; + + static const Kokkos::Iterate outer_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern; + static const Kokkos::Iterate 
inner_iteration_pattern = + Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern; + using iterate_type = + Kokkos::Rank<6, outer_iteration_pattern, inner_iteration_pattern>; + using policy_type = + Kokkos::MDRangePolicy<ExecSpace, iterate_type, Kokkos::IndexType<iType>>; + + ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_, + const ExecSpace space = ExecSpace()) + : a(a_), b(b_) { + Kokkos::parallel_for("Kokkos::ViewCopy-8D", + policy_type(space, {0, 0, 0, 0, 0, 0}, + {a.extent(0), a.extent(1), a.extent(3), + a.extent(5), a.extent(6), a.extent(7)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const iType& i0, const iType& i1, const iType& i3, + const iType& i5, const iType& i6, const iType& i7) const { + for (iType i2 = 0; i2 < iType(a.extent(2)); i2++) + for (iType i4 = 0; i4 < iType(a.extent(4)); i4++) + a(i0, i1, i2, i3, i4, i5, i6, i7) = b(i0, i1, i2, i3, i4, i5, i6, i7); + }; +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +template <class ExecutionSpace, class DstType, class SrcType> +void view_copy(const ExecutionSpace& space, const DstType& dst, + const SrcType& src) { + using dst_memory_space = typename DstType::memory_space; + using src_memory_space = typename SrcType::memory_space; + + enum { + ExecCanAccessSrc = + Kokkos::Impl::SpaceAccessibility<ExecutionSpace, + src_memory_space>::accessible + }; + enum { + ExecCanAccessDst = + Kokkos::Impl::SpaceAccessibility<ExecutionSpace, + dst_memory_space>::accessible + }; + + if (!(ExecCanAccessSrc && ExecCanAccessDst)) { + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::view_copy called with invalid execution space"); + } else { + // Figure out iteration order in case we need it + int64_t strides[DstType::Rank + 1]; + dst.stride(strides); + Kokkos::Iterate iterate; + if (Kokkos::is_layouttiled<typename DstType::array_layout>::value) { + iterate = Kokkos::layout_iterate_type_selector< + typename 
DstType::array_layout>::outer_iteration_pattern; + } else if (std::is_same<typename DstType::array_layout, + Kokkos::LayoutRight>::value) { + iterate = Kokkos::Iterate::Right; + } else if (std::is_same<typename DstType::array_layout, + Kokkos::LayoutLeft>::value) { + iterate = Kokkos::Iterate::Left; + } else if (std::is_same<typename DstType::array_layout, + Kokkos::LayoutStride>::value) { + if (strides[0] > strides[DstType::Rank - 1]) + iterate = Kokkos::Iterate::Right; + else + iterate = Kokkos::Iterate::Left; + } else { + if (std::is_same<typename DstType::execution_space::array_layout, + Kokkos::LayoutRight>::value) + iterate = Kokkos::Iterate::Right; + else + iterate = Kokkos::Iterate::Left; + } + + if ((dst.span() >= size_t(std::numeric_limits<int>::max())) || + (src.span() >= size_t(std::numeric_limits<int>::max()))) { + if (iterate == Kokkos::Iterate::Right) + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutRight, ExecutionSpace, DstType::Rank, int64_t>( + dst, src, space); + else + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutLeft, ExecutionSpace, DstType::Rank, int64_t>( + dst, src, space); + } else { + if (iterate == Kokkos::Iterate::Right) + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutRight, ExecutionSpace, DstType::Rank, int>(dst, src, + space); + else + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutLeft, ExecutionSpace, DstType::Rank, int>(dst, src, + space); + } + } +} + +template <class DstType, class SrcType> +void view_copy(const DstType& dst, const SrcType& src) { + using dst_execution_space = typename DstType::execution_space; + 
using src_execution_space = typename SrcType::execution_space; + using dst_memory_space = typename DstType::memory_space; + using src_memory_space = typename SrcType::memory_space; + + enum { + DstExecCanAccessSrc = + Kokkos::Impl::SpaceAccessibility<dst_execution_space, + src_memory_space>::accessible + }; + + enum { + SrcExecCanAccessDst = + Kokkos::Impl::SpaceAccessibility<src_execution_space, + dst_memory_space>::accessible + }; + + if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) { + std::string message( + "Error: Kokkos::deep_copy with no available copy mechanism: "); + message += src.label(); + message += " to "; + message += dst.label(); + Kokkos::Impl::throw_runtime_exception(message); + } + + // Figure out iteration order in case we need it + int64_t strides[DstType::Rank + 1]; + dst.stride(strides); + Kokkos::Iterate iterate; + if (Kokkos::is_layouttiled<typename DstType::array_layout>::value) { + iterate = Kokkos::layout_iterate_type_selector< + typename DstType::array_layout>::outer_iteration_pattern; + } else if (std::is_same<typename DstType::array_layout, + Kokkos::LayoutRight>::value) { + iterate = Kokkos::Iterate::Right; + } else if (std::is_same<typename DstType::array_layout, + Kokkos::LayoutLeft>::value) { + iterate = Kokkos::Iterate::Left; + } else if (std::is_same<typename DstType::array_layout, + Kokkos::LayoutStride>::value) { + if (strides[0] > strides[DstType::Rank - 1]) + iterate = Kokkos::Iterate::Right; + else + iterate = Kokkos::Iterate::Left; + } else { + if (std::is_same<typename DstType::execution_space::array_layout, + Kokkos::LayoutRight>::value) + iterate = Kokkos::Iterate::Right; + else + iterate = Kokkos::Iterate::Left; + } + + if ((dst.span() >= size_t(std::numeric_limits<int>::max())) || + (src.span() >= size_t(std::numeric_limits<int>::max()))) { + if (DstExecCanAccessSrc) { + if (iterate == Kokkos::Iterate::Right) + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename 
SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutRight, dst_execution_space, DstType::Rank, int64_t>( + dst, src); + else + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutLeft, dst_execution_space, DstType::Rank, int64_t>( + dst, src); + } else { + if (iterate == Kokkos::Iterate::Right) + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutRight, src_execution_space, DstType::Rank, int64_t>( + dst, src); + else + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutLeft, src_execution_space, DstType::Rank, int64_t>( + dst, src); + } + } else { + if (DstExecCanAccessSrc) { + if (iterate == Kokkos::Iterate::Right) + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutRight, dst_execution_space, DstType::Rank, int>(dst, + src); + else + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutLeft, dst_execution_space, DstType::Rank, int>(dst, + src); + } else { + if (iterate == Kokkos::Iterate::Right) + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutRight, src_execution_space, DstType::Rank, int>(dst, + src); + else + Kokkos::Impl::ViewCopy< + typename DstType::uniform_runtime_nomemspace_type, + typename SrcType::uniform_runtime_const_nomemspace_type, + Kokkos::LayoutLeft, src_execution_space, DstType::Rank, int>(dst, + src); + } + } +} + +template <class DstType, class SrcType, int Rank, class... 
Args> +struct CommonSubview; + +template <class DstType, class SrcType, class Arg0, class... Args> +struct CommonSubview<DstType, SrcType, 1, Arg0, Args...> { + using dst_subview_type = typename Kokkos::Subview<DstType, Arg0>; + using src_subview_type = typename Kokkos::Subview<SrcType, Arg0>; + dst_subview_type dst_sub; + src_subview_type src_sub; + CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, + Args...) + : dst_sub(dst, arg0), src_sub(src, arg0) {} +}; + +template <class DstType, class SrcType, class Arg0, class Arg1, class... Args> +struct CommonSubview<DstType, SrcType, 2, Arg0, Arg1, Args...> { + using dst_subview_type = typename Kokkos::Subview<DstType, Arg0, Arg1>; + using src_subview_type = typename Kokkos::Subview<SrcType, Arg0, Arg1>; + dst_subview_type dst_sub; + src_subview_type src_sub; + CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, + const Arg1& arg1, Args...) + : dst_sub(dst, arg0, arg1), src_sub(src, arg0, arg1) {} +}; + +template <class DstType, class SrcType, class Arg0, class Arg1, class Arg2, + class... Args> +struct CommonSubview<DstType, SrcType, 3, Arg0, Arg1, Arg2, Args...> { + using dst_subview_type = typename Kokkos::Subview<DstType, Arg0, Arg1, Arg2>; + using src_subview_type = typename Kokkos::Subview<SrcType, Arg0, Arg1, Arg2>; + dst_subview_type dst_sub; + src_subview_type src_sub; + CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, + const Arg1& arg1, const Arg2& arg2, Args...) + : dst_sub(dst, arg0, arg1, arg2), src_sub(src, arg0, arg1, arg2) {} +}; + +template <class DstType, class SrcType, class Arg0, class Arg1, class Arg2, + class Arg3, class... 
Args> +struct CommonSubview<DstType, SrcType, 4, Arg0, Arg1, Arg2, Arg3, Args...> { + using dst_subview_type = + typename Kokkos::Subview<DstType, Arg0, Arg1, Arg2, Arg3>; + using src_subview_type = + typename Kokkos::Subview<SrcType, Arg0, Arg1, Arg2, Arg3>; + dst_subview_type dst_sub; + src_subview_type src_sub; + CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, + const Arg1& arg1, const Arg2& arg2, const Arg3& arg3, + const Args...) + : dst_sub(dst, arg0, arg1, arg2, arg3), + src_sub(src, arg0, arg1, arg2, arg3) {} +}; + +template <class DstType, class SrcType, class Arg0, class Arg1, class Arg2, + class Arg3, class Arg4, class... Args> +struct CommonSubview<DstType, SrcType, 5, Arg0, Arg1, Arg2, Arg3, Arg4, + Args...> { + using dst_subview_type = + typename Kokkos::Subview<DstType, Arg0, Arg1, Arg2, Arg3, Arg4>; + using src_subview_type = + typename Kokkos::Subview<SrcType, Arg0, Arg1, Arg2, Arg3, Arg4>; + dst_subview_type dst_sub; + src_subview_type src_sub; + CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, + const Arg1& arg1, const Arg2& arg2, const Arg3& arg3, + const Arg4& arg4, const Args...) + : dst_sub(dst, arg0, arg1, arg2, arg3, arg4), + src_sub(src, arg0, arg1, arg2, arg3, arg4) {} +}; + +template <class DstType, class SrcType, class Arg0, class Arg1, class Arg2, + class Arg3, class Arg4, class Arg5, class... Args> +struct CommonSubview<DstType, SrcType, 6, Arg0, Arg1, Arg2, Arg3, Arg4, Arg5, + Args...> { + using dst_subview_type = + typename Kokkos::Subview<DstType, Arg0, Arg1, Arg2, Arg3, Arg4, Arg5>; + using src_subview_type = + typename Kokkos::Subview<SrcType, Arg0, Arg1, Arg2, Arg3, Arg4, Arg5>; + dst_subview_type dst_sub; + src_subview_type src_sub; + CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, + const Arg1& arg1, const Arg2& arg2, const Arg3& arg3, + const Arg4& arg4, const Arg5& arg5, const Args...) 
+ : dst_sub(dst, arg0, arg1, arg2, arg3, arg4, arg5), + src_sub(src, arg0, arg1, arg2, arg3, arg4, arg5) {} +}; + +template <class DstType, class SrcType, class Arg0, class Arg1, class Arg2, + class Arg3, class Arg4, class Arg5, class Arg6, class... Args> +struct CommonSubview<DstType, SrcType, 7, Arg0, Arg1, Arg2, Arg3, Arg4, Arg5, + Arg6, Args...> { + using dst_subview_type = typename Kokkos::Subview<DstType, Arg0, Arg1, Arg2, + Arg3, Arg4, Arg5, Arg6>; + using src_subview_type = typename Kokkos::Subview<SrcType, Arg0, Arg1, Arg2, + Arg3, Arg4, Arg5, Arg6>; + dst_subview_type dst_sub; + src_subview_type src_sub; + CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, + const Arg1& arg1, const Arg2& arg2, const Arg3& arg3, + const Arg4& arg4, const Arg5& arg5, const Arg6& arg6, Args...) + : dst_sub(dst, arg0, arg1, arg2, arg3, arg4, arg5, arg6), + src_sub(src, arg0, arg1, arg2, arg3, arg4, arg5, arg6) {} +}; + +template <class DstType, class SrcType, class Arg0, class Arg1, class Arg2, + class Arg3, class Arg4, class Arg5, class Arg6, class Arg7> +struct CommonSubview<DstType, SrcType, 8, Arg0, Arg1, Arg2, Arg3, Arg4, Arg5, + Arg6, Arg7> { + using dst_subview_type = + typename Kokkos::Subview<DstType, Arg0, Arg1, Arg2, Arg3, Arg4, Arg5, + Arg6, Arg7>; + using src_subview_type = + typename Kokkos::Subview<SrcType, Arg0, Arg1, Arg2, Arg3, Arg4, Arg5, + Arg6, Arg7>; + dst_subview_type dst_sub; + src_subview_type src_sub; + CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, + const Arg1& arg1, const Arg2& arg2, const Arg3& arg3, + const Arg4& arg4, const Arg5& arg5, const Arg6& arg6, + const Arg7& arg7) + : dst_sub(dst, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7), + src_sub(src, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7) {} +}; + +template <class DstType, class SrcType, + class ExecSpace = typename DstType::execution_space, + int Rank = DstType::Rank> +struct ViewRemap; + +template <class DstType, class SrcType, class 
ExecSpace> +struct ViewRemap<DstType, SrcType, ExecSpace, 1> { + using p_type = Kokkos::pair<int64_t, int64_t>; + + ViewRemap(const DstType& dst, const SrcType& src) { + if (dst.extent(0) == src.extent(0)) { + view_copy(dst, src); + } else { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + using sv_adapter_type = CommonSubview<DstType, SrcType, 1, p_type>; + sv_adapter_type common_subview(dst, src, ext0); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } +}; + +template <class DstType, class SrcType, class ExecSpace> +struct ViewRemap<DstType, SrcType, ExecSpace, 2> { + using p_type = Kokkos::pair<int64_t, int64_t>; + + ViewRemap(const DstType& dst, const SrcType& src) { + if (dst.extent(0) == src.extent(0)) { + if (dst.extent(1) == src.extent(1)) { + view_copy(dst, src); + } else { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 2, Kokkos::Impl::ALL_t, p_type>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } else { + if (dst.extent(1) == src.extent(1)) { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 2, p_type, Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, ext0, Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 2, p_type, p_type>; + sv_adapter_type common_subview(dst, src, ext0, ext1); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } + } +}; + +template <class DstType, class SrcType, class ExecSpace> +struct ViewRemap<DstType, SrcType, ExecSpace, 3> { + using p_type = Kokkos::pair<int64_t, int64_t>; + + ViewRemap(const DstType& dst, const SrcType& src) { + if 
(dst.extent(0) == src.extent(0)) { + if (dst.extent(2) == src.extent(2)) { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 3, Kokkos::Impl::ALL_t, p_type, + Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, + Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 3, Kokkos::Impl::ALL_t, p_type, + p_type>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } else { + if (dst.extent(2) == src.extent(2)) { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + using sv_adapter_type = CommonSubview<DstType, SrcType, 3, p_type, + p_type, Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, ext0, ext1, Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 3, p_type, p_type, p_type>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } + } +}; + +template <class DstType, class SrcType, class ExecSpace> +struct ViewRemap<DstType, SrcType, ExecSpace, 4> { + using p_type = Kokkos::pair<int64_t, int64_t>; + + ViewRemap(const DstType& dst, const SrcType& src) { + if (dst.extent(0) == src.extent(0)) { + if (dst.extent(3) == src.extent(3)) { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + using sv_adapter_type 
= + CommonSubview<DstType, SrcType, 4, Kokkos::Impl::ALL_t, p_type, + p_type, Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, + Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 4, Kokkos::Impl::ALL_t, p_type, + p_type, p_type>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } else { + if (dst.extent(7) == src.extent(7)) { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 4, p_type, p_type, p_type, + Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 4, p_type, p_type, p_type, p_type>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } + } +}; + +template <class DstType, class SrcType, class ExecSpace> +struct ViewRemap<DstType, SrcType, ExecSpace, 5> { + using p_type = Kokkos::pair<int64_t, int64_t>; + + ViewRemap(const DstType& dst, const SrcType& src) { + if (dst.extent(0) == src.extent(0)) { + if (dst.extent(4) == src.extent(4)) { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + 
p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 5, Kokkos::Impl::ALL_t, p_type, + p_type, p_type, Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, + Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 5, Kokkos::Impl::ALL_t, p_type, + p_type, p_type, p_type>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, + ext4); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } else { + if (dst.extent(4) == src.extent(4)) { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 5, p_type, p_type, p_type, p_type, + Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, + Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + using sv_adapter_type = CommonSubview<DstType, SrcType, 5, p_type, + p_type, p_type, p_type, p_type>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + 
} + } +}; +template <class DstType, class SrcType, class ExecSpace> +struct ViewRemap<DstType, SrcType, ExecSpace, 6> { + using p_type = Kokkos::pair<int64_t, int64_t>; + + ViewRemap(const DstType& dst, const SrcType& src) { + if (dst.extent(0) == src.extent(0)) { + if (dst.extent(5) == src.extent(5)) { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 6, Kokkos::Impl::ALL_t, p_type, + p_type, p_type, p_type, Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, + ext4, Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 6, Kokkos::Impl::ALL_t, p_type, + p_type, p_type, p_type, p_type>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, + ext4, ext5); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } else { + if (dst.extent(5) == src.extent(5)) { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + + using sv_adapter_type = + CommonSubview<DstType, SrcType, 6, p_type, p_type, p_type, p_type, + p_type, Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, + Kokkos::ALL); + 
view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + + using sv_adapter_type = + CommonSubview<DstType, SrcType, 6, p_type, p_type, p_type, p_type, + p_type, p_type>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, + ext5); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } + } +}; + +template <class DstType, class SrcType, class ExecSpace> +struct ViewRemap<DstType, SrcType, ExecSpace, 7> { + using p_type = Kokkos::pair<int64_t, int64_t>; + + ViewRemap(const DstType& dst, const SrcType& src) { + if (dst.extent(0) == src.extent(0)) { + if (dst.extent(6) == src.extent(6)) { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 7, Kokkos::Impl::ALL_t, p_type, + p_type, p_type, p_type, p_type, Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, + ext4, ext5, Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + p_type ext6(0, std::min(dst.extent(6), src.extent(6))); + using sv_adapter_type = + 
CommonSubview<DstType, SrcType, 7, Kokkos::Impl::ALL_t, p_type, + p_type, p_type, p_type, p_type, p_type>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, + ext4, ext5, ext6); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } else { + if (dst.extent(6) == src.extent(6)) { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 7, p_type, p_type, p_type, p_type, + p_type, p_type, Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, + ext5, Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + p_type ext6(0, std::min(dst.extent(6), src.extent(6))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 7, p_type, p_type, p_type, p_type, + p_type, p_type, p_type>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, + ext5, ext6); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } + } +}; + +template <class DstType, class SrcType, class ExecSpace> +struct ViewRemap<DstType, SrcType, ExecSpace, 8> { + using p_type = Kokkos::pair<int64_t, int64_t>; + + ViewRemap(const DstType& dst, const SrcType& src) { + if (dst.extent(0) == src.extent(0)) { + if (dst.extent(7) == src.extent(7)) { + p_type ext1(0, std::min(dst.extent(1), 
src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + p_type ext6(0, std::min(dst.extent(6), src.extent(6))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 8, Kokkos::Impl::ALL_t, p_type, + p_type, p_type, p_type, p_type, p_type, + Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, + ext4, ext5, ext6, Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + p_type ext6(0, std::min(dst.extent(6), src.extent(6))); + p_type ext7(0, std::min(dst.extent(7), src.extent(7))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 8, Kokkos::Impl::ALL_t, p_type, + p_type, p_type, p_type, p_type, p_type, p_type>; + sv_adapter_type common_subview(dst, src, Kokkos::ALL, ext1, ext2, ext3, + ext4, ext5, ext6, ext7); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } else { + if (dst.extent(7) == src.extent(7)) { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + p_type ext6(0, std::min(dst.extent(6), src.extent(6))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 8, p_type, p_type, p_type, p_type, + p_type, p_type, p_type, Kokkos::Impl::ALL_t>; + sv_adapter_type common_subview(dst, 
src, ext0, ext1, ext2, ext3, ext4, + ext5, ext6, Kokkos::ALL); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } else { + p_type ext0(0, std::min(dst.extent(0), src.extent(0))); + p_type ext1(0, std::min(dst.extent(1), src.extent(1))); + p_type ext2(0, std::min(dst.extent(2), src.extent(2))); + p_type ext3(0, std::min(dst.extent(3), src.extent(3))); + p_type ext4(0, std::min(dst.extent(4), src.extent(4))); + p_type ext5(0, std::min(dst.extent(5), src.extent(5))); + p_type ext6(0, std::min(dst.extent(6), src.extent(6))); + p_type ext7(0, std::min(dst.extent(7), src.extent(7))); + using sv_adapter_type = + CommonSubview<DstType, SrcType, 8, p_type, p_type, p_type, p_type, + p_type, p_type, p_type, p_type>; + sv_adapter_type common_subview(dst, src, ext0, ext1, ext2, ext3, ext4, + ext5, ext6, ext7); + view_copy(common_subview.dst_sub, common_subview.src_sub); + } + } + } +}; + +} // namespace Impl + +/** \brief Deep copy a value from Host memory into a view. */ +template <class DT, class... 
DP> +inline void deep_copy( + const View<DT, DP...>& dst, + typename ViewTraits<DT, DP...>::const_value_type& value, + typename std::enable_if<std::is_same< + typename ViewTraits<DT, DP...>::specialize, void>::value>::type* = + nullptr) { + using ViewType = View<DT, DP...>; + using exec_space_type = typename ViewType::execution_space; + + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { + Kokkos::Profiling::beginDeepCopy( + Kokkos::Profiling::make_space_handle(ViewType::memory_space::name()), + dst.label(), dst.data(), + Kokkos::Profiling::make_space_handle(Kokkos::HostSpace::name()), + "Scalar", &value, dst.span() * sizeof(typename ViewType::value_type)); + } + + if (dst.data() == nullptr) { + Kokkos::fence(); + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { + Kokkos::Profiling::endDeepCopy(); + } + return; + } + + Kokkos::fence(); + static_assert(std::is_same<typename ViewType::non_const_value_type, + typename ViewType::value_type>::value, + "deep_copy requires non-const type"); + + // If contiguous we can simply do a 1D flat loop + if (dst.span_is_contiguous()) { + using ViewTypeFlat = Kokkos::View< + typename ViewType::value_type*, Kokkos::LayoutRight, + Kokkos::Device<typename ViewType::execution_space, + typename std::conditional< + ViewType::Rank == 0, typename ViewType::memory_space, + Kokkos::AnonymousSpace>::type>, + Kokkos::MemoryTraits<0>>; + + ViewTypeFlat dst_flat(dst.data(), dst.size()); + if (dst.span() < static_cast<size_t>(std::numeric_limits<int>::max())) { + Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, exec_space_type, + ViewTypeFlat::Rank, int>(dst_flat, value, + exec_space_type()); + } else + Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, exec_space_type, + ViewTypeFlat::Rank, int64_t>(dst_flat, value, + exec_space_type()); + Kokkos::fence(); + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { + Kokkos::Profiling::endDeepCopy(); + } + 
return; + } + + // Figure out iteration order to do the ViewFill + int64_t strides[ViewType::Rank + 1]; + dst.stride(strides); + Kokkos::Iterate iterate; + if (std::is_same<typename ViewType::array_layout, + Kokkos::LayoutRight>::value) { + iterate = Kokkos::Iterate::Right; + } else if (std::is_same<typename ViewType::array_layout, + Kokkos::LayoutLeft>::value) { + iterate = Kokkos::Iterate::Left; + } else if (std::is_same<typename ViewType::array_layout, + Kokkos::LayoutStride>::value) { + if (strides[0] > strides[ViewType::Rank > 0 ? ViewType::Rank - 1 : 0]) + iterate = Kokkos::Iterate::Right; + else + iterate = Kokkos::Iterate::Left; + } else { + if (std::is_same<typename ViewType::execution_space::array_layout, + Kokkos::LayoutRight>::value) + iterate = Kokkos::Iterate::Right; + else + iterate = Kokkos::Iterate::Left; + } + + // Lets call the right ViewFill functor based on integer space needed and + // iteration type + using ViewTypeUniform = typename std::conditional< + ViewType::Rank == 0, typename ViewType::uniform_runtime_type, + typename ViewType::uniform_runtime_nomemspace_type>::type; + if (dst.span() > static_cast<size_t>(std::numeric_limits<int>::max())) { + if (iterate == Kokkos::Iterate::Right) + Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutRight, + exec_space_type, ViewType::Rank, int64_t>( + dst, value, exec_space_type()); + else + Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutLeft, + exec_space_type, ViewType::Rank, int64_t>( + dst, value, exec_space_type()); + } else { + if (iterate == Kokkos::Iterate::Right) + Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutRight, + exec_space_type, ViewType::Rank, int>( + dst, value, exec_space_type()); + else + Kokkos::Impl::ViewFill<ViewTypeUniform, Kokkos::LayoutLeft, + exec_space_type, ViewType::Rank, int>( + dst, value, exec_space_type()); + } + Kokkos::fence(); + + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { + Kokkos::Profiling::endDeepCopy(); + } 
+} + +/** \brief Deep copy into a value in Host memory from a view. */ +template <class ST, class... SP> +inline void deep_copy( + typename ViewTraits<ST, SP...>::non_const_value_type& dst, + const View<ST, SP...>& src, + typename std::enable_if<std::is_same< + typename ViewTraits<ST, SP...>::specialize, void>::value>::type* = + nullptr) { + using src_traits = ViewTraits<ST, SP...>; + using src_memory_space = typename src_traits::memory_space; + + static_assert(src_traits::rank == 0, + "ERROR: Non-rank-zero view in deep_copy( value , View )"); + + if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { + Kokkos::Profiling::beginDeepCopy( + Kokkos::Profiling::make_space_handle(Kokkos::HostSpace::name()), + "Scalar", &dst, + Kokkos::Profiling::make_space_handle(src_memory_space::name()), + src.label(), src.data(), + src.span() * sizeof(typename src_traits::value_type)); + } + + if (src.data() == nullptr) { + Kokkos::fence(); + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { + Kokkos::Profiling::endDeepCopy(); + } + return; + } + + Kokkos::Impl::DeepCopy<HostSpace, src_memory_space>(&dst, src.data(), + sizeof(ST)); + if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) { + Kokkos::Profiling::endDeepCopy(); + } +} + +//---------------------------------------------------------------------------- +/** \brief A deep copy between views of compatible type, and rank zero. */ +template <class DT, class... DP, class ST, class... 
SP>
// Rank-0 to rank-0 deep copy (both views of the default specialization).
// Requires the destination value type to be the non-const version of the
// source's value type.  Copies exactly one element via Impl::DeepCopy.
inline void deep_copy(
    const View<DT, DP...>& dst, const View<ST, SP...>& src,
    typename std::enable_if<(
        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
        std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
        (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) &&
         unsigned(ViewTraits<ST, SP...>::rank) == unsigned(0)))>::type* =
        nullptr) {
  using dst_type = View<DT, DP...>;
  using src_type = View<ST, SP...>;

  using value_type       = typename dst_type::value_type;
  using dst_memory_space = typename dst_type::memory_space;
  using src_memory_space = typename src_type::memory_space;

  static_assert(std::is_same<typename dst_type::value_type,
                             typename src_type::non_const_value_type>::value,
                "deep_copy requires matching non-const destination type");

  if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) {
    Kokkos::Profiling::beginDeepCopy(
        Kokkos::Profiling::make_space_handle(dst_memory_space::name()),
        dst.label(), dst.data(),
        Kokkos::Profiling::make_space_handle(src_memory_space::name()),
        src.label(), src.data(),
        src.span() * sizeof(typename dst_type::value_type));
  }

  // Both unallocated: treated as a no-op (fence + close profiling region).
  if (dst.data() == nullptr && src.data() == nullptr) {
    Kokkos::fence();
    if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
      Kokkos::Profiling::endDeepCopy();
    }
    return;
  }

  Kokkos::fence();
  // Self-copy (identical data pointer) is skipped.
  if (dst.data() != src.data()) {
    Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
        dst.data(), src.data(), sizeof(value_type));
    Kokkos::fence();
  }
  if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
    Kokkos::Profiling::endDeepCopy();
  }
}

//----------------------------------------------------------------------------
/** \brief A deep copy between views of the default specialization, compatible
 * type, same non-zero rank, same contiguous layout.
 */
template <class DT, class... DP, class ST, class...
SP>
// General deep copy between two default-specialization views of equal
// non-zero rank.  Validates extents, rejects partially-overlapping
// contiguous views, then either does a single byte-wise Impl::DeepCopy
// (when layouts/strides make both spans bit-compatible) or falls back to
// the element-wise Impl::view_copy kernel.
inline void deep_copy(
    const View<DT, DP...>& dst, const View<ST, SP...>& src,
    typename std::enable_if<(
        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
        std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
        (unsigned(ViewTraits<DT, DP...>::rank) != 0 ||
         unsigned(ViewTraits<ST, SP...>::rank) != 0))>::type* = nullptr) {
  using dst_type            = View<DT, DP...>;
  using src_type            = View<ST, SP...>;
  using dst_execution_space = typename dst_type::execution_space;
  using src_execution_space = typename src_type::execution_space;
  using dst_memory_space    = typename dst_type::memory_space;
  using src_memory_space    = typename src_type::memory_space;
  using dst_value_type      = typename dst_type::value_type;
  using src_value_type      = typename src_type::value_type;

  static_assert(std::is_same<typename dst_type::value_type,
                             typename dst_type::non_const_value_type>::value,
                "deep_copy requires non-const destination type");

  static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)),
                "deep_copy requires Views of equal rank");

  if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) {
    Kokkos::Profiling::beginDeepCopy(
        Kokkos::Profiling::make_space_handle(dst_memory_space::name()),
        dst.label(), dst.data(),
        Kokkos::Profiling::make_space_handle(src_memory_space::name()),
        src.label(), src.data(),
        src.span() * sizeof(typename dst_type::value_type));
  }

  // Unallocated view(s): still an error if the extents disagree, otherwise a
  // fenced no-op with the profiling region closed.
  if (dst.data() == nullptr || src.data() == nullptr) {
    // throw if dimension mismatch
    if ((src.extent(0) != dst.extent(0)) || (src.extent(1) != dst.extent(1)) ||
        (src.extent(2) != dst.extent(2)) || (src.extent(3) != dst.extent(3)) ||
        (src.extent(4) != dst.extent(4)) || (src.extent(5) != dst.extent(5)) ||
        (src.extent(6) != dst.extent(6)) || (src.extent(7) != dst.extent(7))) {
      std::string message(
          "Deprecation Error: Kokkos::deep_copy extents of views don't "
          "match: ");
      message += dst.label();
      message += "(";
      for (int r = 0; r < dst_type::Rank - 1; r++) {
        message += std::to_string(dst.extent(r));
        message += ",";
      }
      message += std::to_string(dst.extent(dst_type::Rank - 1));
      message += ") ";
      message += src.label();
      message += "(";
      for (int r = 0; r < src_type::Rank - 1; r++) {
        message += std::to_string(src.extent(r));
        message += ",";
      }
      message += std::to_string(src.extent(src_type::Rank - 1));
      message += ") ";

      Kokkos::Impl::throw_runtime_exception(message);
    }
    Kokkos::fence();
    if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
      Kokkos::Profiling::endDeepCopy();
    }
    return;
  }

  enum {
    DstExecCanAccessSrc =
        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
                                         src_memory_space>::accessible
  };

  enum {
    SrcExecCanAccessDst =
        Kokkos::Impl::SpaceAccessibility<src_execution_space,
                                         dst_memory_space>::accessible
  };

  // Checking for Overlapping Views.
  dst_value_type* dst_start = dst.data();
  dst_value_type* dst_end   = dst.data() + dst.span();
  src_value_type* src_start = src.data();
  src_value_type* src_end   = src.data() + src.span();
  // Exact aliasing of two contiguous spans: copy is a no-op.
  if (((std::ptrdiff_t)dst_start == (std::ptrdiff_t)src_start) &&
      ((std::ptrdiff_t)dst_end == (std::ptrdiff_t)src_end) &&
      (dst.span_is_contiguous() && src.span_is_contiguous())) {
    Kokkos::fence();
    if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
      Kokkos::Profiling::endDeepCopy();
    }
    return;
  }

  // Partial overlap of contiguous spans is a hard error.
  if ((((std::ptrdiff_t)dst_start < (std::ptrdiff_t)src_end) &&
       ((std::ptrdiff_t)dst_end > (std::ptrdiff_t)src_start)) &&
      ((dst.span_is_contiguous() && src.span_is_contiguous()))) {
    std::string message("Error: Kokkos::deep_copy of overlapping views: ");
    message += dst.label();
    message += "(";
    message += std::to_string((std::ptrdiff_t)dst_start);
    message += ",";
    message += std::to_string((std::ptrdiff_t)dst_end);
    message += ") ";
    message += src.label();
    message += "(";
    message += std::to_string((std::ptrdiff_t)src_start);
    message += ",";
    message += std::to_string((std::ptrdiff_t)src_end);
    message += ") ";
    Kokkos::Impl::throw_runtime_exception(message);
  }

  // Check for same extents
  if ((src.extent(0) != dst.extent(0)) || (src.extent(1) != dst.extent(1)) ||
      (src.extent(2) != dst.extent(2)) || (src.extent(3) != dst.extent(3)) ||
      (src.extent(4) != dst.extent(4)) || (src.extent(5) != dst.extent(5)) ||
      (src.extent(6) != dst.extent(6)) || (src.extent(7) != dst.extent(7))) {
    std::string message(
        "Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
    message += dst.label();
    message += "(";
    for (int r = 0; r < dst_type::Rank - 1; r++) {
      message += std::to_string(dst.extent(r));
      message += ",";
    }
    message += std::to_string(dst.extent(dst_type::Rank - 1));
    message += ") ";
    message += src.label();
    message += "(";
    for (int r = 0; r < src_type::Rank - 1; r++) {
      message += std::to_string(src.extent(r));
      message += ",";
    }
    message += std::to_string(src.extent(src_type::Rank - 1));
    message += ") ";

    Kokkos::Impl::throw_runtime_exception(message);
  }

  // If same type, equal layout, equal dimensions, equal span, and contiguous
  // memory then can byte-wise copy

  if (std::is_same<typename dst_type::value_type,
                   typename src_type::non_const_value_type>::value &&
      (std::is_same<typename dst_type::array_layout,
                    typename src_type::array_layout>::value ||
       (dst_type::rank == 1 && src_type::rank == 1)) &&
      dst.span_is_contiguous() && src.span_is_contiguous() &&
      ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) &&
      ((dst_type::rank < 2) || (dst.stride_1() == src.stride_1())) &&
      ((dst_type::rank < 3) || (dst.stride_2() == src.stride_2())) &&
      ((dst_type::rank < 4) || (dst.stride_3() == src.stride_3())) &&
      ((dst_type::rank < 5) || (dst.stride_4() == src.stride_4())) &&
      ((dst_type::rank < 6) || (dst.stride_5() == src.stride_5())) &&
      ((dst_type::rank < 7) || (dst.stride_6() == src.stride_6())) &&
      ((dst_type::rank < 8) || (dst.stride_7() == src.stride_7()))) {
    const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
    Kokkos::fence();
    if ((void*)dst.data() != (void*)src.data()) {
      Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
          dst.data(), src.data(), nbytes);
      Kokkos::fence();
    }
  } else {
    // Layout/stride mismatch: element-wise parallel copy.
    Kokkos::fence();
    Impl::view_copy(dst, src);
    Kokkos::fence();
  }
  if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
    Kokkos::Profiling::endDeepCopy();
  }
}

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Experimental {
/** \brief A local deep copy between views of the default specialization,
 * compatible type, same non-zero rank.
 */
// Team-collective flat copy over the raw spans of two contiguous views.
// NOTE(review): the lambda index is int, so spans > INT_MAX would overflow —
// matches the convention used throughout this file; confirm upstream intent.
template <class TeamType, class DT, class... DP, class ST, class... SP>
void KOKKOS_INLINE_FUNCTION
local_deep_copy_contiguous(const TeamType& team, const View<DT, DP...>& dst,
                           const View<ST, SP...>& src) {
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, src.span()),
                       [&](const int& i) { dst.data()[i] = src.data()[i]; });
}
//----------------------------------------------------------------------------
// Serial (single caller) flat copy over the raw spans of two contiguous views.
template <class DT, class... DP, class ST, class... SP>
void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
    const View<DT, DP...>& dst, const View<ST, SP...>& src) {
  for (size_t i = 0; i < src.span(); ++i) {
    dst.data()[i] = src.data()[i];
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class... DP, class ST, class...
SP>
// Team-collective copy of a rank-1 view; barriers before and after keep the
// whole team in step.  No-op if dst is unallocated.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 1 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 1)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0);

  team.team_barrier();
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N),
                       [&](const int& i) { dst(i) = src(i); });
  team.team_barrier();
}
//----------------------------------------------------------------------------
// Team-collective copy of a rank-2 view.  Contiguous spans take the flat
// path; otherwise the linear index is decomposed with extent(0) fastest.
template <class TeamType, class DT, class... DP, class ST, class... SP>
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 2 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 2)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1);

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, src);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0      = i % dst.extent(0);
      int i1      = i / dst.extent(0);
      dst(i0, i1) = src(i0, i1);
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class... DP, class ST, class...
SP>
// Team-collective copy of a rank-3 view; non-contiguous path decomposes the
// linear index with extent(0) fastest, then extent(1).
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 3 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 3)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1) * dst.extent(2);

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, src);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0          = i % dst.extent(0);
      int itmp        = i / dst.extent(0);
      int i1          = itmp % dst.extent(1);
      int i2          = itmp / dst.extent(1);
      dst(i0, i1, i2) = src(i0, i1, i2);
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class... DP, class ST, class...
SP>
// Team-collective copy of a rank-4 view (same index-decomposition scheme as
// the lower-rank overloads, extended one dimension).
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 4 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 4)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N =
      dst.extent(0) * dst.extent(1) * dst.extent(2) * dst.extent(3);

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, src);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0              = i % dst.extent(0);
      int itmp            = i / dst.extent(0);
      int i1              = itmp % dst.extent(1);
      itmp                = itmp / dst.extent(1);
      int i2              = itmp % dst.extent(2);
      int i3              = itmp / dst.extent(2);
      dst(i0, i1, i2, i3) = src(i0, i1, i2, i3);
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class... DP, class ST, class...
SP>
// Team-collective copy of a rank-5 view.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 5 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 5)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1) * dst.extent(2) *
                   dst.extent(3) * dst.extent(4);

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, src);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0                  = i % dst.extent(0);
      int itmp                = i / dst.extent(0);
      int i1                  = itmp % dst.extent(1);
      itmp                    = itmp / dst.extent(1);
      int i2                  = itmp % dst.extent(2);
      itmp                    = itmp / dst.extent(2);
      int i3                  = itmp % dst.extent(3);
      int i4                  = itmp / dst.extent(3);
      dst(i0, i1, i2, i3, i4) = src(i0, i1, i2, i3, i4);
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class... DP, class ST, class...
SP>
// Team-collective copy of a rank-6 view.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 6 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 6)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1) * dst.extent(2) *
                   dst.extent(3) * dst.extent(4) * dst.extent(5);

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, src);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0                      = i % dst.extent(0);
      int itmp                    = i / dst.extent(0);
      int i1                      = itmp % dst.extent(1);
      itmp                        = itmp / dst.extent(1);
      int i2                      = itmp % dst.extent(2);
      itmp                        = itmp / dst.extent(2);
      int i3                      = itmp % dst.extent(3);
      itmp                        = itmp / dst.extent(3);
      int i4                      = itmp % dst.extent(4);
      int i5                      = itmp / dst.extent(4);
      dst(i0, i1, i2, i3, i4, i5) = src(i0, i1, i2, i3, i4, i5);
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class... DP, class ST, class...
SP>
// Team-collective copy of a rank-7 view (highest rank Kokkos supports).
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 7 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 7)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1) * dst.extent(2) *
                   dst.extent(3) * dst.extent(4) * dst.extent(5) *
                   dst.extent(6);

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, src);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0                          = i % dst.extent(0);
      int itmp                        = i / dst.extent(0);
      int i1                          = itmp % dst.extent(1);
      itmp                            = itmp / dst.extent(1);
      int i2                          = itmp % dst.extent(2);
      itmp                            = itmp / dst.extent(2);
      int i3                          = itmp % dst.extent(3);
      itmp                            = itmp / dst.extent(3);
      int i4                          = itmp % dst.extent(4);
      itmp                            = itmp / dst.extent(4);
      int i5                          = itmp % dst.extent(5);
      int i6                          = itmp / dst.extent(5);
      dst(i0, i1, i2, i3, i4, i5, i6) = src(i0, i1, i2, i3, i4, i5, i6);
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
// Serial (single-thread) copy of a rank-1 view.
template <class DT, class... DP, class ST, class... SP>
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const View<DT, DP...>& dst, const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 1 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 1)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0);

  for (size_t i = 0; i < N; ++i) {
    dst(i) = src(i);
  }
}
//----------------------------------------------------------------------------
template <class DT, class... DP, class ST, class...
SP> +void KOKKOS_INLINE_FUNCTION local_deep_copy( + const View<DT, DP...>& dst, const View<ST, SP...>& src, + typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 2 && + unsigned(ViewTraits<ST, SP...>::rank) == + 2)>::type* = nullptr) { + if (dst.data() == nullptr) { + return; + } + + if (dst.span_is_contiguous() && src.span_is_contiguous()) { + local_deep_copy_contiguous(dst, src); + } else { + for (size_t i0 = 0; i0 < dst.extent(0); ++i0) + for (size_t i1 = 0; i1 < dst.extent(1); ++i1) dst(i0, i1) = src(i0, i1); + } +} +//---------------------------------------------------------------------------- +template <class DT, class... DP, class ST, class... SP> +void KOKKOS_INLINE_FUNCTION local_deep_copy( + const View<DT, DP...>& dst, const View<ST, SP...>& src, + typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 3 && + unsigned(ViewTraits<ST, SP...>::rank) == + 3)>::type* = nullptr) { + if (dst.data() == nullptr) { + return; + } + + if (dst.span_is_contiguous() && src.span_is_contiguous()) { + local_deep_copy_contiguous(dst, src); + } else { + for (size_t i0 = 0; i0 < dst.extent(0); ++i0) + for (size_t i1 = 0; i1 < dst.extent(1); ++i1) + for (size_t i2 = 0; i2 < dst.extent(2); ++i2) + dst(i0, i1, i2) = src(i0, i1, i2); + } +} +//---------------------------------------------------------------------------- +template <class DT, class... DP, class ST, class... 
SP>
// Serial copy of a rank-4 view; contiguous spans use the flat helper.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const View<DT, DP...>& dst, const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 4 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 4)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    local_deep_copy_contiguous(dst, src);
  } else {
    for (size_t i0 = 0; i0 < dst.extent(0); ++i0)
      for (size_t i1 = 0; i1 < dst.extent(1); ++i1)
        for (size_t i2 = 0; i2 < dst.extent(2); ++i2)
          for (size_t i3 = 0; i3 < dst.extent(3); ++i3)
            dst(i0, i1, i2, i3) = src(i0, i1, i2, i3);
  }
}
//----------------------------------------------------------------------------
// Serial copy of a rank-5 view.
template <class DT, class... DP, class ST, class... SP>
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const View<DT, DP...>& dst, const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 5 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 5)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    local_deep_copy_contiguous(dst, src);
  } else {
    for (size_t i0 = 0; i0 < dst.extent(0); ++i0)
      for (size_t i1 = 0; i1 < dst.extent(1); ++i1)
        for (size_t i2 = 0; i2 < dst.extent(2); ++i2)
          for (size_t i3 = 0; i3 < dst.extent(3); ++i3)
            for (size_t i4 = 0; i4 < dst.extent(4); ++i4)
              dst(i0, i1, i2, i3, i4) = src(i0, i1, i2, i3, i4);
  }
}
//----------------------------------------------------------------------------
template <class DT, class... DP, class ST, class...
SP>
// Serial copy of a rank-6 view.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const View<DT, DP...>& dst, const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 6 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 6)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    local_deep_copy_contiguous(dst, src);
  } else {
    for (size_t i0 = 0; i0 < dst.extent(0); ++i0)
      for (size_t i1 = 0; i1 < dst.extent(1); ++i1)
        for (size_t i2 = 0; i2 < dst.extent(2); ++i2)
          for (size_t i3 = 0; i3 < dst.extent(3); ++i3)
            for (size_t i4 = 0; i4 < dst.extent(4); ++i4)
              for (size_t i5 = 0; i5 < dst.extent(5); ++i5)
                dst(i0, i1, i2, i3, i4, i5) = src(i0, i1, i2, i3, i4, i5);
  }
}
//----------------------------------------------------------------------------
// Serial copy of a rank-7 view (highest rank supported).
template <class DT, class... DP, class ST, class... SP>
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const View<DT, DP...>& dst, const View<ST, SP...>& src,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == 7 &&
                             unsigned(ViewTraits<ST, SP...>::rank) ==
                                 7)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  if (dst.span_is_contiguous() && src.span_is_contiguous()) {
    local_deep_copy_contiguous(dst, src);
  } else {
    for (size_t i0 = 0; i0 < dst.extent(0); ++i0)
      for (size_t i1 = 0; i1 < dst.extent(1); ++i1)
        for (size_t i2 = 0; i2 < dst.extent(2); ++i2)
          for (size_t i3 = 0; i3 < dst.extent(3); ++i3)
            for (size_t i4 = 0; i4 < dst.extent(4); ++i4)
              for (size_t i5 = 0; i5 < dst.extent(5); ++i5)
                for (size_t i6 = 0; i6 < dst.extent(6); ++i6)
                  dst(i0, i1, i2, i3, i4, i5, i6) =
                      src(i0, i1, i2, i3, i4, i5, i6);
  }
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief Deep copy a value into a view. */
template <class TeamType, class DT, class...
DP>
// Team-collective fill of a contiguous view's raw span with a single value.
void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
    const TeamType& team, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value) {
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, dst.span()),
                       [&](const int& i) { dst.data()[i] = value; });
}
//----------------------------------------------------------------------------
// Serial fill of a contiguous view's raw span with a single value.
template <class DT, class... DP>
void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
    const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value) {
  for (size_t i = 0; i < dst.span(); ++i) {
    dst.data()[i] = value;
  }
}
//----------------------------------------------------------------------------
// Team-collective fill of a rank-1 view; barriers before/after keep the team
// in step.  No-op if dst is unallocated.
template <class TeamType, class DT, class... DP>
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             1)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0);

  team.team_barrier();
  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N),
                       [&](const int& i) { dst(i) = value; });
  team.team_barrier();
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class...
DP>
// Team-collective fill of a rank-2 view; the linear index is decomposed with
// extent(0) fastest, matching the copy overloads above.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             2)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1);

  if (dst.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, value);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0      = i % dst.extent(0);
      int i1      = i / dst.extent(0);
      dst(i0, i1) = value;
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
// Team-collective fill of a rank-3 view.
template <class TeamType, class DT, class... DP>
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             3)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1) * dst.extent(2);

  if (dst.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, value);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0          = i % dst.extent(0);
      int itmp        = i / dst.extent(0);
      int i1          = itmp % dst.extent(1);
      int i2          = itmp / dst.extent(1);
      dst(i0, i1, i2) = value;
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class...
DP>
// Team-collective fill of a rank-4 view.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             4)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N =
      dst.extent(0) * dst.extent(1) * dst.extent(2) * dst.extent(3);

  if (dst.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, value);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0              = i % dst.extent(0);
      int itmp            = i / dst.extent(0);
      int i1              = itmp % dst.extent(1);
      itmp                = itmp / dst.extent(1);
      int i2              = itmp % dst.extent(2);
      int i3              = itmp / dst.extent(2);
      dst(i0, i1, i2, i3) = value;
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class...
DP>
// Team-collective fill of a rank-5 view.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             5)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1) * dst.extent(2) *
                   dst.extent(3) * dst.extent(4);

  if (dst.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, value);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0                  = i % dst.extent(0);
      int itmp                = i / dst.extent(0);
      int i1                  = itmp % dst.extent(1);
      itmp                    = itmp / dst.extent(1);
      int i2                  = itmp % dst.extent(2);
      itmp                    = itmp / dst.extent(2);
      int i3                  = itmp % dst.extent(3);
      int i4                  = itmp / dst.extent(3);
      dst(i0, i1, i2, i3, i4) = value;
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class...
DP>
// Team-collective fill of a rank-6 view.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             6)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1) * dst.extent(2) *
                   dst.extent(3) * dst.extent(4) * dst.extent(5);

  if (dst.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, value);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0                      = i % dst.extent(0);
      int itmp                    = i / dst.extent(0);
      int i1                      = itmp % dst.extent(1);
      itmp                        = itmp / dst.extent(1);
      int i2                      = itmp % dst.extent(2);
      itmp                        = itmp / dst.extent(2);
      int i3                      = itmp % dst.extent(3);
      itmp                        = itmp / dst.extent(3);
      int i4                      = itmp % dst.extent(4);
      int i5                      = itmp / dst.extent(4);
      dst(i0, i1, i2, i3, i4, i5) = value;
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
template <class TeamType, class DT, class...
DP>
// Team-collective fill of a rank-7 view (highest rank supported).
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const TeamType& team, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             7)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0) * dst.extent(1) * dst.extent(2) *
                   dst.extent(3) * dst.extent(4) * dst.extent(5) *
                   dst.extent(6);

  if (dst.span_is_contiguous()) {
    team.team_barrier();
    local_deep_copy_contiguous(team, dst, value);
    team.team_barrier();
  } else {
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) {
      int i0                          = i % dst.extent(0);
      int itmp                        = i / dst.extent(0);
      int i1                          = itmp % dst.extent(1);
      itmp                            = itmp / dst.extent(1);
      int i2                          = itmp % dst.extent(2);
      itmp                            = itmp / dst.extent(2);
      int i3                          = itmp % dst.extent(3);
      itmp                            = itmp / dst.extent(3);
      int i4                          = itmp % dst.extent(4);
      itmp                            = itmp / dst.extent(4);
      int i5                          = itmp % dst.extent(5);
      int i6                          = itmp / dst.extent(5);
      dst(i0, i1, i2, i3, i4, i5, i6) = value;
    });
    team.team_barrier();
  }
}
//----------------------------------------------------------------------------
// Serial fill of a rank-1 view with a single value.
template <class DT, class... DP>
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             1)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  const size_t N = dst.extent(0);

  for (size_t i = 0; i < N; ++i) {
    dst(i) = value;
  }
}
//----------------------------------------------------------------------------
template <class DT, class...
DP>
// Serial fill of a rank-2 view; contiguous spans use the flat helper.
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             2)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  if (dst.span_is_contiguous()) {
    local_deep_copy_contiguous(dst, value);
  } else {
    for (size_t i0 = 0; i0 < dst.extent(0); ++i0)
      for (size_t i1 = 0; i1 < dst.extent(1); ++i1) dst(i0, i1) = value;
  }
}
//----------------------------------------------------------------------------
// Serial fill of a rank-3 view.
template <class DT, class... DP>
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             3)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  if (dst.span_is_contiguous()) {
    local_deep_copy_contiguous(dst, value);
  } else {
    for (size_t i0 = 0; i0 < dst.extent(0); ++i0)
      for (size_t i1 = 0; i1 < dst.extent(1); ++i1)
        for (size_t i2 = 0; i2 < dst.extent(2); ++i2) dst(i0, i1, i2) = value;
  }
}
//----------------------------------------------------------------------------
// Serial fill of a rank-4 view.
template <class DT, class... DP>
void KOKKOS_INLINE_FUNCTION local_deep_copy(
    const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) ==
                             4)>::type* = nullptr) {
  if (dst.data() == nullptr) {
    return;
  }

  if (dst.span_is_contiguous()) {
    local_deep_copy_contiguous(dst, value);
  } else {
    for (size_t i0 = 0; i0 < dst.extent(0); ++i0)
      for (size_t i1 = 0; i1 < dst.extent(1); ++i1)
        for (size_t i2 = 0; i2 < dst.extent(2); ++i2)
          for (size_t i3 = 0; i3 < dst.extent(3); ++i3)
            dst(i0, i1, i2, i3) = value;
  }
}
//----------------------------------------------------------------------------
template <class DT, class...
DP> +void KOKKOS_INLINE_FUNCTION local_deep_copy( + const View<DT, DP...>& dst, + typename ViewTraits<DT, DP...>::const_value_type& value, + typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == + 5)>::type* = nullptr) { + if (dst.data() == nullptr) { + return; + } + + if (dst.span_is_contiguous()) { + local_deep_copy_contiguous(dst, value); + } else { + for (size_t i0 = 0; i0 < dst.extent(0); ++i0) + for (size_t i1 = 0; i1 < dst.extent(1); ++i1) + for (size_t i2 = 0; i2 < dst.extent(2); ++i2) + for (size_t i3 = 0; i3 < dst.extent(3); ++i3) + for (size_t i4 = 0; i4 < dst.extent(4); ++i4) + dst(i0, i1, i2, i3, i4) = value; + } +} +//---------------------------------------------------------------------------- +template <class DT, class... DP> +void KOKKOS_INLINE_FUNCTION local_deep_copy( + const View<DT, DP...>& dst, + typename ViewTraits<DT, DP...>::const_value_type& value, + typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == + 6)>::type* = nullptr) { + if (dst.data() == nullptr) { + return; + } + + if (dst.span_is_contiguous()) { + local_deep_copy_contiguous(dst, value); + } else { + for (size_t i0 = 0; i0 < dst.extent(0); ++i0) + for (size_t i1 = 0; i1 < dst.extent(1); ++i1) + for (size_t i2 = 0; i2 < dst.extent(2); ++i2) + for (size_t i3 = 0; i3 < dst.extent(3); ++i3) + for (size_t i4 = 0; i4 < dst.extent(4); ++i4) + for (size_t i5 = 0; i5 < dst.extent(5); ++i5) + dst(i0, i1, i2, i3, i4, i5) = value; + } +} +//---------------------------------------------------------------------------- +template <class DT, class... 
DP> +void KOKKOS_INLINE_FUNCTION local_deep_copy( + const View<DT, DP...>& dst, + typename ViewTraits<DT, DP...>::const_value_type& value, + typename std::enable_if<(unsigned(ViewTraits<DT, DP...>::rank) == + 7)>::type* = nullptr) { + if (dst.data() == nullptr) { + return; + } + + if (dst.span_is_contiguous()) { + local_deep_copy_contiguous(dst, value); + } else { + for (size_t i0 = 0; i0 < dst.extent(0); ++i0) + for (size_t i1 = 0; i1 < dst.extent(1); ++i1) + for (size_t i2 = 0; i2 < dst.extent(2); ++i2) + for (size_t i3 = 0; i3 < dst.extent(3); ++i3) + for (size_t i4 = 0; i4 < dst.extent(4); ++i4) + for (size_t i5 = 0; i5 < dst.extent(5); ++i5) + for (size_t i6 = 0; i6 < dst.extent(6); ++i6) + dst(i0, i1, i2, i3, i4, i5, i6) = value; + } +} +} /* namespace Experimental */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Deep copy a value from Host memory into a view. ExecSpace can access + * dst */ +template <class ExecSpace, class DT, class... 
DP>
inline void deep_copy(
    const ExecSpace& space, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<
        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
        Kokkos::Impl::SpaceAccessibility<
            ExecSpace,
            typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
        nullptr) {
  using dst_traits = ViewTraits<DT, DP...>;
  static_assert(std::is_same<typename dst_traits::non_const_value_type,
                             typename dst_traits::value_type>::value,
                "deep_copy requires non-const type");
  using dst_memory_space = typename dst_traits::memory_space;
  // Notify profiling tools that a deep_copy is beginning.
  if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) {
    Kokkos::Profiling::beginDeepCopy(
        Kokkos::Profiling::make_space_handle(dst_memory_space::name()),
        dst.label(), dst.data(),
        Kokkos::Profiling::make_space_handle(Kokkos::HostSpace::name()),
        "(none)", &value, dst.span() * sizeof(typename dst_traits::value_type));
  }
  if (dst.data() == nullptr) {
    // Unallocated destination: still order with respect to `space`.
    space.fence();
  } else {
    using ViewTypeUniform = typename std::conditional<
        View<DT, DP...>::Rank == 0,
        typename View<DT, DP...>::uniform_runtime_type,
        typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
    // Fill executes asynchronously on the provided execution space instance.
    Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
                           ExecSpace>(dst, value, space);
  }
  if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
    Kokkos::Profiling::endDeepCopy();
  }
}

/** \brief Deep copy a value from Host memory into a view. ExecSpace can not
 * access dst */
template <class ExecSpace, class DT, class... DP>
inline void deep_copy(
    const ExecSpace& space, const View<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<
        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
        !Kokkos::Impl::SpaceAccessibility<
            ExecSpace,
            typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
        nullptr) {
  using dst_traits = ViewTraits<DT, DP...>;
  static_assert(std::is_same<typename dst_traits::non_const_value_type,
                             typename dst_traits::value_type>::value,
                "deep_copy requires non-const type");
  using dst_memory_space = typename dst_traits::memory_space;
  if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) {
    Kokkos::Profiling::beginDeepCopy(
        Kokkos::Profiling::make_space_handle(dst_memory_space::name()),
        dst.label(), dst.data(),
        Kokkos::Profiling::make_space_handle(Kokkos::HostSpace::name()),
        "(none)", &value, dst.span() * sizeof(typename dst_traits::value_type));
  }
  if (dst.data() == nullptr) {
    space.fence();
  } else {
    // The given space cannot access dst: fence it, then fill using an
    // execution space that can access dst's memory space, and fence again
    // so the fill is complete on return.
    space.fence();
    using ViewTypeUniform = typename std::conditional<
        View<DT, DP...>::Rank == 0,
        typename View<DT, DP...>::uniform_runtime_type,
        typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
    using fill_exec_space = typename dst_traits::memory_space::execution_space;
    Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
                           fill_exec_space>(dst, value, fill_exec_space());
    fill_exec_space().fence();
  }
  if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
    Kokkos::Profiling::endDeepCopy();
  }
}

/** \brief Deep copy into a value in Host memory from a view. */
template <class ExecSpace, class ST, class... SP>
inline void deep_copy(
    const ExecSpace& exec_space,
    typename ViewTraits<ST, SP...>::non_const_value_type& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<
        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
        std::is_same<typename ViewTraits<ST, SP...>::specialize,
                     void>::value>::type* = nullptr) {
  using src_traits = ViewTraits<ST, SP...>;
  using src_memory_space = typename src_traits::memory_space;
  // Only a rank-0 view can be copied into a single scalar.
  static_assert(src_traits::rank == 0,
                "ERROR: Non-rank-zero view in deep_copy( value , View )");
  if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) {
    Kokkos::Profiling::beginDeepCopy(
        Kokkos::Profiling::make_space_handle(Kokkos::HostSpace::name()),
        "(none)", &dst,
        Kokkos::Profiling::make_space_handle(src_memory_space::name()),
        src.label(), src.data(), sizeof(ST));
  }

  if (src.data() == nullptr) {
    // Nothing to copy; fence for ordering and close the profiling region.
    exec_space.fence();
    if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
      Kokkos::Profiling::endDeepCopy();
    }
    return;
  }

  Kokkos::Impl::DeepCopy<HostSpace, src_memory_space, ExecSpace>(
      exec_space, &dst, src.data(), sizeof(ST));
  if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
    Kokkos::Profiling::endDeepCopy();
  }
}

//----------------------------------------------------------------------------
/** \brief A deep copy between views of compatible type, and rank zero. */
template <class ExecSpace, class DT, class... DP, class ST, class... SP>
inline void deep_copy(
    const ExecSpace& exec_space, const View<DT, DP...>& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<(
        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
        std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
        (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) &&
         unsigned(ViewTraits<ST, SP...>::rank) == unsigned(0)))>::type* =
        nullptr) {
  using src_traits = ViewTraits<ST, SP...>;
  using dst_traits = ViewTraits<DT, DP...>;

  using src_memory_space = typename src_traits::memory_space;
  using dst_memory_space = typename dst_traits::memory_space;
  static_assert(std::is_same<typename dst_traits::value_type,
                             typename src_traits::non_const_value_type>::value,
                "deep_copy requires matching non-const destination type");

  if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) {
    Kokkos::Profiling::beginDeepCopy(
        Kokkos::Profiling::make_space_handle(dst_memory_space::name()),
        dst.label(), dst.data(),
        Kokkos::Profiling::make_space_handle(src_memory_space::name()),
        src.label(), src.data(), sizeof(DT));
  }

  if (dst.data() == nullptr && src.data() == nullptr) {
    exec_space.fence();
    if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
      Kokkos::Profiling::endDeepCopy();
    }
    return;
  }

  // Skip the copy when source and destination alias the same element.
  if (dst.data() != src.data()) {
    Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space, ExecSpace>(
        exec_space, dst.data(), src.data(),
        sizeof(typename dst_traits::value_type));
  }
  if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
    Kokkos::Profiling::endDeepCopy();
  }
}

//----------------------------------------------------------------------------
/** \brief A deep copy between views of the default specialization, compatible
 * type, same non-zero rank
 */
template <class ExecSpace, class DT, class... DP, class ST, class...
SP>
inline void deep_copy(
    const ExecSpace& exec_space, const View<DT, DP...>& dst,
    const View<ST, SP...>& src,
    typename std::enable_if<(
        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
        std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
        std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
        (unsigned(ViewTraits<DT, DP...>::rank) != 0 ||
         unsigned(ViewTraits<ST, SP...>::rank) != 0))>::type* = nullptr) {
  using dst_type = View<DT, DP...>;
  using src_type = View<ST, SP...>;

  static_assert(std::is_same<typename dst_type::value_type,
                             typename dst_type::non_const_value_type>::value,
                "deep_copy requires non-const destination type");

  static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)),
                "deep_copy requires Views of equal rank");

  using dst_execution_space = typename dst_type::execution_space;
  using src_execution_space = typename src_type::execution_space;
  using dst_memory_space = typename dst_type::memory_space;
  using src_memory_space = typename src_type::memory_space;
  using dst_value_type = typename dst_type::value_type;
  using src_value_type = typename src_type::value_type;

  if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) {
    Kokkos::Profiling::beginDeepCopy(
        Kokkos::Profiling::make_space_handle(dst_memory_space::name()),
        dst.label(), dst.data(),
        Kokkos::Profiling::make_space_handle(src_memory_space::name()),
        src.label(), src.data(), dst.span() * sizeof(dst_value_type));
  }

  dst_value_type* dst_start = dst.data();
  dst_value_type* dst_end = dst.data() + dst.span();
  src_value_type* src_start = src.data();
  src_value_type* src_end = src.data() + src.span();

  // Early dropout if identical range
  if ((dst_start == nullptr || src_start == nullptr) ||
      ((std::ptrdiff_t(dst_start) == std::ptrdiff_t(src_start)) &&
       (std::ptrdiff_t(dst_end) == std::ptrdiff_t(src_end)))) {
    // throw if dimension mismatch
    if ((src.extent(0) != dst.extent(0)) || (src.extent(1) != dst.extent(1)) ||
        (src.extent(2) != dst.extent(2)) || (src.extent(3) != dst.extent(3)) ||
        (src.extent(4) != dst.extent(4)) || (src.extent(5) != dst.extent(5)) ||
        (src.extent(6) != dst.extent(6)) || (src.extent(7) != dst.extent(7))) {
      std::string message(
          "Deprecation Error: Kokkos::deep_copy extents of views don't "
          "match: ");
      message += dst.label();
      message += "(";
      for (int r = 0; r < dst_type::Rank - 1; r++) {
        message += std::to_string(dst.extent(r));
        message += ",";
      }
      message += std::to_string(dst.extent(dst_type::Rank - 1));
      message += ") ";
      message += src.label();
      message += "(";
      for (int r = 0; r < src_type::Rank - 1; r++) {
        message += std::to_string(src.extent(r));
        message += ",";
      }
      message += std::to_string(src.extent(src_type::Rank - 1));
      message += ") ";

      Kokkos::Impl::throw_runtime_exception(message);
    }
    if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
      Kokkos::Profiling::endDeepCopy();
    }
    return;
  }

  // Which execution spaces can reach which memory spaces; used below to
  // choose the copy strategy.
  enum {
    ExecCanAccessSrcDst =
        Kokkos::Impl::SpaceAccessibility<ExecSpace,
                                         dst_memory_space>::accessible &&
        Kokkos::Impl::SpaceAccessibility<ExecSpace,
                                         src_memory_space>::accessible
  };
  enum {
    DstExecCanAccessSrc =
        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
                                         src_memory_space>::accessible
  };

  enum {
    SrcExecCanAccessDst =
        Kokkos::Impl::SpaceAccessibility<src_execution_space,
                                         dst_memory_space>::accessible
  };

  // Error out for non-identical overlapping views.
  if ((((std::ptrdiff_t)dst_start < (std::ptrdiff_t)src_end) &&
       ((std::ptrdiff_t)dst_end > (std::ptrdiff_t)src_start)) &&
      ((dst.span_is_contiguous() && src.span_is_contiguous()))) {
    std::string message("Error: Kokkos::deep_copy of overlapping views: ");
    message += dst.label();
    message += "(";
    message += std::to_string((std::ptrdiff_t)dst_start);
    message += ",";
    message += std::to_string((std::ptrdiff_t)dst_end);
    message += ") ";
    message += src.label();
    message += "(";
    message += std::to_string((std::ptrdiff_t)src_start);
    message += ",";
    message += std::to_string((std::ptrdiff_t)src_end);
    message += ") ";
    Kokkos::Impl::throw_runtime_exception(message);
  }

  // Check for same extents
  if ((src.extent(0) != dst.extent(0)) || (src.extent(1) != dst.extent(1)) ||
      (src.extent(2) != dst.extent(2)) || (src.extent(3) != dst.extent(3)) ||
      (src.extent(4) != dst.extent(4)) || (src.extent(5) != dst.extent(5)) ||
      (src.extent(6) != dst.extent(6)) || (src.extent(7) != dst.extent(7))) {
    std::string message(
        "Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
    message += dst.label();
    message += "(";
    for (int r = 0; r < dst_type::Rank - 1; r++) {
      message += std::to_string(dst.extent(r));
      message += ",";
    }
    message += std::to_string(dst.extent(dst_type::Rank - 1));
    message += ") ";
    message += src.label();
    message += "(";
    for (int r = 0; r < src_type::Rank - 1; r++) {
      message += std::to_string(src.extent(r));
      message += ",";
    }
    message += std::to_string(src.extent(src_type::Rank - 1));
    message += ") ";

    Kokkos::Impl::throw_runtime_exception(message);
  }

  // If same type, equal layout, equal dimensions, equal span, and contiguous
  // memory then can byte-wise copy

  if (std::is_same<typename dst_type::value_type,
                   typename src_type::non_const_value_type>::value &&
      (std::is_same<typename dst_type::array_layout,
                    typename src_type::array_layout>::value ||
       (dst_type::rank == 1 && src_type::rank == 1)) &&
      dst.span_is_contiguous() && src.span_is_contiguous() &&
      ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) &&
      ((dst_type::rank < 2) || (dst.stride_1() == src.stride_1())) &&
      ((dst_type::rank < 3) || (dst.stride_2() == src.stride_2())) &&
      ((dst_type::rank < 4) || (dst.stride_3() == src.stride_3())) &&
      ((dst_type::rank < 5) || (dst.stride_4() == src.stride_4())) &&
      ((dst_type::rank < 6) || (dst.stride_5() == src.stride_5())) &&
      ((dst_type::rank < 7) || (dst.stride_6() == src.stride_6())) &&
      ((dst_type::rank < 8) || (dst.stride_7() == src.stride_7()))) {
    const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
    if ((void*)dst.data() != (void*)src.data()) {
      Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space, ExecSpace>(
          exec_space, dst.data(), src.data(), nbytes);
    }
  } else {
    // Copying data between views in accessible memory spaces and either
    // non-contiguous or incompatible shape.
    if (ExecCanAccessSrcDst) {
      Impl::view_copy(exec_space, dst, src);
    } else if (DstExecCanAccessSrc || SrcExecCanAccessDst) {
      using cpy_exec_space =
          typename std::conditional<DstExecCanAccessSrc, dst_execution_space,
                                    src_execution_space>::type;
      // Fence the incoming space, run the element-wise copy on a space that
      // can reach both views, and fence that space before returning.
      exec_space.fence();
      Impl::view_copy(cpy_exec_space(), dst, src);
      cpy_exec_space().fence();
    } else {
      Kokkos::Impl::throw_runtime_exception(
          "deep_copy given views that would require a temporary allocation");
    }
  }
  if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
    Kokkos::Profiling::endDeepCopy();
  }
}

} /* namespace Kokkos */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {

/** \brief Resize a view with copying old data to new data at the corresponding
 * indices. */
template <class T, class...
P>
inline typename std::enable_if<
    std::is_same<typename Kokkos::View<T, P...>::array_layout,
                 Kokkos::LayoutLeft>::value ||
    std::is_same<typename Kokkos::View<T, P...>::array_layout,
                 Kokkos::LayoutRight>::value>::type
resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
  using view_type = Kokkos::View<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only resize managed views");

  // Fix #904 by checking dimensions before actually resizing.
  //
  // Rank is known at compile time, so hopefully the compiler will
  // remove branches that are compile-time false.  The upcoming "if
  // constexpr" language feature would make this certain.
  if (view_type::Rank == 1 && n0 == static_cast<size_t>(v.extent(0))) {
    return;
  }
  if (view_type::Rank == 2 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1))) {
    return;
  }
  if (view_type::Rank == 3 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2))) {
    return;
  }
  if (view_type::Rank == 4 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3))) {
    return;
  }
  if (view_type::Rank == 5 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3)) &&
      n4 == static_cast<size_t>(v.extent(4))) {
    return;
  }
  if (view_type::Rank == 6 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3)) &&
      n4 == static_cast<size_t>(v.extent(4)) &&
      n5 == static_cast<size_t>(v.extent(5))) {
    return;
  }
  if (view_type::Rank == 7 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3)) &&
      n4 == static_cast<size_t>(v.extent(4)) &&
      n5 == static_cast<size_t>(v.extent(5)) &&
      n6 == static_cast<size_t>(v.extent(6))) {
    return;
  }
  if (view_type::Rank == 8 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3)) &&
      n4 == static_cast<size_t>(v.extent(4)) &&
      n5 == static_cast<size_t>(v.extent(5)) &&
      n6 == static_cast<size_t>(v.extent(6)) &&
      n7 == static_cast<size_t>(v.extent(7))) {
    return;
  }
  // If Kokkos ever supports Views of rank > 8, the above code won't
  // be incorrect, because avoiding reallocation in resize() is just
  // an optimization.

  // TODO (mfh 27 Jun 2017) If the old View has enough space but just
  // different dimensions (e.g., if the product of the dimensions,
  // including extra space for alignment, will not change), then
  // consider just reusing storage.  For now, Kokkos always
  // reallocates if any of the dimensions change, even if the old View
  // has enough space.

  // Allocate the new view, copy the overlapping index range, then rebind v.
  view_type v_resized(v.label(), n0, n1, n2, n3, n4, n5, n6, n7);

  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);

  v = v_resized;
}

/** \brief Resize a view with copying old data to new data at the corresponding
 * indices. */
template <class I, class T, class... P>
inline typename std::enable_if<
    std::is_same<typename Kokkos::View<T, P...>::array_layout,
                 Kokkos::LayoutLeft>::value ||
    std::is_same<typename Kokkos::View<T, P...>::array_layout,
                 Kokkos::LayoutRight>::value>::type
resize(const I& arg_prop, Kokkos::View<T, P...>& v,
       const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
       const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
  using view_type = Kokkos::View<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only resize managed views");

  // Fix #904 by checking dimensions before actually resizing.
  //
  // Rank is known at compile time, so hopefully the compiler will
  // remove branches that are compile-time false.  The upcoming "if
  // constexpr" language feature would make this certain.
  if (view_type::Rank == 1 && n0 == static_cast<size_t>(v.extent(0))) {
    return;
  }
  if (view_type::Rank == 2 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1))) {
    return;
  }
  if (view_type::Rank == 3 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2))) {
    return;
  }
  if (view_type::Rank == 4 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3))) {
    return;
  }
  if (view_type::Rank == 5 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3)) &&
      n4 == static_cast<size_t>(v.extent(4))) {
    return;
  }
  if (view_type::Rank == 6 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3)) &&
      n4 == static_cast<size_t>(v.extent(4)) &&
      n5 == static_cast<size_t>(v.extent(5))) {
    return;
  }
  if (view_type::Rank == 7 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3)) &&
      n4 == static_cast<size_t>(v.extent(4)) &&
      n5 == static_cast<size_t>(v.extent(5)) &&
      n6 == static_cast<size_t>(v.extent(6))) {
    return;
  }
  if (view_type::Rank == 8 && n0 == static_cast<size_t>(v.extent(0)) &&
      n1 == static_cast<size_t>(v.extent(1)) &&
      n2 == static_cast<size_t>(v.extent(2)) &&
      n3 == static_cast<size_t>(v.extent(3)) &&
      n4 == static_cast<size_t>(v.extent(4)) &&
      n5 == static_cast<size_t>(v.extent(5)) &&
      n6 == static_cast<size_t>(v.extent(6)) &&
      n7 == static_cast<size_t>(v.extent(7))) {
    return;
  }
  // If Kokkos ever supports Views of rank > 8, the above code won't
  // be incorrect, because avoiding reallocation in resize() is just
  // an optimization.

  // TODO (mfh 27 Jun 2017) If the old View has enough space but just
  // different dimensions (e.g., if the product of the dimensions,
  // including extra space for alignment, will not change), then
  // consider just reusing storage.  For now, Kokkos always
  // reallocates if any of the dimensions change, even if the old View
  // has enough space.

  // Same as the plain overload, but the new allocation carries the given
  // allocation properties (e.g. WithoutInitializing).
  view_type v_resized(view_alloc(v.label(), std::forward<const I>(arg_prop)),
                      n0, n1, n2, n3, n4, n5, n6, n7);

  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);

  v = v_resized;
}

/** \brief Resize a view with copying old data to new data at the corresponding
 * indices. */
template <class T, class... P>
inline void resize(Kokkos::View<T, P...>& v,
                   const typename Kokkos::View<T, P...>::array_layout& layout) {
  using view_type = Kokkos::View<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only resize managed views");

  view_type v_resized(v.label(), layout);

  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);

  v = v_resized;
}

/** \brief Resize a view with discarding old data. */
template <class T, class...
P>
inline typename std::enable_if<
    std::is_same<typename Kokkos::View<T, P...>::array_layout,
                 Kokkos::LayoutLeft>::value ||
    std::is_same<typename Kokkos::View<T, P...>::array_layout,
                 Kokkos::LayoutRight>::value>::type
realloc(Kokkos::View<T, P...>& v,
        const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
  using view_type = Kokkos::View<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only realloc managed views");

  const std::string label = v.label();

  v = view_type();  // Deallocate first, if the only view to allocation
  v = view_type(label, n0, n1, n2, n3, n4, n5, n6, n7);
}

/** \brief Resize a view with discarding old data. */
template <class T, class... P>
inline void realloc(
    Kokkos::View<T, P...>& v,
    const typename Kokkos::View<T, P...>::array_layout& layout) {
  using view_type = Kokkos::View<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only realloc managed views");

  const std::string label = v.label();

  v = view_type();  // Deallocate first, if the only view to allocation
  v = view_type(label, layout);
}
} /* namespace Kokkos */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// Deduce Mirror Types
template <class Space, class T, class... P>
struct MirrorViewType {
  // The incoming view_type
  using src_view_type = typename Kokkos::View<T, P...>;
  // The memory space for the mirror view
  using memory_space = typename Space::memory_space;
  // Check whether it is the same memory space
  enum {
    is_same_memspace =
        std::is_same<memory_space, typename src_view_type::memory_space>::value
  };
  // The array_layout
  using array_layout = typename src_view_type::array_layout;
  // The data type (we probably want it non-const since otherwise we can't even
  // deep_copy to it.
  using data_type = typename src_view_type::non_const_data_type;
  // The destination view type if it is not the same memory space
  using dest_view_type = Kokkos::View<data_type, array_layout, Space>;
  // If it is the same memory_space return the existsing view_type
  // This will also keep the unmanaged trait if necessary
  using view_type = typename std::conditional<is_same_memspace, src_view_type,
                                              dest_view_type>::type;
};

template <class Space, class T, class... P>
struct MirrorType {
  // The incoming view_type
  using src_view_type = typename Kokkos::View<T, P...>;
  // The memory space for the mirror view
  using memory_space = typename Space::memory_space;
  // Check whether it is the same memory space
  enum {
    is_same_memspace =
        std::is_same<memory_space, typename src_view_type::memory_space>::value
  };
  // The array_layout
  using array_layout = typename src_view_type::array_layout;
  // The data type (we probably want it non-const since otherwise we can't even
  // deep_copy to it.
  using data_type = typename src_view_type::non_const_data_type;
  // The destination view type if it is not the same memory space
  using view_type = Kokkos::View<data_type, array_layout, Space>;
};

}  // namespace Impl

// Create a host mirror of a non-LayoutStride view: the mirror is a new
// HostMirror view labeled "<label>_mirror" with matching dynamic extents.
template <class T, class...
P>
inline typename Kokkos::View<T, P...>::HostMirror create_mirror(
    const Kokkos::View<T, P...>& src,
    typename std::enable_if<
        std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
        !std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
                      Kokkos::LayoutStride>::value>::type* = nullptr) {
  using src_type = View<T, P...>;
  using dst_type = typename src_type::HostMirror;

  // Pass only the runtime (dynamic) extents; static extents use the
  // constructor default argument.
  return dst_type(
      std::string(src.label()).append("_mirror"),
      src.rank_dynamic > 0 ? src.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
      src.rank_dynamic > 1 ? src.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
      src.rank_dynamic > 2 ? src.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
      src.rank_dynamic > 3 ? src.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
      src.rank_dynamic > 4 ? src.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
      src.rank_dynamic > 5 ? src.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
      src.rank_dynamic > 6 ? src.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
      src.rank_dynamic > 7 ? src.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG);
}

// LayoutStride specialization: reproduce the source's dimensions and strides
// explicitly in the mirror's layout.
template <class T, class... P>
inline typename Kokkos::View<T, P...>::HostMirror create_mirror(
    const Kokkos::View<T, P...>& src,
    typename std::enable_if<
        std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
        std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
                     Kokkos::LayoutStride>::value>::type* = nullptr) {
  using src_type = View<T, P...>;
  using dst_type = typename src_type::HostMirror;

  Kokkos::LayoutStride layout;

  layout.dimension[0] = src.extent(0);
  layout.dimension[1] = src.extent(1);
  layout.dimension[2] = src.extent(2);
  layout.dimension[3] = src.extent(3);
  layout.dimension[4] = src.extent(4);
  layout.dimension[5] = src.extent(5);
  layout.dimension[6] = src.extent(6);
  layout.dimension[7] = src.extent(7);

  layout.stride[0] = src.stride_0();
  layout.stride[1] = src.stride_1();
  layout.stride[2] = src.stride_2();
  layout.stride[3] = src.stride_3();
  layout.stride[4] = src.stride_4();
  layout.stride[5] = src.stride_5();
  layout.stride[6] = src.stride_6();
  layout.stride[7] = src.stride_7();

  return dst_type(std::string(src.label()).append("_mirror"), layout);
}

// Create a mirror in a new space (specialization for different space)
template <class Space, class T, class... P>
typename Impl::MirrorType<Space, T, P...>::view_type create_mirror(
    const Space&, const Kokkos::View<T, P...>& src,
    typename std::enable_if<std::is_same<
        typename ViewTraits<T, P...>::specialize, void>::value>::type* =
        nullptr) {
  return typename Impl::MirrorType<Space, T, P...>::view_type(src.label(),
                                                              src.layout());
}

// create_mirror_view: when the source is already host-accessible with the
// same data type, the "mirror" is the source view itself (no allocation).
template <class T, class... P>
inline typename Kokkos::View<T, P...>::HostMirror create_mirror_view(
    const Kokkos::View<T, P...>& src,
    typename std::enable_if<
        (std::is_same<
             typename Kokkos::View<T, P...>::memory_space,
             typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
         std::is_same<typename Kokkos::View<T, P...>::data_type,
                      typename Kokkos::View<T, P...>::HostMirror::data_type>::
             value)>::type* = nullptr) {
  return src;
}

// Otherwise fall back to allocating a fresh host mirror.
template <class T, class... P>
inline typename Kokkos::View<T, P...>::HostMirror create_mirror_view(
    const Kokkos::View<T, P...>& src,
    typename std::enable_if<!(
        std::is_same<
            typename Kokkos::View<T, P...>::memory_space,
            typename Kokkos::View<T, P...>::HostMirror::memory_space>::value &&
        std::is_same<typename Kokkos::View<T, P...>::data_type,
                     typename Kokkos::View<T, P...>::HostMirror::data_type>::
            value)>::type* = nullptr) {
  return Kokkos::create_mirror(src);
}

// Create a mirror view in a new space (specialization for same space)
template <class Space, class T, class... P>
typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
    const Space&, const Kokkos::View<T, P...>& src,
    typename std::enable_if<
        Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
        nullptr) {
  return src;
}

// Create a mirror view in a new space (specialization for different space)
template <class Space, class T, class... P>
typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
    const Space&, const Kokkos::View<T, P...>& src,
    typename std::enable_if<
        !Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
        nullptr) {
  return typename Impl::MirrorViewType<Space, T, P...>::view_type(src.label(),
                                                                  src.layout());
}

// Create a mirror view and deep_copy in a new space (specialization for same
// space)
template <class Space, class T, class... P>
typename Impl::MirrorViewType<Space, T, P...>::view_type
create_mirror_view_and_copy(
    const Space&, const Kokkos::View<T, P...>& src,
    std::string const& name = "",
    typename std::enable_if<
        Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
        nullptr) {
  (void)name;
  fence();  // same behavior as deep_copy(src, src)
  return src;
}

// Create a mirror view and deep_copy in a new space (specialization for
// different space)
template <class Space, class T, class... P>
typename Impl::MirrorViewType<Space, T, P...>::view_type
create_mirror_view_and_copy(
    const Space&, const Kokkos::View<T, P...>& src,
    std::string const& name = "",
    typename std::enable_if<
        !Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
        nullptr) {
  using Mirror = typename Impl::MirrorViewType<Space, T, P...>::view_type;
  std::string label = name.empty() ? src.label() : name;
  // Allocate uninitialized (deep_copy below overwrites every element).
  auto mirror = typename Mirror::non_const_type{
      view_alloc(WithoutInitializing, label), src.layout()};
  deep_copy(mirror, src);
  return mirror;
}

// Create a mirror view in a new space without initializing (specialization for
// same space)
template <class Space, class T, class... P>
typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
    const Space&, const Kokkos::View<T, P...>& src,
    Kokkos::Impl::WithoutInitializing_t,
    typename std::enable_if<
        Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
        nullptr) {
  return src;
}

// Create a mirror view in a new space without initializing (specialization for
// different space)
template <class Space, class T, class... P>
typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view(
    const Space&, const Kokkos::View<T, P...>& src,
    Kokkos::Impl::WithoutInitializing_t,
    typename std::enable_if<
        !Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
        nullptr) {
  using Mirror = typename Impl::MirrorViewType<Space, T, P...>::view_type;
  return Mirror(view_alloc(WithoutInitializing, src.label()), src.layout());
}

} /* namespace Kokkos */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif
diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3771ab393f3aaf8f77cb474056d90e867ff03da
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Core.hpp
@@ -0,0 +1,293 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_HPP +#define KOKKOS_CORE_HPP + +//---------------------------------------------------------------------------- +// Include the execution space header files for the enabled execution spaces. 
#include <Kokkos_Core_fwd.hpp>

// Fundamental type description for half precision
// Should not rely on other backend infrastructure
#include <Kokkos_Half.hpp>
#include <KokkosCore_Config_DeclareBackend.hpp>

#include <Kokkos_AnonymousSpace.hpp>
#include <Kokkos_LogicalSpaces.hpp>
#include <Kokkos_Pair.hpp>
#include <Kokkos_MathematicalFunctions.hpp>
#include <Kokkos_MemoryPool.hpp>
#include <Kokkos_Array.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_Vectorization.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_hwloc.hpp>
#include <Kokkos_Timer.hpp>
#include <Kokkos_Tuners.hpp>
#include <Kokkos_TaskScheduler.hpp>
#include <Kokkos_Complex.hpp>
#include <Kokkos_CopyViews.hpp>
#include <functional>
#include <iosfwd>
#include <map>
#include <memory>

//----------------------------------------------------------------------------

namespace Kokkos {

// Runtime-configuration options consumed by Kokkos::initialize().
// Fields not settable through the constructor keep their constructor-chosen
// sentinels (ndevices = -1, skip_device = 9999) or in-class defaults.
struct InitArguments {
  int num_threads;  // -1 by default (see constructor)
  int num_numa;     // -1 by default
  int device_id;    // -1 by default
  int ndevices;     // always initialized to -1 here
  int skip_device;  // always initialized to 9999 here
  bool disable_warnings;
  bool tune_internals;
  bool tool_help        = false;  // request tool help output
  std::string tool_lib  = {};     // tool/profiling library to load
  std::string tool_args = {};     // arguments forwarded to that library

  InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false,
                bool ti = false)
      : num_threads{nt},
        num_numa{nn},
        device_id{dv},
        ndevices{-1},
        skip_device{9999},
        disable_warnings{dw},
        tune_internals{ti} {}
};

namespace Impl {

/* ExecSpaceManager - Responsible for initializing all of the registered
 * backends.
 * Backends are registered using the register_space_initializer()
 * function which should be called from a global context so that it is called
 * prior to initialize_spaces() which is called from Kokkos::initialize()
 */
class ExecSpaceManager {
  // Registered backend factories, keyed by backend name.
  std::map<std::string, std::unique_ptr<ExecSpaceInitializerBase>>
      exec_space_factory_list;

 public:
  ExecSpaceManager() = default;

  void register_space_factory(std::string name,
                              std::unique_ptr<ExecSpaceInitializerBase> ptr);
  void initialize_spaces(const Kokkos::InitArguments& args);
  void finalize_spaces(const bool all_spaces);
  void static_fence();
  void print_configuration(std::ostream& msg, const bool detail);
  // Meyers-style singleton access point.
  static ExecSpaceManager& get_instance();
};

// Registers SpaceInitializerType under `name`. Returns an int so the call can
// be used as the initializer of a global variable, forcing registration to
// run before main() (and thus before Kokkos::initialize()).
template <class SpaceInitializerType>
int initialize_space_factory(std::string name) {
  auto space_ptr = std::make_unique<SpaceInitializerType>();
  ExecSpaceManager::get_instance().register_space_factory(name,
                                                          std::move(space_ptr));
  return 1;
}

}  // namespace Impl

void initialize(int& narg, char* arg[]);

void initialize(InitArguments args = InitArguments());

namespace Impl {

void pre_initialize(const InitArguments& args);

void post_initialize(const InitArguments& args);

void declare_configuration_metadata(const std::string& category,
                                    const std::string& key,
                                    const std::string& value);

}  // namespace Impl

bool is_initialized() noexcept;

bool show_warnings() noexcept;
bool tune_internals() noexcept;

/** \brief Finalize the spaces that were initialized via Kokkos::initialize */
void finalize();

/**
 * \brief Push a user-defined function to be called in
 * Kokkos::finalize, before any Kokkos state is finalized.
 *
 * \warning Only call this after Kokkos::initialize, but before
 * Kokkos::finalize.
 *
 * This function is the Kokkos analog to std::atexit. If you call
 * this with a function f, then your function will get called when
 * Kokkos::finalize is called. Specifically, it will be called BEFORE
 * Kokkos does any finalization. This means that all execution
 * spaces, memory spaces, etc. that were initialized will still be
 * initialized when your function is called.
 *
 * Just like std::atexit, if you call push_finalize_hook in sequence
 * with multiple functions (f, g, h), Kokkos::finalize will call them
 * in reverse order (h, g, f), as if popping a stack. Furthermore,
 * just like std::atexit, if any of your functions throws but does not
 * catch an exception, Kokkos::finalize will call std::terminate.
 */
void push_finalize_hook(std::function<void()> f);

/** \brief Finalize all known execution spaces */
void finalize_all();

void fence();

/** \brief Print "Bill of Materials" */
void print_configuration(std::ostream&, const bool detail = false);

}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {

/* Allocate memory from a memory space.
 * The allocation is tracked in Kokkos memory tracking system, so
 * leaked memory can be identified.
 */
// Allocate `arg_alloc_size` bytes from Space's memory space, tracked under
// `arg_alloc_label`. Space defaults to the default execution space's memory
// space; any Space with a nested memory_space works.
template <class Space = typename Kokkos::DefaultExecutionSpace::memory_space>
inline void* kokkos_malloc(const std::string& arg_alloc_label,
                           const size_t arg_alloc_size) {
  using MemorySpace = typename Space::memory_space;
  return Impl::SharedAllocationRecord<MemorySpace>::allocate_tracked(
      MemorySpace(), arg_alloc_label, arg_alloc_size);
}

// Unlabeled overload: allocation is tracked under the label "no-label".
template <class Space = typename Kokkos::DefaultExecutionSpace::memory_space>
inline void* kokkos_malloc(const size_t arg_alloc_size) {
  using MemorySpace = typename Space::memory_space;
  return Impl::SharedAllocationRecord<MemorySpace>::allocate_tracked(
      MemorySpace(), "no-label", arg_alloc_size);
}

// Free memory previously obtained from kokkos_malloc/kokkos_realloc in the
// same Space. The `return` of a void expression is intentional.
template <class Space = typename Kokkos::DefaultExecutionSpace::memory_space>
inline void kokkos_free(void* arg_alloc) {
  using MemorySpace = typename Space::memory_space;
  return Impl::SharedAllocationRecord<MemorySpace>::deallocate_tracked(
      arg_alloc);
}

// Resize a tracked allocation, analogous to std::realloc but space-aware.
template <class Space = typename Kokkos::DefaultExecutionSpace::memory_space>
inline void* kokkos_realloc(void* arg_alloc, const size_t arg_alloc_size) {
  using MemorySpace = typename Space::memory_space;
  return Impl::SharedAllocationRecord<MemorySpace>::reallocate_tracked(
      arg_alloc, arg_alloc_size);
}

}  // namespace Kokkos

namespace Kokkos {

/** \brief ScopeGuard
 * Some user scope issues have been identified with some Kokkos::finalize
 * calls; ScopeGuard aims to correct these issues.
+ * + * Two requirements for ScopeGuard: + * if Kokkos::is_initialized() in the constructor, don't call + * Kokkos::initialize or Kokkos::finalize it is not copyable or assignable + */ + +class ScopeGuard { + public: + ScopeGuard(int& narg, char* arg[]) { + sg_init = false; + if (!Kokkos::is_initialized()) { + initialize(narg, arg); + sg_init = true; + } + } + + ScopeGuard(const InitArguments& args = InitArguments()) { + sg_init = false; + if (!Kokkos::is_initialized()) { + initialize(args); + sg_init = true; + } + } + + ~ScopeGuard() { + if (Kokkos::is_initialized() && sg_init) { + finalize(); + } + } + + // private: + bool sg_init; + + ScopeGuard& operator=(const ScopeGuard&) = delete; + ScopeGuard(const ScopeGuard&) = delete; +}; + +} // namespace Kokkos + +#include <Kokkos_Crs.hpp> +#include <Kokkos_WorkGraphPolicy.hpp> +// Including this in Kokkos_Parallel_Reduce.hpp led to a circular dependency +// because Kokkos::Sum is used in Kokkos_Combined_Reducer.hpp and the default. +// The real answer is to finally break up Kokkos_Parallel_Reduce.hpp into +// smaller parts... +#include <impl/Kokkos_Combined_Reducer.hpp> +// Yet another workaround to deal with circular dependency issues because the +// implementation of the RAII wrapper is using Kokkos::single. 
+#include <Kokkos_AcquireUniqueTokenImpl.hpp> + +// Specializations requires after core definitions +#include <KokkosCore_Config_PostInclude.hpp> +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fe7eba3f6ef178848d2ea832341014d6dc5d1003 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -0,0 +1,335 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_FWD_HPP +#define KOKKOS_CORE_FWD_HPP + +//---------------------------------------------------------------------------- +// Kokkos_Macros.hpp does introspection on configuration options +// and compiler environment then sets a collection of #define macros. + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_Utilities.hpp> + +#include <Kokkos_MasterLock.hpp> + +//---------------------------------------------------------------------------- +// Have assumed a 64bit build (8byte pointers) throughout the code base. 
static_assert(sizeof(void *) == 8,
              "Kokkos assumes 64-bit build; i.e., 8-byte pointers");

//----------------------------------------------------------------------------

namespace Kokkos {

// Tag type whose value (Kokkos::AUTO) requests automatic selection of a
// parameter (e.g. team sizes). operator() returns itself so AUTO and AUTO()
// are interchangeable.
struct AUTO_t {
  KOKKOS_INLINE_FUNCTION
  constexpr const AUTO_t &operator()() const { return *this; }
};

namespace {
/**\brief Token to indicate that a parameter's value is to be automatically
 * selected */
// NOTE(review): anonymous namespace in a header gives each TU its own copy.
constexpr AUTO_t AUTO = Kokkos::AUTO_t();
}  // namespace

// Placeholder used as a "not provided" default template argument.
struct InvalidType {};

}  // namespace Kokkos

//----------------------------------------------------------------------------
// Forward declarations for class inter-relationships

namespace Kokkos {

class HostSpace;  ///< Memory space for main process and CPU execution spaces
class AnonymousSpace;

template <class ExecutionSpace, class MemorySpace>
struct Device;

// forward declare here so that backend initializer calls can use it.
struct InitArguments;

}  // namespace Kokkos

// Include backend forward statements as determined by build options
#include <KokkosCore_Config_FwdBackend.hpp>

//----------------------------------------------------------------------------
// Set the default execution space.
/// Define Kokkos::DefaultExecutionSpace as per configuration option
/// or chosen from the enabled execution spaces in the following order:
/// Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP,
/// Kokkos::Threads, Kokkos::Serial

// Annotations let the clang static analyzer identify the chosen default
// spaces; they expand to nothing in a normal build.
#if defined(__clang_analyzer__)
#define KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION \
  [[clang::annotate("DefaultExecutionSpace")]]
#define KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION \
  [[clang::annotate("DefaultHostExecutionSpace")]]
#else
#define KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION
#define KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION
#endif

namespace Kokkos {

// Device-capable default: the first enabled backend in priority order.
#if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA)
using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Cuda;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET)
using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
    Experimental::OpenMPTarget;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP)
using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
    Experimental::HIP;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL)
using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
    Experimental::SYCL;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP)
using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = OpenMP;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS)
using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Threads;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HPX)
using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION =
    Kokkos::Experimental::HPX;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL)
using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial;
#else
#error \
    "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::HIP, Kokkos::Experimental::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial."
#endif

// Host default: an explicitly-configured default host backend first,
// otherwise any enabled host backend in priority order.
#if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP)
using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION =
    OpenMP;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS)
using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION =
    Threads;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HPX)
using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION =
    Kokkos::Experimental::HPX;
#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL)
using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION =
    Serial;
#elif defined(KOKKOS_ENABLE_OPENMP)
using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION =
    OpenMP;
#elif defined(KOKKOS_ENABLE_THREADS)
using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION =
    Threads;
#elif defined(KOKKOS_ENABLE_HPX)
using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION =
    Kokkos::Experimental::HPX;
#elif defined(KOKKOS_ENABLE_SERIAL)
using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION =
    Serial;
#else
#error \
    "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial."
#endif

}  // namespace Kokkos

//----------------------------------------------------------------------------
// Detect the active execution space and define its memory space.
// This is used to verify whether a running kernel can access
// a given memory space.
+ +namespace Kokkos { +namespace Impl { + +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) && \ + defined(KOKKOS_ENABLE_CUDA) +using ActiveExecutionMemorySpace = Kokkos::CudaSpace; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) +using ActiveExecutionMemorySpace = Kokkos::Experimental::SYCLDeviceUSMSpace; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU) +using ActiveExecutionMemorySpace = Kokkos::Experimental::HIPSpace; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) +using ActiveExecutionMemorySpace = Kokkos::HostSpace; +#else +using ActiveExecutionMemorySpace = void; +#endif + +template <typename DstMemorySpace, typename SrcMemorySpace> +struct MemorySpaceAccess; + +template <typename DstMemorySpace, typename SrcMemorySpace, + bool = Kokkos::Impl::MemorySpaceAccess<DstMemorySpace, + SrcMemorySpace>::accessible> +struct verify_space { + KOKKOS_FUNCTION static void check() {} +}; + +template <typename DstMemorySpace, typename SrcMemorySpace> +struct verify_space<DstMemorySpace, SrcMemorySpace, false> { + KOKKOS_FUNCTION static void check() { + Kokkos::abort( + "Kokkos::View ERROR: attempt to access inaccessible memory space"); + }; +}; + +// Base class for exec space initializer factories +class ExecSpaceInitializerBase; + +} // namespace Impl + +namespace Experimental { +template <class, class, class, class> +class LogicalMemorySpace; +} + +} // namespace Kokkos + +#define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \ + Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \ + DATA_SPACE>::check(); + +#define KOKKOS_RESTRICT_EXECUTION_TO_(DATA_SPACE) \ + Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \ + DATA_SPACE>::check(); + +//---------------------------------------------------------------------------- + +namespace Kokkos { +void fence(); +} + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <class DataType, 
template <class DataType, class... Properties>
class View;

namespace Impl {

// deep_copy implementation between two memory spaces.
template <class DstSpace, class SrcSpace,
          class ExecutionSpace = typename DstSpace::execution_space>
struct DeepCopy;

// Fills a view with a single value; specialized per layout/rank.
template <class ViewType, class Layout = typename ViewType::array_layout,
          class ExecSpace = typename ViewType::execution_space,
          int Rank = ViewType::Rank, typename iType = int64_t>
struct ViewFill;

// Element-wise copy between two views; specialized per layout/rank.
template <class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace,
          int Rank, typename iType>
struct ViewCopy;

// Deduces the execution space for a functor/policy pair.
template <class Functor, class Policy>
struct FunctorPolicyExecutionSpace;

//----------------------------------------------------------------------------
/// \class ParallelFor
/// \brief Implementation of the ParallelFor operator that has a
///        partial specialization for the device.
///
/// This is an implementation detail of parallel_for. Users should
/// skip this and go directly to the nonmember function parallel_for.
template <class FunctorType, class ExecPolicy,
          class ExecutionSpace = typename Impl::FunctorPolicyExecutionSpace<
              FunctorType, ExecPolicy>::execution_space>
class ParallelFor;

/// \class ParallelReduce
/// \brief Implementation detail of parallel_reduce.
///
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template <class FunctorType, class ExecPolicy, class ReducerType = InvalidType,
          class ExecutionSpace = typename Impl::FunctorPolicyExecutionSpace<
              FunctorType, ExecPolicy>::execution_space>
class ParallelReduce;

/// \class ParallelScan
/// \brief Implementation detail of parallel_scan.
///
/// This is an implementation detail of parallel_scan. Users should
/// skip this and go directly to the documentation of the nonmember
/// template function Kokkos::parallel_scan.
+template <class FunctorType, class ExecPolicy, + class ExecutionSapce = typename Impl::FunctorPolicyExecutionSpace< + FunctorType, ExecPolicy>::execution_space> +class ParallelScan; + +template <class FunctorType, class ExecPolicy, class ReturnType = InvalidType, + class ExecutionSapce = typename Impl::FunctorPolicyExecutionSpace< + FunctorType, ExecPolicy>::execution_space> +class ParallelScanWithTotal; + +} // namespace Impl + +template <class ScalarType, class Space = HostSpace> +struct Sum; +template <class ScalarType, class Space = HostSpace> +struct Prod; +template <class ScalarType, class Space = HostSpace> +struct Min; +template <class ScalarType, class Space = HostSpace> +struct Max; +template <class ScalarType, class Space = HostSpace> +struct MinMax; +template <class ScalarType, class Index, class Space = HostSpace> +struct MinLoc; +template <class ScalarType, class Index, class Space = HostSpace> +struct MaxLoc; +template <class ScalarType, class Index, class Space = HostSpace> +struct MinMaxLoc; +template <class ScalarType, class Space = HostSpace> +struct BAnd; +template <class ScalarType, class Space = HostSpace> +struct BOr; +template <class ScalarType, class Space = HostSpace> +struct LAnd; +template <class ScalarType, class Space = HostSpace> +struct LOr; + +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_CORE_FWD_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1a10500b19a55f4f963807dd2cf1a28e6062f98c --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Crs.hpp @@ -0,0 +1,425 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CRS_HPP +#define KOKKOS_CRS_HPP + +#include <Kokkos_View.hpp> +#include <Kokkos_CopyViews.hpp> + +namespace Kokkos { + +/// \class Crs +/// \brief Compressed row storage array. +/// +/// \tparam DataType The type of stored entries. 
/// used as the graph of a sparse matrix, then this is usually an
/// integer type, the type of the column indices in the sparse
/// matrix.
///
/// \tparam Arg1Type The second template parameter, corresponding
///   either to the Device type (if there are no more template
///   parameters) or to the Layout type (if there is at least one more
///   template parameter).
///
/// \tparam Arg2Type The third template parameter, which if provided
///   corresponds to the Device type.
///
/// \tparam SizeType The type of row offsets. Usually the default
///   parameter suffices. However, setting a nondefault value is
///   necessary in some cases, for example, if you want to have a
///   sparse matrices with dimensions (and therefore column indices)
///   that fit in \c int, but want to store more than <tt>INT_MAX</tt>
///   entries in the sparse matrix.
///
/// A row has a range of entries:
/// <ul>
/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li>
/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
/// </ul>
template <class DataType, class Arg1Type, class Arg2Type = void,
          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type,
                                                  void>::size_type>
class Crs {
 protected:
  using traits = ViewTraits<DataType*, Arg1Type, Arg2Type, void>;

 public:
  using data_type       = DataType;
  using array_layout    = typename traits::array_layout;
  using execution_space = typename traits::execution_space;
  using memory_space    = typename traits::memory_space;
  using device_type     = typename traits::device_type;
  using size_type       = SizeType;

  using staticcrsgraph_type = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
  using HostMirror =
      Crs<DataType, array_layout, typename traits::host_mirror_space, SizeType>;
  using row_map_type = View<size_type*, array_layout, device_type>;
  using entries_type = View<DataType*, array_layout, device_type>;

  // row_map has numRows()+1 offsets; entries holds the concatenated rows.
  row_map_type row_map;
  entries_type entries;

  /*
   * Default Constructors, operators and destructor
   */
  KOKKOS_DEFAULTED_FUNCTION Crs()                = default;
  KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&)      = default;
  KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&)           = default;
  KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs const&) = default;
  KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&)      = default;
  KOKKOS_DEFAULTED_FUNCTION ~Crs()               = default;

  /** \brief Assign to a view of the rhs array.
   *         If the old view is the last view
   *         then allocated memory is deallocated.
   */
  template <class EntriesType, class RowMapType>
  KOKKOS_INLINE_FUNCTION Crs(const RowMapType& row_map_,
                             const EntriesType& entries_)
      : row_map(row_map_), entries(entries_) {}

  /** \brief Return number of rows in the graph
   */
  KOKKOS_INLINE_FUNCTION
  size_type numRows() const {
    // row_map has one more entry than there are rows; empty map means 0 rows.
    return (row_map.extent(0) != 0)
               ? row_map.extent(0) - static_cast<size_type>(1)
               : static_cast<size_type>(0);
  }
};

/*--------------------------------------------------------------------------*/

// Computes, for each column of `in`, how many entries reference it
// (i.e. the row counts of the transpose).
template <class OutCounts, class DataType, class Arg1Type, class Arg2Type,
          class SizeType>
void get_crs_transpose_counts(
    OutCounts& out, Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
    std::string const& name = "transpose_counts");

// Converts per-row counts into a CRS row_map (exclusive prefix sum);
// returns the total count.
template <class OutCounts, class InCrs>
typename OutCounts::value_type get_crs_row_map_from_counts(
    OutCounts& out, InCrs const& in, std::string const& name = "row_map");

// Builds `out` as the transpose of `in`.
template <class DataType, class Arg1Type, class Arg2Type, class SizeType>
void transpose_crs(Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
                   Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);

}  // namespace Kokkos

/*--------------------------------------------------------------------------*/

/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Impl {

// Functor that counts, per column, the entries referencing it.
// The constructor launches the kernel immediately and fences.
template <class InCrs, class OutCounts>
class GetCrsTransposeCounts {
 public:
  using execution_space = typename InCrs::execution_space;
  using self_type       = GetCrsTransposeCounts<InCrs, OutCounts>;
  using index_type      = typename InCrs::size_type;

 private:
  InCrs in;
  OutCounts out;

 public:
  KOKKOS_INLINE_FUNCTION
  void operator()(index_type i) const { atomic_increment(&out[in.entries(i)]); }
  GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out)
      : in(arg_in), out(arg_out) {
    using policy_type  = RangePolicy<index_type, execution_space>;
    using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
    const closure_type closure(*this,
                               policy_type(0, index_type(in.entries.size())));
    closure.execute();
    execution_space().fence();
  }
};
typename OutRowMap::value_type; + using index_type = typename InCounts::size_type; + using last_value_type = Kokkos::View<value_type, execution_space>; + + private: + InCounts m_in; + OutRowMap m_out; + last_value_type m_last_value; + + public: + KOKKOS_INLINE_FUNCTION + void operator()(index_type i, value_type& update, bool final_pass) const { + if (i < static_cast<index_type>(m_in.size())) { + update += m_in(i); + if (final_pass) m_out(i + 1) = update; + } else if (final_pass) { + m_out(0) = 0; + m_last_value() = update; + } + } + KOKKOS_INLINE_FUNCTION + void init(value_type& update) const { update = 0; } + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& update, + const volatile value_type& input) const { + update += input; + } + using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>; + CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out) + : m_in(arg_in), m_out(arg_out), m_last_value("last_value") {} + value_type execute() { + using policy_type = RangePolicy<index_type, execution_space>; + using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>; + closure_type closure(*this, policy_type(0, m_in.size() + 1)); + closure.execute(); + auto last_value = Kokkos::create_mirror_view(m_last_value); + Kokkos::deep_copy(last_value, m_last_value); + return last_value(); + } +}; + +template <class InCrs, class OutCrs> +class FillCrsTransposeEntries { + public: + using execution_space = typename InCrs::execution_space; + using memory_space = typename InCrs::memory_space; + using value_type = typename OutCrs::entries_type::value_type; + using index_type = typename InCrs::size_type; + + private: + using counters_type = View<index_type*, memory_space>; + InCrs in; + OutCrs out; + counters_type counters; + + public: + KOKKOS_INLINE_FUNCTION + void operator()(index_type i) const { + auto begin = in.row_map(i); + auto end = in.row_map(i + 1); + for (auto j = begin; j < end; ++j) { + auto ti = in.entries(j); + auto tbegin = 
out.row_map(ti); + auto tj = atomic_fetch_add(&counters(ti), 1); + out.entries(tbegin + tj) = i; + } + } + using self_type = FillCrsTransposeEntries<InCrs, OutCrs>; + FillCrsTransposeEntries(InCrs const& arg_in, OutCrs const& arg_out) + : in(arg_in), out(arg_out), counters("counters", arg_out.numRows()) { + using policy_type = RangePolicy<index_type, execution_space>; + using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>; + const closure_type closure(*this, policy_type(0, index_type(in.numRows()))); + closure.execute(); + execution_space().fence(); + } +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +template <class OutCounts, class DataType, class Arg1Type, class Arg2Type, + class SizeType> +void get_crs_transpose_counts( + OutCounts& out, Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in, + std::string const& name) { + using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>; + out = OutCounts(name, in.numRows()); + Kokkos::Impl::GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out); +} + +template <class OutRowMap, class InCounts> +typename OutRowMap::value_type get_crs_row_map_from_counts( + OutRowMap& out, InCounts const& in, std::string const& name) { + out = OutRowMap(view_alloc(WithoutInitializing, name), in.size() + 1); + Kokkos::Impl::CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out); + return functor.execute(); +} + +template <class DataType, class Arg1Type, class Arg2Type, class SizeType> +void transpose_crs(Crs<DataType, Arg1Type, Arg2Type, SizeType>& out, + Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in) { + using crs_type = Crs<DataType, Arg1Type, Arg2Type, SizeType>; + using memory_space = typename crs_type::memory_space; + using counts_type = View<SizeType*, memory_space>; + { + counts_type counts; + 
Kokkos::get_crs_transpose_counts(counts, in); + Kokkos::get_crs_row_map_from_counts(out.row_map, counts, + "tranpose_row_map"); + } + out.entries = decltype(out.entries)("transpose_entries", in.entries.size()); + Kokkos::Impl::FillCrsTransposeEntries<crs_type, crs_type> entries_functor( + in, out); +} + +template <class CrsType, class Functor, + class ExecutionSpace = typename CrsType::execution_space> +struct CountAndFillBase; + +template <class CrsType, class Functor, class ExecutionSpace> +struct CountAndFillBase { + using data_type = typename CrsType::data_type; + using size_type = typename CrsType::size_type; + using row_map_type = typename CrsType::row_map_type; + using counts_type = row_map_type; + CrsType m_crs; + Functor m_functor; + counts_type m_counts; + struct Count {}; + inline void operator()(Count, size_type i) const { + m_counts(i) = m_functor(i, nullptr); + } + struct Fill {}; + inline void operator()(Fill, size_type i) const { + auto j = m_crs.row_map(i); + /* we don't want to access entries(entries.size()), even if its just to get + its address and never use it. this can happen when row (i) is empty and + all rows after it are also empty. we could compare to row_map(i + 1), but + that is a read from global memory, whereas dimension_0() should be part + of the View in registers (or constant memory) */ + data_type* fill = (j == static_cast<decltype(j)>(m_crs.entries.extent(0))) + ? 
nullptr + : (&(m_crs.entries(j))); + m_functor(i, fill); + } + CountAndFillBase(CrsType& crs, Functor const& f) : m_crs(crs), m_functor(f) {} +}; + +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) +#if defined(KOKKOS_ENABLE_CUDA) +#define EXEC_SPACE Kokkos::Cuda +#elif defined(KOKKOS_ENABLE_HIP) +#define EXEC_SPACE Kokkos::Experimental::HIP +#endif +template <class CrsType, class Functor> +struct CountAndFillBase<CrsType, Functor, EXEC_SPACE> { + using data_type = typename CrsType::data_type; + using size_type = typename CrsType::size_type; + using row_map_type = typename CrsType::row_map_type; + using counts_type = row_map_type; + CrsType m_crs; + Functor m_functor; + counts_type m_counts; + struct Count {}; + __device__ inline void operator()(Count, size_type i) const { + m_counts(i) = m_functor(i, nullptr); + } + struct Fill {}; + __device__ inline void operator()(Fill, size_type i) const { + auto j = m_crs.row_map(i); + /* we don't want to access entries(entries.size()), even if its just to get + its address and never use it. this can happen when row (i) is empty and + all rows after it are also empty. we could compare to row_map(i + 1), but + that is a read from global memory, whereas dimension_0() should be part + of the View in registers (or constant memory) */ + data_type* fill = (j == static_cast<decltype(j)>(m_crs.entries.extent(0))) + ? 
nullptr + : (&(m_crs.entries(j))); + m_functor(i, fill); + } + CountAndFillBase(CrsType& crs, Functor const& f) : m_crs(crs), m_functor(f) {} +}; +#endif + +template <class CrsType, class Functor> +struct CountAndFill : public CountAndFillBase<CrsType, Functor> { + using base_type = CountAndFillBase<CrsType, Functor>; + using typename base_type::Count; + using typename base_type::counts_type; + using typename base_type::data_type; + using typename base_type::Fill; + using typename base_type::size_type; + using entries_type = typename CrsType::entries_type; + using self_type = CountAndFill<CrsType, Functor>; + CountAndFill(CrsType& crs, size_type nrows, Functor const& f) + : base_type(crs, f) { + using execution_space = typename CrsType::execution_space; + this->m_counts = counts_type("counts", nrows); + { + using count_policy_type = RangePolicy<size_type, execution_space, Count>; + using count_closure_type = + Kokkos::Impl::ParallelFor<self_type, count_policy_type>; + const count_closure_type closure(*this, count_policy_type(0, nrows)); + closure.execute(); + } + auto nentries = Kokkos::get_crs_row_map_from_counts(this->m_crs.row_map, + this->m_counts); + this->m_counts = counts_type(); + this->m_crs.entries = entries_type("entries", nentries); + { + using fill_policy_type = RangePolicy<size_type, execution_space, Fill>; + using fill_closure_type = + Kokkos::Impl::ParallelFor<self_type, fill_policy_type>; + const fill_closure_type closure(*this, fill_policy_type(0, nrows)); + closure.execute(); + } + crs = this->m_crs; + } +}; + +template <class CrsType, class Functor> +void count_and_fill_crs(CrsType& crs, typename CrsType::size_type nrows, + Functor const& f) { + Kokkos::CountAndFill<CrsType, Functor>(crs, nrows, f); +} + +} // namespace Kokkos + +#endif /* #define KOKKOS_CRS_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Kokkos_Cuda.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..7a218120bb7bb3b053335946ae25ad58c8a85e6d --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Cuda.hpp @@ -0,0 +1,316 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_HPP +#define KOKKOS_CUDA_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_CUDA) + +#include <Kokkos_Core_fwd.hpp> + +#include <iosfwd> +#include <vector> + +#include <impl/Kokkos_AnalyzePolicy.hpp> +#include <Kokkos_CudaSpace.hpp> + +#include <Kokkos_Parallel.hpp> +#include <Kokkos_TaskScheduler.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_ScratchSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_ExecSpaceInitializer.hpp> +#include <impl/Kokkos_HostSharedPtr.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +class CudaExec; +class CudaInternal; +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +namespace Impl { +namespace Experimental { +enum class CudaLaunchMechanism : unsigned { + Default = 0, + ConstantMemory = 1, + GlobalMemory = 2, + LocalMemory = 4 +}; + +constexpr inline CudaLaunchMechanism operator|(CudaLaunchMechanism p1, + CudaLaunchMechanism p2) { + return static_cast<CudaLaunchMechanism>(static_cast<unsigned>(p1) | + static_cast<unsigned>(p2)); +} +constexpr inline CudaLaunchMechanism operator&(CudaLaunchMechanism p1, + CudaLaunchMechanism p2) { + return static_cast<CudaLaunchMechanism>(static_cast<unsigned>(p1) & + static_cast<unsigned>(p2)); +} + +template <CudaLaunchMechanism l> +struct CudaDispatchProperties { + CudaLaunchMechanism launch_mechanism = l; +}; +} // namespace Experimental +} // namespace Impl +/// \class Cuda +/// \brief Kokkos Execution Space that uses CUDA to run on GPUs. +/// +/// An "execution space" represents a parallel execution model. 
It tells Kokkos +/// how to parallelize the execution of kernels in a parallel_for or +/// parallel_reduce. For example, the Threads execution space uses Pthreads or +/// C++11 threads on a CPU, the OpenMP execution space uses the OpenMP language +/// extensions, and the Serial execution space executes "parallel" kernels +/// sequentially. The Cuda execution space uses NVIDIA's CUDA programming +/// model to execute kernels in parallel on GPUs. +class Cuda { + public: + //! \name Type declarations that all Kokkos execution spaces must provide. + //@{ + + //! Tag this class as a kokkos execution space + using execution_space = Cuda; + +#if defined(KOKKOS_ENABLE_CUDA_UVM) + //! This execution space's preferred memory space. + using memory_space = CudaUVMSpace; +#else + //! This execution space's preferred memory space. + using memory_space = CudaSpace; +#endif + + //! This execution space preferred device_type + using device_type = Kokkos::Device<execution_space, memory_space>; + + //! The size_type best suited for this execution space. + using size_type = memory_space::size_type; + + //! This execution space's preferred array layout. + using array_layout = LayoutLeft; + + //! + using scratch_memory_space = ScratchMemorySpace<Cuda>; + + //@} + //-------------------------------------------------- + //! \name Functions that all Kokkos devices must implement. + //@{ + + /// \brief True if and only if this method is being called in a + /// thread-parallel function. + KOKKOS_INLINE_FUNCTION static int in_parallel() { +#if defined(__CUDA_ARCH__) + return true; +#else + return false; +#endif + } + + /** \brief Set the device in a "sleep" state. + * + * This function sets the device in a "sleep" state in which it is + * not ready for work. This may consume less resources than if the + * device were in an "awake" state, but it may also take time to + * bring the device from a sleep state to be ready for work. 
+ * + * \return True if the device is in the "sleep" state, else false if + * the device is actively working and could not enter the "sleep" + * state. + */ + static bool sleep(); + + /// \brief Wake the device from the 'sleep' state so it is ready for work. + /// + /// \return True if the device is in the "ready" state, else "false" + /// if the device is actively working (which also means that it's + /// awake). + static bool wake(); + + /// \brief Wait until all dispatched functors complete. + /// + /// The parallel_for or parallel_reduce dispatch of a functor may + /// return asynchronously, before the functor completes. This + /// method does not return until all dispatched functors on this + /// device have completed. + static void impl_static_fence(); + + void fence() const; + + /** \brief Return the maximum amount of concurrency. */ + static int concurrency(); + + //! Print configuration information to the given output stream. + static void print_configuration(std::ostream&, const bool detail = false); + + //@} + //-------------------------------------------------- + //! \name Cuda space instances + + Cuda(); + + Cuda(cudaStream_t stream); + + //-------------------------------------------------------------------------- + //! \name Device-specific functions + //@{ + + struct SelectDevice { + int cuda_device_id; + SelectDevice() : cuda_device_id(0) {} + explicit SelectDevice(int id) : cuda_device_id(id) {} + }; + + //! Free any resources being consumed by the device. + static void impl_finalize(); + + //! Has been initialized + static int impl_is_initialized(); + + //! Initialize, telling the CUDA run-time library which device to use. + static void impl_initialize(const SelectDevice = SelectDevice(), + const size_t num_instances = 1); + + /// \brief Cuda device architecture of the selected device. + /// + /// This matches the __CUDA_ARCH__ specification. + static size_type device_arch(); + + //! Query device count. 
+ static size_type detect_device_count(); + + /** \brief Detect the available devices and their architecture + * as defined by the __CUDA_ARCH__ specification. + */ + static std::vector<unsigned> detect_device_arch(); + + cudaStream_t cuda_stream() const; + int cuda_device() const; + const cudaDeviceProp& cuda_device_prop() const; + + //@} + //-------------------------------------------------------------------------- + + static const char* name(); + + inline Impl::CudaInternal* impl_internal_space_instance() const { + return m_space_instance.get(); + } + uint32_t impl_instance_id() const noexcept { return 0; } + + private: + Kokkos::Impl::HostSharedPtr<Impl::CudaInternal> m_space_instance; +}; + +namespace Tools { +namespace Experimental { +template <> +struct DeviceTypeTraits<Cuda> { + /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling + static constexpr DeviceType id = DeviceType::Cuda; +}; +} // namespace Experimental +} // namespace Tools + +namespace Impl { + +class CudaSpaceInitializer : public ExecSpaceInitializerBase { + public: + CudaSpaceInitializer() = default; + ~CudaSpaceInitializer() = default; + void initialize(const InitArguments& args) final; + void finalize(const bool all_spaces) final; + void fence() final; + void print_configuration(std::ostream& msg, const bool detail) final; +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template <> +struct MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::Cuda::scratch_memory_space> { + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = false }; +}; + +#if defined(KOKKOS_ENABLE_CUDA_UVM) + +// If forcing use of UVM everywhere +// then must assume that CudaUVMSpace +// can be a stand-in for CudaSpace. 
+// This will fail when a strange host-side execution space +// that defines CudaUVMSpace as its preferredmemory space. + +template <> +struct MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::Cuda::scratch_memory_space> { + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = false }; +}; + +#endif + +} // namespace Impl +} // namespace Kokkos + +#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ +#endif /* #ifndef KOKKOS_CUDA_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e10fae93c7ca01ce90f31b5d22ca9bff7d113884 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -0,0 +1,903 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDASPACE_HPP +#define KOKKOS_CUDASPACE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_CUDA) + +#include <Kokkos_Core_fwd.hpp> + +#include <iosfwd> +#include <typeinfo> +#include <string> +#include <memory> + +#include <Kokkos_HostSpace.hpp> +#include <impl/Kokkos_SharedAlloc.hpp> + +#include <impl/Kokkos_Profiling_Interface.hpp> + +#include <Cuda/Kokkos_Cuda_abort.hpp> + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST +extern "C" bool kokkos_impl_cuda_pin_uvm_to_host(); +extern "C" void kokkos_impl_cuda_set_pin_uvm_to_host(bool); +#endif + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Cuda on-device memory management */ + +class CudaSpace { + public: + //! 
Tag this class as a kokkos memory space + using memory_space = CudaSpace; + using execution_space = Kokkos::Cuda; + using device_type = Kokkos::Device<execution_space, memory_space>; + + using size_type = unsigned int; + + /*--------------------------------*/ + + CudaSpace(); + CudaSpace(CudaSpace&& rhs) = default; + CudaSpace(const CudaSpace& rhs) = default; + CudaSpace& operator=(CudaSpace&& rhs) = default; + CudaSpace& operator=(const CudaSpace& rhs) = default; + ~CudaSpace() = default; + + /**\brief Allocate untracked memory in the cuda space */ + void* allocate(const size_t arg_alloc_size) const; + void* allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + /**\brief Deallocate untracked memory in the cuda space */ + void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + private: + template <class, class, class, class> + friend class Kokkos::Experimental::LogicalMemorySpace; + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name() { return m_name; } + + /*--------------------------------*/ + /** \brief Error reporting for HostSpace attempt to access CudaSpace */ + KOKKOS_DEPRECATED static void access_error(); + KOKKOS_DEPRECATED static void access_error(const void* const); + + private: + int m_device; ///< Which Cuda device + + static constexpr const char* m_name = "Cuda"; + 
friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>; +}; +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Cuda memory that is accessible to Host execution space + * through Cuda's unified virtual memory (UVM) runtime. + */ +class CudaUVMSpace { + public: + //! Tag this class as a kokkos memory space + using memory_space = CudaUVMSpace; + using execution_space = Cuda; + using device_type = Kokkos::Device<execution_space, memory_space>; + using size_type = unsigned int; + + /** \brief If UVM capability is available */ + static bool available(); + + /*--------------------------------*/ + /** \brief CudaUVMSpace specific routine */ + KOKKOS_DEPRECATED static int number_of_allocations(); + + /*--------------------------------*/ + + /*--------------------------------*/ + + CudaUVMSpace(); + CudaUVMSpace(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(const CudaUVMSpace& rhs) = default; + CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; + CudaUVMSpace& operator=(const CudaUVMSpace& rhs) = default; + ~CudaUVMSpace() = default; + + /**\brief Allocate untracked memory in the cuda space */ + void* allocate(const size_t arg_alloc_size) const; + void* allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + /**\brief Deallocate untracked memory in the cuda space */ + void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + private: + template <class, class, class, class> + friend class Kokkos::Experimental::LogicalMemorySpace; + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const 
Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name() { return m_name; } + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + static bool cuda_pin_uvm_to_host(); + static void cuda_set_pin_uvm_to_host(bool val); +#endif + /*--------------------------------*/ + + private: + int m_device; ///< Which Cuda device + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + static bool kokkos_impl_cuda_pin_uvm_to_host_v; +#endif + static constexpr const char* m_name = "CudaUVM"; +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Host memory that is accessible to Cuda execution space + * through Cuda's host-pinned memory allocation. + */ +class CudaHostPinnedSpace { + public: + //! 
Tag this class as a kokkos memory space + /** \brief Memory is in HostSpace so use the HostSpace::execution_space */ + using execution_space = HostSpace::execution_space; + using memory_space = CudaHostPinnedSpace; + using device_type = Kokkos::Device<execution_space, memory_space>; + using size_type = unsigned int; + + /*--------------------------------*/ + + CudaHostPinnedSpace(); + CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; + CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace& operator=(const CudaHostPinnedSpace& rhs) = default; + ~CudaHostPinnedSpace() = default; + + /**\brief Allocate untracked memory in the space */ + void* allocate(const size_t arg_alloc_size) const; + void* allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + /**\brief Deallocate untracked memory in the space */ + void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + private: + template <class, class, class, class> + friend class Kokkos::Experimental::LogicalMemorySpace; + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name() { return m_name; } + + private: + static constexpr const char* m_name = "CudaHostPinned"; + + /*--------------------------------*/ +}; + +} // namespace Kokkos + 
+/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +cudaStream_t cuda_get_deep_copy_stream(); + +const std::unique_ptr<Kokkos::Cuda>& cuda_get_deep_copy_space( + bool initialize = true); + +static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::CudaSpace>::assignable, + ""); +static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::CudaUVMSpace>::assignable, + ""); +static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::CudaHostPinnedSpace>::assignable, + ""); + +//---------------------------------------- + +template <> +struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> { + enum : bool { assignable = false }; + enum : bool { accessible = false }; + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaUVMSpace> { + // HostSpace::execution_space != CudaUVMSpace::execution_space + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace> { + // HostSpace::execution_space == CudaHostPinnedSpace::execution_space + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +//---------------------------------------- + +template <> +struct MemorySpaceAccess<Kokkos::CudaSpace, Kokkos::HostSpace> { + enum : bool { assignable = false }; + enum : bool { accessible = false }; + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::CudaSpace, Kokkos::CudaUVMSpace> { + // CudaSpace::execution_space == CudaUVMSpace::execution_space + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +template <> +struct 
MemorySpaceAccess<Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace> { + // CudaSpace::execution_space != CudaHostPinnedSpace::execution_space + enum : bool { assignable = false }; + enum : bool { accessible = true }; // CudaSpace::execution_space + enum : bool { deepcopy = true }; +}; + +//---------------------------------------- +// CudaUVMSpace::execution_space == Cuda +// CudaUVMSpace accessible to both Cuda and Host + +template <> +struct MemorySpaceAccess<Kokkos::CudaUVMSpace, Kokkos::HostSpace> { + enum : bool { assignable = false }; + enum : bool { accessible = false }; // Cuda cannot access HostSpace + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::CudaUVMSpace, Kokkos::CudaSpace> { + // CudaUVMSpace::execution_space == CudaSpace::execution_space + // Can access CudaUVMSpace from Host but cannot access CudaSpace from Host + enum : bool { assignable = false }; + + // CudaUVMSpace::execution_space can access CudaSpace + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace> { + // CudaUVMSpace::execution_space != CudaHostPinnedSpace::execution_space + enum : bool { assignable = false }; + enum : bool { accessible = true }; // CudaUVMSpace::execution_space + enum : bool { deepcopy = true }; +}; + +//---------------------------------------- +// CudaHostPinnedSpace::execution_space == HostSpace::execution_space +// CudaHostPinnedSpace accessible to both Cuda and Host + +template <> +struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace> { + enum : bool { assignable = false }; // Cannot access from Cuda + enum : bool { accessible = true }; // CudaHostPinnedSpace::execution_space + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::CudaSpace> { + enum : bool { assignable = false }; // Cannot access from Host + enum : bool { accessible = 
false }; + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace> { + enum : bool { assignable = false }; // different execution_space + enum : bool { accessible = true }; // same accessibility + enum : bool { deepcopy = true }; +}; + +//---------------------------------------- + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +void DeepCopyAsyncCuda(void* dst, const void* src, size_t n); + +template <> +struct DeepCopy<CudaSpace, CudaSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Cuda&, void* dst, const void* src, size_t); +}; + +template <> +struct DeepCopy<CudaSpace, HostSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Cuda&, void* dst, const void* src, size_t); +}; + +template <> +struct DeepCopy<HostSpace, CudaSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Cuda&, void* dst, const void* src, size_t); +}; + +template <> +struct DeepCopy<CudaUVMSpace, CudaUVMSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<CudaUVMSpace, HostSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, HostSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<HostSpace, CudaUVMSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, 
CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<CudaHostPinnedSpace, CudaHostPinnedSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<CudaHostPinnedSpace, HostSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, HostSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<HostSpace, CudaHostPinnedSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<CudaUVMSpace, CudaSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<CudaSpace, CudaUVMSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<CudaUVMSpace, CudaHostPinnedSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + 
(void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<CudaHostPinnedSpace, CudaUVMSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<CudaSpace, CudaHostPinnedSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <> +struct DeepCopy<CudaHostPinnedSpace, CudaSpace, Cuda> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaSpace, CudaSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaSpace, HostSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct 
DeepCopy<HostSpace, CudaSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaSpace, CudaUVMSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaSpace, CudaHostPinnedSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaUVMSpace, CudaSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaUVMSpace, CudaUVMSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaUVMSpace, CudaHostPinnedSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, 
src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaUVMSpace, HostSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaHostPinnedSpace, CudaSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaHostPinnedSpace, CudaUVMSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaHostPinnedSpace, CudaHostPinnedSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<CudaHostPinnedSpace, HostSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + 
DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<HostSpace, CudaUVMSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<HostSpace, CudaHostPinnedSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncCuda(dst, src, n); + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <> +class SharedAllocationRecord<Kokkos::CudaSpace, void> + : public HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace> { + private: + friend class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>; + friend class SharedAllocationRecordCommon<Kokkos::CudaSpace>; + friend class HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>; + + using RecordBase = SharedAllocationRecord<void, void>; + using base_t = + HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>; + + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + + static ::cudaTextureObject_t attach_texture_object( + const unsigned sizeof_alias, void* const alloc_ptr, + const size_t alloc_size); + +#ifdef KOKKOS_ENABLE_DEBUG + static RecordBase s_root_record; +#endif + + ::cudaTextureObject_t m_tex_obj = 0; + const Kokkos::CudaSpace m_space; + + protected: + 
~SharedAllocationRecord(); + SharedAllocationRecord() = default; + + SharedAllocationRecord( + const Kokkos::CudaSpace& arg_space, const std::string& arg_label, + const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &base_t::deallocate); + + public: + template <typename AliasType> + inline ::cudaTextureObject_t attach_texture_object() { + static_assert((std::is_same<AliasType, int>::value || + std::is_same<AliasType, ::int2>::value || + std::is_same<AliasType, ::int4>::value), + "Cuda texture fetch only supported for alias types of int, " + "::int2, or ::int4"); + + if (m_tex_obj == 0) { + m_tex_obj = attach_texture_object(sizeof(AliasType), + (void*)RecordBase::m_alloc_ptr, + RecordBase::m_alloc_size); + } + + return m_tex_obj; + } + + template <typename AliasType> + inline int attach_texture_object_offset(const AliasType* const ptr) { + // Texture object is attached to the entire allocation range + return ptr - reinterpret_cast<AliasType*>(RecordBase::m_alloc_ptr); + } +}; + +template <> +class SharedAllocationRecord<Kokkos::CudaUVMSpace, void> + : public SharedAllocationRecordCommon<Kokkos::CudaUVMSpace> { + private: + friend class SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>; + + using base_t = SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>; + using RecordBase = SharedAllocationRecord<void, void>; + + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + + static RecordBase s_root_record; + + ::cudaTextureObject_t m_tex_obj = 0; + const Kokkos::CudaUVMSpace m_space; + + protected: + ~SharedAllocationRecord(); + SharedAllocationRecord() = default; + + SharedAllocationRecord( + const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, + const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &base_t::deallocate); + + public: + template <typename AliasType> + inline ::cudaTextureObject_t attach_texture_object() 
{ + static_assert((std::is_same<AliasType, int>::value || + std::is_same<AliasType, ::int2>::value || + std::is_same<AliasType, ::int4>::value), + "Cuda texture fetch only supported for alias types of int, " + "::int2, or ::int4"); + + if (m_tex_obj == 0) { + m_tex_obj = SharedAllocationRecord<Kokkos::CudaSpace, void>:: + attach_texture_object(sizeof(AliasType), + (void*)RecordBase::m_alloc_ptr, + RecordBase::m_alloc_size); + } + + return m_tex_obj; + } + + template <typename AliasType> + inline int attach_texture_object_offset(const AliasType* const ptr) { + // Texture object is attached to the entire allocation range + return ptr - reinterpret_cast<AliasType*>(RecordBase::m_alloc_ptr); + } +}; + +template <> +class SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void> + : public SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace> { + private: + friend class SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>; + + using RecordBase = SharedAllocationRecord<void, void>; + using base_t = SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>; + + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + + static RecordBase s_root_record; + + const Kokkos::CudaHostPinnedSpace m_space; + + protected: + ~SharedAllocationRecord(); + SharedAllocationRecord() = default; + + SharedAllocationRecord( + const Kokkos::CudaHostPinnedSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &deallocate); +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ +#endif /* #define KOKKOS_CUDASPACE_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp 
new file mode 100644 index 0000000000000000000000000000000000000000..55aed13670e69838d94fff2735d421cc49a11835 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -0,0 +1,942 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXECPOLICY_HPP +#define KOKKOS_EXECPOLICY_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_AnalyzePolicy.hpp> +#include <Kokkos_Concepts.hpp> +#include <typeinfo> + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct ParallelForTag {}; +struct ParallelScanTag {}; +struct ParallelReduceTag {}; + +struct ChunkSize { + int value; + ChunkSize(int value_) : value(value_) {} +}; + +/** \brief Execution policy for work over a range of an integral type. + * + * Valid template argument options: + * + * With a specified execution space: + * < ExecSpace , WorkTag , { IntConst | IntType } > + * < ExecSpace , WorkTag , void > + * < ExecSpace , { IntConst | IntType } , void > + * < ExecSpace , void , void > + * + * With the default execution space: + * < WorkTag , { IntConst | IntType } , void > + * < WorkTag , void , void > + * < { IntConst | IntType } , void , void > + * < void , void , void > + * + * IntType is a fundamental integral type + * IntConst is an Impl::integral_constant< IntType , Blocking > + * + * Blocking is the granularity of partitioning the range among threads. + */ +template <class... 
Properties> +class RangePolicy : public Impl::PolicyTraits<Properties...> { + public: + using traits = Impl::PolicyTraits<Properties...>; + + private: + typename traits::execution_space m_space; + typename traits::index_type m_begin; + typename traits::index_type m_end; + typename traits::index_type m_granularity; + typename traits::index_type m_granularity_mask; + + template <class... OtherProperties> + friend class RangePolicy; + + public: + //! Tag this class as an execution policy + using execution_policy = RangePolicy<Properties...>; + using member_type = typename traits::index_type; + using index_type = typename traits::index_type; + + KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const { + return m_space; + } + KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin; } + KOKKOS_INLINE_FUNCTION member_type end() const { return m_end; } + + // TODO: find a better workaround for Clangs weird instantiation order + // This thing is here because of an instantiation error, where the RangePolicy + // is inserted into FunctorValue Traits, which tries decltype on the operator. + // It tries to do this even though the first argument of parallel for clearly + // doesn't match. + void operator()(const int&) const {} + + template <class... OtherProperties> + RangePolicy(const RangePolicy<OtherProperties...>& p) + : traits(p), // base class may contain data such as desired occupancy + m_space(p.m_space), + m_begin(p.m_begin), + m_end(p.m_end), + m_granularity(p.m_granularity), + m_granularity_mask(p.m_granularity_mask) {} + + inline RangePolicy() + : m_space(), + m_begin(0), + m_end(0), + m_granularity(0), + m_granularity_mask(0) {} + + /** \brief Total range */ + inline RangePolicy(const typename traits::execution_space& work_space, + const member_type work_begin, const member_type work_end) + : m_space(work_space), + m_begin(work_begin < work_end ? work_begin : 0), + m_end(work_begin < work_end ? 
work_end : 0), + m_granularity(0), + m_granularity_mask(0) { + set_auto_chunk_size(); + } + + /** \brief Total range */ + inline RangePolicy(const member_type work_begin, const member_type work_end) + : RangePolicy(typename traits::execution_space(), work_begin, work_end) { + set_auto_chunk_size(); + } + + /** \brief Total range */ + template <class... Args> + inline RangePolicy(const typename traits::execution_space& work_space, + const member_type work_begin, const member_type work_end, + Args... args) + : m_space(work_space), + m_begin(work_begin < work_end ? work_begin : 0), + m_end(work_begin < work_end ? work_end : 0), + m_granularity(0), + m_granularity_mask(0) { + set_auto_chunk_size(); + set(args...); + } + + /** \brief Total range */ + template <class... Args> + inline RangePolicy(const member_type work_begin, const member_type work_end, + Args... args) + : RangePolicy(typename traits::execution_space(), work_begin, work_end) { + set_auto_chunk_size(); + set(args...); + } + + private: + inline void set() {} + + public: + template <class... Args> + inline void set(Args...) { + static_assert( + 0 == sizeof...(Args), + "Kokkos::RangePolicy: unhandled constructor arguments encountered."); + } + + template <class... Args> + inline void set(const ChunkSize& chunksize, Args... 
args) { + m_granularity = chunksize.value; + m_granularity_mask = m_granularity - 1; + set(args...); + } + + public: + /** \brief return chunk_size */ + inline member_type chunk_size() const { return m_granularity; } + + /** \brief set chunk_size to a discrete value*/ + inline RangePolicy set_chunk_size(int chunk_size_) const { + RangePolicy p = *this; + p.m_granularity = chunk_size_; + p.m_granularity_mask = p.m_granularity - 1; + return p; + } + + private: + /** \brief finalize chunk_size if it was set to AUTO*/ + inline void set_auto_chunk_size() { + int64_t concurrency = + static_cast<int64_t>(traits::execution_space::concurrency()); + if (concurrency == 0) concurrency = 1; + + if (m_granularity > 0) { + if (!Impl::is_integral_power_of_two(m_granularity)) + Kokkos::abort("RangePolicy blocking granularity must be power of two"); + } + + int64_t new_chunk_size = 1; + while (new_chunk_size * 100 * concurrency < + static_cast<int64_t>(m_end - m_begin)) + new_chunk_size *= 2; + if (new_chunk_size < 128) { + new_chunk_size = 1; + while ((new_chunk_size * 40 * concurrency < + static_cast<int64_t>(m_end - m_begin)) && + (new_chunk_size < 128)) + new_chunk_size *= 2; + } + m_granularity = new_chunk_size; + m_granularity_mask = m_granularity - 1; + } + + public: + /** \brief Subrange for a partition's rank and size. + * + * Typically used to partition a range over a group of threads. + */ + struct WorkRange { + using work_tag = typename RangePolicy<Properties...>::work_tag; + using member_type = typename RangePolicy<Properties...>::member_type; + + KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin; } + KOKKOS_INLINE_FUNCTION member_type end() const { return m_end; } + + /** \brief Subrange for a partition's rank and size. + * + * Typically used to partition a range over a group of threads. 
+ */ + KOKKOS_INLINE_FUNCTION + WorkRange(const RangePolicy& range, const int part_rank, + const int part_size) + : m_begin(0), m_end(0) { + if (part_size) { + // Split evenly among partitions, then round up to the granularity. + const member_type work_part = + ((((range.end() - range.begin()) + (part_size - 1)) / part_size) + + range.m_granularity_mask) & + ~member_type(range.m_granularity_mask); + + m_begin = range.begin() + work_part * part_rank; + m_end = m_begin + work_part; + + if (range.end() < m_begin) m_begin = range.end(); + if (range.end() < m_end) m_end = range.end(); + } + } + + private: + member_type m_begin; + member_type m_end; + WorkRange(); + WorkRange& operator=(const WorkRange&); + }; +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +template <class ExecSpace, class... Properties> +class TeamPolicyInternal : public Impl::PolicyTraits<Properties...> { + private: + using traits = Impl::PolicyTraits<Properties...>; + + public: + using index_type = typename traits::index_type; + + //---------------------------------------- + /** \brief Query maximum team size for a given functor. + * + * This size takes into account execution space concurrency limitations and + * scratch memory space limitations for reductions, team reduce/scan, and + * team shared memory. + * + * This function only works for single-operator functors. + * With multi-operator functors it cannot be determined + * which operator will be called. + */ + template <class FunctorType> + static int team_size_max(const FunctorType&); + + /** \brief Query recommended team size for a given functor. + * + * This size takes into account execution space concurrency limitations and + * scratch memory space limitations for reductions, team reduce/scan, and + * team shared memory. 
+ * + * This function only works for single-operator functors. + * With multi-operator functors it cannot be determined + * which operator will be called. + */ + template <class FunctorType> + static int team_size_recommended(const FunctorType&); + + template <class FunctorType> + static int team_size_recommended(const FunctorType&, const int&); + + template <class FunctorType> + int team_size_recommended(const FunctorType& functor, + const int vector_length); + + //---------------------------------------- + /** \brief Construct policy with the given instance of the execution space */ + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, int team_size_request, + int vector_length_request = 1); + + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, const Kokkos::AUTO_t&, + int vector_length_request = 1); + + /** \brief Construct policy with the default instance of the execution space + */ + TeamPolicyInternal(int league_size_request, int team_size_request, + int vector_length_request = 1); + + TeamPolicyInternal(int league_size_request, const Kokkos::AUTO_t&, + int vector_length_request = 1); + + /* TeamPolicyInternal( int league_size_request , int team_size_request ); + + TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/ + + /** \brief The actual league size (number of teams) of the policy. + * + * This may be smaller than the requested league size due to limitations + * of the execution space. + */ + KOKKOS_INLINE_FUNCTION int league_size() const; + + /** \brief The actual team size (number of threads per team) of the policy. + * + * This may be smaller than the requested team size due to limitations + * of the execution space. 
+ */ + KOKKOS_INLINE_FUNCTION int team_size() const; + + /** \brief Whether the policy has an automatically determined team size + */ + inline bool impl_auto_team_size() const; + /** \brief Whether the policy has an automatically determined vector length + */ + inline bool impl_auto_vector_length() const; + + static int vector_length_max(); + + KOKKOS_INLINE_FUNCTION int impl_vector_length() const; + + inline typename traits::index_type chunk_size() const; + + inline TeamPolicyInternal& set_chunk_size(int chunk_size); + + /** \brief Parallel execution of a functor calls the functor once with + * each member of the execution policy. + */ + struct member_type { + /** \brief Handle to the currently executing team shared scratch memory */ + KOKKOS_INLINE_FUNCTION + typename traits::execution_space::scratch_memory_space team_shmem() const; + + /** \brief Rank of this team within the league of teams */ + KOKKOS_INLINE_FUNCTION int league_rank() const; + + /** \brief Number of teams in the league */ + KOKKOS_INLINE_FUNCTION int league_size() const; + + /** \brief Rank of this thread within this team */ + KOKKOS_INLINE_FUNCTION int team_rank() const; + + /** \brief Number of threads in this team */ + KOKKOS_INLINE_FUNCTION int team_size() const; + + /** \brief Barrier among the threads of this team */ + KOKKOS_INLINE_FUNCTION void team_barrier() const; + + /** \brief Intra-team reduction. Returns join of all values of the team + * members. */ + template <class JoinOp> + KOKKOS_INLINE_FUNCTION typename JoinOp::value_type team_reduce( + const typename JoinOp::value_type, const JoinOp&) const; + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. 
+ * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template <typename Type> + KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const; + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template <typename Type> + KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value, + Type* const global_accum) const; + }; +}; + +struct PerTeamValue { + int value; + PerTeamValue(int arg); +}; + +struct PerThreadValue { + int value; + PerThreadValue(int arg); +}; + +template <class iType, class... Args> +struct ExtractVectorLength { + static inline iType value( + typename std::enable_if<std::is_integral<iType>::value, iType>::type val, + Args...) { + return val; + } + static inline + typename std::enable_if<!std::is_integral<iType>::value, int>::type + value( + typename std::enable_if<!std::is_integral<iType>::value, iType>::type, + Args...) { + return 1; + } +}; + +template <class iType, class... Args> +inline typename std::enable_if<std::is_integral<iType>::value, iType>::type +extract_vector_length(iType val, Args...) { + return val; +} + +template <class iType, class... Args> +inline typename std::enable_if<!std::is_integral<iType>::value, int>::type +extract_vector_length(iType, Args...) 
{ + return 1; +} + +} // namespace Impl + +Impl::PerTeamValue PerTeam(const int& arg); +Impl::PerThreadValue PerThread(const int& arg); + +struct ScratchRequest { + int level; + + int per_team; + int per_thread; + + inline ScratchRequest(const int& level_, + const Impl::PerTeamValue& team_value) { + level = level_; + per_team = team_value.value; + per_thread = 0; + } + + inline ScratchRequest(const int& level_, + const Impl::PerThreadValue& thread_value) { + level = level_; + per_team = 0; + per_thread = thread_value.value; + } + + inline ScratchRequest(const int& level_, const Impl::PerTeamValue& team_value, + const Impl::PerThreadValue& thread_value) { + level = level_; + per_team = team_value.value; + per_thread = thread_value.value; + } + + inline ScratchRequest(const int& level_, + const Impl::PerThreadValue& thread_value, + const Impl::PerTeamValue& team_value) { + level = level_; + per_team = team_value.value; + per_thread = thread_value.value; + } +}; + +// Throws a runtime exception if level is not `0` or `1` +void team_policy_check_valid_storage_level_argument(int level); + +/** \brief Execution policy for parallel work over a league of teams of + * threads. + * + * The work functor is called for each thread of each team such that + * the team's member threads are guaranteed to be concurrent. + * + * The team's threads have access to team shared scratch memory and + * team collective operations. + * + * If the WorkTag is non-void then the first calling argument of the + * work functor's parentheses operator is 'const WorkTag &'. + * This allows a functor to have multiple work member functions. + * + * Order of template arguments does not matter, since the implementation + * uses variadic templates. Each and any of the template arguments can + * be omitted. + * + * Possible Template arguments and their default values: + * ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be + * enabled. 
WorkTag (none): Tag which is used as the first argument for the + * functor operator. Schedule<Type> (Schedule<Static>): Scheduling Policy + * (Dynamic, or Static). IndexType<Type> (IndexType<ExecutionSpace::size_type>: + * Integer Index type used to iterate over the Index space. + * LaunchBounds<unsigned,unsigned> Launch Bounds for CUDA compilation, + * default of LaunchBounds<0,0> indicates no launch bounds specified. + */ +template <class... Properties> +class TeamPolicy + : public Impl::TeamPolicyInternal< + typename Impl::PolicyTraits<Properties...>::execution_space, + Properties...> { + using internal_policy = Impl::TeamPolicyInternal< + typename Impl::PolicyTraits<Properties...>::execution_space, + Properties...>; + + template <class... OtherProperties> + friend class TeamPolicy; + + public: + using traits = Impl::PolicyTraits<Properties...>; + + using execution_policy = TeamPolicy<Properties...>; + + TeamPolicy() : internal_policy(0, AUTO) {} + + /** \brief Construct policy with the given instance of the execution space */ + TeamPolicy(const typename traits::execution_space& space_, + int league_size_request, int team_size_request, + int vector_length_request = 1) + : internal_policy(space_, league_size_request, team_size_request, + vector_length_request) {} + + TeamPolicy(const typename traits::execution_space& space_, + int league_size_request, const Kokkos::AUTO_t&, + int vector_length_request = 1) + : internal_policy(space_, league_size_request, Kokkos::AUTO(), + vector_length_request) {} + + TeamPolicy(const typename traits::execution_space& space_, + int league_size_request, const Kokkos::AUTO_t&, + const Kokkos::AUTO_t&) + : internal_policy(space_, league_size_request, Kokkos::AUTO(), + Kokkos::AUTO()) {} + TeamPolicy(const typename traits::execution_space& space_, + int league_size_request, const int team_size_request, + const Kokkos::AUTO_t&) + : internal_policy(space_, league_size_request, team_size_request, + Kokkos::AUTO()) {} + /** \brief 
Construct policy with the default instance of the execution space + */ + TeamPolicy(int league_size_request, int team_size_request, + int vector_length_request = 1) + : internal_policy(league_size_request, team_size_request, + vector_length_request) {} + + TeamPolicy(int league_size_request, const Kokkos::AUTO_t&, + int vector_length_request = 1) + : internal_policy(league_size_request, Kokkos::AUTO(), + vector_length_request) {} + + TeamPolicy(int league_size_request, const Kokkos::AUTO_t&, + const Kokkos::AUTO_t&) + : internal_policy(league_size_request, Kokkos::AUTO(), Kokkos::AUTO()) {} + TeamPolicy(int league_size_request, const int team_size_request, + const Kokkos::AUTO_t&) + : internal_policy(league_size_request, team_size_request, + Kokkos::AUTO()) {} + + template <class... OtherProperties> + TeamPolicy(const TeamPolicy<OtherProperties...> p) : internal_policy(p) { + // Cannot call converting constructor in the member initializer list because + // it is not a direct base. + internal_policy::traits::operator=(p); + } + + private: + TeamPolicy(const internal_policy& p) : internal_policy(p) {} + + public: + inline TeamPolicy& set_chunk_size(int chunk) { + static_assert(std::is_same<decltype(internal_policy::set_chunk_size(chunk)), + internal_policy&>::value, + "internal set_chunk_size should return a reference"); + return static_cast<TeamPolicy&>(internal_policy::set_chunk_size(chunk)); + } + + inline TeamPolicy& set_scratch_size(const int& level, + const Impl::PerTeamValue& per_team) { + static_assert(std::is_same<decltype(internal_policy::set_scratch_size( + level, per_team)), + internal_policy&>::value, + "internal set_chunk_size should return a reference"); + + team_policy_check_valid_storage_level_argument(level); + return static_cast<TeamPolicy&>( + internal_policy::set_scratch_size(level, per_team)); + } + inline TeamPolicy& set_scratch_size(const int& level, + const Impl::PerThreadValue& per_thread) { + 
team_policy_check_valid_storage_level_argument(level); + return static_cast<TeamPolicy&>( + internal_policy::set_scratch_size(level, per_thread)); + } + inline TeamPolicy& set_scratch_size(const int& level, + const Impl::PerTeamValue& per_team, + const Impl::PerThreadValue& per_thread) { + team_policy_check_valid_storage_level_argument(level); + return static_cast<TeamPolicy&>( + internal_policy::set_scratch_size(level, per_team, per_thread)); + } + inline TeamPolicy& set_scratch_size(const int& level, + const Impl::PerThreadValue& per_thread, + const Impl::PerTeamValue& per_team) { + team_policy_check_valid_storage_level_argument(level); + return static_cast<TeamPolicy&>( + internal_policy::set_scratch_size(level, per_team, per_thread)); + } +}; + +namespace Impl { + +template <typename iType, class TeamMemberType> +struct TeamThreadRangeBoundariesStruct { + private: + KOKKOS_INLINE_FUNCTION static iType ibegin(const iType& arg_begin, + const iType& arg_end, + const iType& arg_rank, + const iType& arg_size) { + return arg_begin + + ((arg_end - arg_begin + arg_size - 1) / arg_size) * arg_rank; + } + + KOKKOS_INLINE_FUNCTION static iType iend(const iType& arg_begin, + const iType& arg_end, + const iType& arg_rank, + const iType& arg_size) { + const iType end_ = + arg_begin + + ((arg_end - arg_begin + arg_size - 1) / arg_size) * (arg_rank + 1); + return end_ < arg_end ? 
end_ : arg_end; + } + + public: + using index_type = iType; + const iType start; + const iType end; + enum { increment = 1 }; + const TeamMemberType& thread; + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct(const TeamMemberType& arg_thread, + const iType& arg_end) + : start( + ibegin(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())), + end(iend(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())), + thread(arg_thread) {} + + KOKKOS_INLINE_FUNCTION + TeamThreadRangeBoundariesStruct(const TeamMemberType& arg_thread, + const iType& arg_begin, const iType& arg_end) + : start(ibegin(arg_begin, arg_end, arg_thread.team_rank(), + arg_thread.team_size())), + end(iend(arg_begin, arg_end, arg_thread.team_rank(), + arg_thread.team_size())), + thread(arg_thread) {} +}; + +template <typename iType, class TeamMemberType> +struct TeamVectorRangeBoundariesStruct { + private: + KOKKOS_INLINE_FUNCTION static iType ibegin(const iType& arg_begin, + const iType& arg_end, + const iType& arg_rank, + const iType& arg_size) { + return arg_begin + + ((arg_end - arg_begin + arg_size - 1) / arg_size) * arg_rank; + } + + KOKKOS_INLINE_FUNCTION static iType iend(const iType& arg_begin, + const iType& arg_end, + const iType& arg_rank, + const iType& arg_size) { + const iType end_ = + arg_begin + + ((arg_end - arg_begin + arg_size - 1) / arg_size) * (arg_rank + 1); + return end_ < arg_end ? 
end_ : arg_end; + } + + public: + using index_type = iType; + const iType start; + const iType end; + enum { increment = 1 }; + const TeamMemberType& thread; + + KOKKOS_INLINE_FUNCTION + TeamVectorRangeBoundariesStruct(const TeamMemberType& arg_thread, + const iType& arg_end) + : start( + ibegin(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())), + end(iend(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())), + thread(arg_thread) {} + + KOKKOS_INLINE_FUNCTION + TeamVectorRangeBoundariesStruct(const TeamMemberType& arg_thread, + const iType& arg_begin, const iType& arg_end) + : start(ibegin(arg_begin, arg_end, arg_thread.team_rank(), + arg_thread.team_size())), + end(iend(arg_begin, arg_end, arg_thread.team_rank(), + arg_thread.team_size())), + thread(arg_thread) {} +}; + +template <typename iType, class TeamMemberType> +struct ThreadVectorRangeBoundariesStruct { + using index_type = iType; + const index_type start; + const index_type end; + enum { increment = 1 }; + + KOKKOS_INLINE_FUNCTION + constexpr ThreadVectorRangeBoundariesStruct(const TeamMemberType, + const index_type& count) noexcept + : start(static_cast<index_type>(0)), end(count) {} + + KOKKOS_INLINE_FUNCTION + constexpr ThreadVectorRangeBoundariesStruct(const index_type& count) noexcept + : start(static_cast<index_type>(0)), end(count) {} + + KOKKOS_INLINE_FUNCTION + constexpr ThreadVectorRangeBoundariesStruct( + const TeamMemberType, const index_type& arg_begin, + const index_type& arg_end) noexcept + : start(static_cast<index_type>(arg_begin)), end(arg_end) {} + + KOKKOS_INLINE_FUNCTION + constexpr ThreadVectorRangeBoundariesStruct( + const index_type& arg_begin, const index_type& arg_end) noexcept + : start(static_cast<index_type>(arg_begin)), end(arg_end) {} +}; + +template <class TeamMemberType> +struct ThreadSingleStruct { + const TeamMemberType& team_member; + KOKKOS_INLINE_FUNCTION + ThreadSingleStruct(const TeamMemberType& team_member_) + : team_member(team_member_) {} +}; + 
+template <class TeamMemberType> +struct VectorSingleStruct { + const TeamMemberType& team_member; + KOKKOS_INLINE_FUNCTION + VectorSingleStruct(const TeamMemberType& team_member_) + : team_member(team_member_) {} +}; + +} // namespace Impl + +/** \brief Execution policy for parallel work over a threads within a team. + * + * The range is split over all threads in a team. The Mapping scheme depends on + * the architecture. This policy is used together with a parallel pattern as a + * nested layer within a kernel launched with the TeamPolicy. This variant + * expects a single count. So the range is (0,count]. + */ +template <typename iType, class TeamMemberType, class _never_use_this_overload> +KOKKOS_INLINE_FUNCTION_DELETED + Impl::TeamThreadRangeBoundariesStruct<iType, TeamMemberType> + TeamThreadRange(const TeamMemberType&, const iType& count) = delete; + +/** \brief Execution policy for parallel work over a threads within a team. + * + * The range is split over all threads in a team. The Mapping scheme depends on + * the architecture. This policy is used together with a parallel pattern as a + * nested layer within a kernel launched with the TeamPolicy. This variant + * expects a begin and end. So the range is (begin,end]. + */ +template <typename iType1, typename iType2, class TeamMemberType, + class _never_use_this_overload> +KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, TeamMemberType> +TeamThreadRange(const TeamMemberType&, const iType1& begin, + const iType2& end) = delete; + +/** \brief Execution policy for parallel work over a threads within a team. + * + * The range is split over all threads in a team. The Mapping scheme depends on + * the architecture. This policy is used together with a parallel pattern as a + * nested layer within a kernel launched with the TeamPolicy. This variant + * expects a single count. So the range is (0,count]. 
+ */ +template <typename iType, class TeamMemberType, class _never_use_this_overload> +KOKKOS_INLINE_FUNCTION_DELETED + Impl::TeamThreadRangeBoundariesStruct<iType, TeamMemberType> + TeamVectorRange(const TeamMemberType&, const iType& count) = delete; + +/** \brief Execution policy for parallel work over a threads within a team. + * + * The range is split over all threads in a team. The Mapping scheme depends on + * the architecture. This policy is used together with a parallel pattern as a + * nested layer within a kernel launched with the TeamPolicy. This variant + * expects a begin and end. So the range is (begin,end]. + */ +template <typename iType1, typename iType2, class TeamMemberType, + class _never_use_this_overload> +KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, TeamMemberType> +TeamVectorRange(const TeamMemberType&, const iType1& begin, + const iType2& end) = delete; + +/** \brief Execution policy for a vector parallel loop. + * + * The range is split over all vector lanes in a thread. The Mapping scheme + * depends on the architecture. This policy is used together with a parallel + * pattern as a nested layer within a kernel launched with the TeamPolicy. This + * variant expects a single count. So the range is (0,count]. 
+ */ +template <typename iType, class TeamMemberType, class _never_use_this_overload> +KOKKOS_INLINE_FUNCTION_DELETED + Impl::ThreadVectorRangeBoundariesStruct<iType, TeamMemberType> + ThreadVectorRange(const TeamMemberType&, const iType& count) = delete; + +template <typename iType1, typename iType2, class TeamMemberType, + class _never_use_this_overload> +KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, TeamMemberType> +ThreadVectorRange(const TeamMemberType&, const iType1& arg_begin, + const iType2& arg_end) = delete; + +namespace Impl { + +template <typename FunctorType, typename TagType, + bool HasTag = !std::is_same<TagType, void>::value> +struct ParallelConstructName; + +template <typename FunctorType, typename TagType> +struct ParallelConstructName<FunctorType, TagType, true> { + ParallelConstructName(std::string const& label) : label_ref(label) { + if (label.empty()) { + default_name = std::string(typeid(FunctorType).name()) + "/" + + typeid(TagType).name(); + } + } + std::string const& get() { + return (label_ref.empty()) ? default_name : label_ref; + } + std::string const& label_ref; + std::string default_name; +}; + +template <typename FunctorType, typename TagType> +struct ParallelConstructName<FunctorType, TagType, false> { + ParallelConstructName(std::string const& label) : label_ref(label) { + if (label.empty()) { + default_name = std::string(typeid(FunctorType).name()); + } + } + std::string const& get() { + return (label_ref.empty()) ? default_name : label_ref; + } + std::string const& label_ref; + std::string default_name; +}; + +} // namespace Impl + +} // namespace Kokkos + +namespace Kokkos { + +namespace Impl { + +template <class PatternTag, class... Args> +struct PatternImplSpecializationFromTag; + +template <class... Args> +struct PatternImplSpecializationFromTag<Kokkos::ParallelForTag, Args...> + : identity<ParallelFor<Args...>> {}; + +template <class... 
Args> +struct PatternImplSpecializationFromTag<Kokkos::ParallelReduceTag, Args...> + : identity<ParallelReduce<Args...>> {}; + +template <class... Args> +struct PatternImplSpecializationFromTag<Kokkos::ParallelScanTag, Args...> + : identity<ParallelScan<Args...>> {}; + +template <class PatternImpl> +struct PatternTagFromImplSpecialization; + +template <class... Args> +struct PatternTagFromImplSpecialization<ParallelFor<Args...>> + : identity<ParallelForTag> {}; + +template <class... Args> +struct PatternTagFromImplSpecialization<ParallelReduce<Args...>> + : identity<ParallelReduceTag> {}; + +template <class... Args> +struct PatternTagFromImplSpecialization<ParallelScan<Args...>> + : identity<ParallelScanTag> {}; + +} // end namespace Impl + +} // namespace Kokkos +#endif /* #define KOKKOS_EXECPOLICY_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Extents.hpp b/packages/kokkos/core/src/Kokkos_Extents.hpp new file mode 100644 index 0000000000000000000000000000000000000000..683b76e1f960836134862d10fb62ab53f55a8463 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Extents.hpp @@ -0,0 +1,175 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_EXTENTS_HPP +#define KOKKOS_KOKKOS_EXTENTS_HPP + +#include <cstddef> +#include <type_traits> +#include <Kokkos_Macros.hpp> + +namespace Kokkos { +namespace Experimental { + +constexpr ptrdiff_t dynamic_extent = -1; + +template <ptrdiff_t... ExtentSpecs> +struct Extents { + /* TODO @enhancement flesh this out more */ +}; + +template <class Exts, ptrdiff_t NewExtent> +struct PrependExtent; + +template <ptrdiff_t... Exts, ptrdiff_t NewExtent> +struct PrependExtent<Extents<Exts...>, NewExtent> { + using type = Extents<NewExtent, Exts...>; +}; + +template <class Exts, ptrdiff_t NewExtent> +struct AppendExtent; + +template <ptrdiff_t... 
Exts, ptrdiff_t NewExtent> +struct AppendExtent<Extents<Exts...>, NewExtent> { + using type = Extents<Exts..., NewExtent>; +}; + +} // end namespace Experimental + +namespace Impl { + +namespace _parse_view_extents_impl { + +template <class T> +struct _all_remaining_extents_dynamic : std::true_type {}; + +template <class T> +struct _all_remaining_extents_dynamic<T*> : _all_remaining_extents_dynamic<T> { +}; + +template <class T, unsigned N> +struct _all_remaining_extents_dynamic<T[N]> : std::false_type {}; + +template <class T, class Result, class = void> +struct _parse_impl { + using type = Result; +}; + +// We have to treat the case of int**[x] specially, since it *doesn't* go +// backwards +template <class T, ptrdiff_t... ExtentSpec> +struct _parse_impl< + T*, Kokkos::Experimental::Extents<ExtentSpec...>, + typename std::enable_if<_all_remaining_extents_dynamic<T>::value>::type> + : _parse_impl<T, Kokkos::Experimental::Extents< + Kokkos::Experimental::dynamic_extent, ExtentSpec...>> { +}; + +// int*(*[x])[y] should still work also (meaning int[][x][][y]) +template <class T, ptrdiff_t... ExtentSpec> +struct _parse_impl< + T*, Kokkos::Experimental::Extents<ExtentSpec...>, + typename std::enable_if<!_all_remaining_extents_dynamic<T>::value>::type> { + using _next = Kokkos::Experimental::AppendExtent< + typename _parse_impl<T, Kokkos::Experimental::Extents<ExtentSpec...>, + void>::type, + Kokkos::Experimental::dynamic_extent>; + using type = typename _next::type; +}; + +template <class T, ptrdiff_t... 
ExtentSpec, unsigned N> +struct _parse_impl<T[N], Kokkos::Experimental::Extents<ExtentSpec...>, void> + : _parse_impl< + T, Kokkos::Experimental::Extents<ExtentSpec..., + ptrdiff_t(N)> // TODO @pedantic this + // could be a + // narrowing cast + > {}; + +} // end namespace _parse_view_extents_impl + +template <class DataType> +struct ParseViewExtents { + using type = typename _parse_view_extents_impl ::_parse_impl< + DataType, Kokkos::Experimental::Extents<>>::type; +}; + +template <class ValueType, ptrdiff_t Ext> +struct ApplyExtent { + using type = ValueType[Ext]; +}; + +template <class ValueType> +struct ApplyExtent<ValueType, Kokkos::Experimental::dynamic_extent> { + using type = ValueType*; +}; + +template <class ValueType, unsigned N, ptrdiff_t Ext> +struct ApplyExtent<ValueType[N], Ext> { + using type = typename ApplyExtent<ValueType, Ext>::type[N]; +}; + +template <class ValueType, ptrdiff_t Ext> +struct ApplyExtent<ValueType*, Ext> { + using type = ValueType * [Ext]; +}; + +template <class ValueType> +struct ApplyExtent<ValueType*, Kokkos::Experimental::dynamic_extent> { + using type = + typename ApplyExtent<ValueType, + Kokkos::Experimental::dynamic_extent>::type*; +}; + +template <class ValueType, unsigned N> +struct ApplyExtent<ValueType[N], Kokkos::Experimental::dynamic_extent> { + using type = + typename ApplyExtent<ValueType, + Kokkos::Experimental::dynamic_extent>::type[N]; +}; + +} // end namespace Impl + +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_EXTENTS_HPP diff --git a/packages/kokkos/core/src/Kokkos_Future.hpp b/packages/kokkos/core/src/Kokkos_Future.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b163bd1fc9018d6275d0a3bc7bb2bcac90d3955e --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Future.hpp @@ -0,0 +1,499 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_FUTURE_HPP +#define KOKKOS_FUTURE_HPP + +//---------------------------------------------------------------------------- + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_TaskScheduler_fwd.hpp> +//---------------------------------------------------------------------------- + +#include <impl/Kokkos_TaskQueue.hpp> +#include <impl/Kokkos_TaskResult.hpp> +#include <impl/Kokkos_TaskBase.hpp> +#include <Kokkos_Atomic.hpp> + +#include <Kokkos_Concepts.hpp> // is_space + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +// For now, hack this in as a partial specialization +// TODO @tasking @cleanup Make this the "normal" class template and make the old +// code the specialization +template <typename ValueType, typename ExecutionSpace, typename QueueType> +class BasicFuture<ValueType, SimpleTaskScheduler<ExecutionSpace, QueueType>> { + public: + using value_type = ValueType; + using execution_space = ExecutionSpace; + using scheduler_type = SimpleTaskScheduler<ExecutionSpace, QueueType>; + using queue_type = typename scheduler_type::task_queue_type; + + private: + template <class, class> + friend class SimpleTaskScheduler; + template <class, class> + friend class BasicFuture; + + using task_base_type = typename scheduler_type::task_base_type; + using task_queue_type = typename scheduler_type::task_queue_type; + + using task_queue_traits = typename scheduler_type::task_queue_traits; + using task_scheduling_info_type = + typename scheduler_type::task_scheduling_info_type; + + using result_storage_type = Impl::TaskResultStorage< + ValueType, + Impl::SchedulingInfoStorage<Impl::RunnableTaskBase<task_queue_traits>, + 
task_scheduling_info_type>>; + + OwningRawPtr<task_base_type> m_task = nullptr; + + KOKKOS_INLINE_FUNCTION + explicit BasicFuture(task_base_type* task) : m_task(task) { + // Note: reference count starts at 2 to account for initial increment + // TODO @tasking @minor DSH verify reference count here and/or encapsulate + // starting reference count closer to here + } + + public: + KOKKOS_INLINE_FUNCTION + BasicFuture() noexcept : m_task(nullptr) {} + + KOKKOS_INLINE_FUNCTION + BasicFuture(BasicFuture&& rhs) noexcept : m_task(std::move(rhs.m_task)) { + rhs.m_task = nullptr; + } + + KOKKOS_INLINE_FUNCTION + BasicFuture(BasicFuture const& rhs) + // : m_task(rhs.m_task) + : m_task(nullptr) { + *static_cast<task_base_type* volatile*>(&m_task) = rhs.m_task; + if (m_task) m_task->increment_reference_count(); + } + + KOKKOS_INLINE_FUNCTION + BasicFuture& operator=(BasicFuture&& rhs) noexcept { + if (m_task != rhs.m_task) { + clear(); + // m_task = std::move(rhs.m_task); + *static_cast<task_base_type* volatile*>(&m_task) = rhs.m_task; + // rhs.m_task reference count is unchanged, since this is a move + } else { + // They're the same, but this is a move, so 1 fewer references now + rhs.clear(); + } + rhs.m_task = nullptr; + return *this; + } + + KOKKOS_INLINE_FUNCTION + BasicFuture& operator=(BasicFuture const& rhs) { + if (m_task != rhs.m_task) { + clear(); + // m_task = rhs.m_task; + *static_cast<task_base_type* volatile*>(&m_task) = rhs.m_task; + } + if (m_task != nullptr) { + m_task->increment_reference_count(); + } + return *this; + } + + //---------------------------------------- + + template <class T, class S> + KOKKOS_INLINE_FUNCTION BasicFuture( + BasicFuture<T, S>&& rhs) noexcept // NOLINT(google-explicit-constructor) + : m_task(std::move(rhs.m_task)) { + static_assert(std::is_same<scheduler_type, void>::value || + std::is_same<scheduler_type, S>::value, + "Moved Futures must have the same scheduler"); + + static_assert(std::is_same<value_type, void>::value || + 
std::is_same<value_type, T>::value, + "Moved Futures must have the same value_type"); + + // reference counts are unchanged, since this is a move + rhs.m_task = nullptr; + } + + template <class T, class S> + KOKKOS_INLINE_FUNCTION BasicFuture( + BasicFuture<T, S> const& rhs) // NOLINT(google-explicit-constructor) + //: m_task(rhs.m_task) + : m_task(nullptr) { + static_assert(std::is_same<scheduler_type, void>::value || + std::is_same<scheduler_type, S>::value, + "Copied Futures must have the same scheduler"); + + static_assert(std::is_same<value_type, void>::value || + std::is_same<value_type, T>::value, + "Copied Futures must have the same value_type"); + + *static_cast<task_base_type* volatile*>(&m_task) = rhs.m_task; + if (m_task) m_task->increment_reference_count(); + } + + template <class T, class S> + KOKKOS_INLINE_FUNCTION BasicFuture& operator=(BasicFuture<T, S> const& rhs) { + static_assert(std::is_same<scheduler_type, void>::value || + std::is_same<scheduler_type, S>::value, + "Assigned Futures must have the same scheduler"); + + static_assert(std::is_same<value_type, void>::value || + std::is_same<value_type, T>::value, + "Assigned Futures must have the same value_type"); + + if (m_task != rhs.m_task) { + clear(); + // m_task = rhs.m_task; + *static_cast<task_base_type* volatile*>(&m_task) = rhs.m_task; + if (m_task != nullptr) { + m_task->increment_reference_count(); + } + } + return *this; + } + + template <class T, class S> + KOKKOS_INLINE_FUNCTION BasicFuture& operator=(BasicFuture<T, S>&& rhs) { + static_assert(std::is_same<scheduler_type, void>::value || + std::is_same<scheduler_type, S>::value, + "Assigned Futures must have the same scheduler"); + + static_assert(std::is_same<value_type, void>::value || + std::is_same<value_type, T>::value, + "Assigned Futures must have the same value_type"); + + if (m_task != rhs.m_task) { + clear(); + // m_task = std::move(rhs.m_task); + *static_cast<task_base_type* volatile*>(&m_task) = rhs.m_task; + // 
rhs.m_task reference count is unchanged, since this is a move + } else { + // They're the same, but this is a move, so 1 fewer references now + rhs.clear(); + } + rhs.m_task = nullptr; + return *this; + } + + KOKKOS_INLINE_FUNCTION + ~BasicFuture() noexcept { clear(); } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + void clear() noexcept { + if (m_task) { + bool should_delete = m_task->decrement_and_check_reference_count(); + if (should_delete) { + static_cast<task_queue_type*>(m_task->ready_queue_base_ptr()) + ->deallocate(std::move(*m_task)); + } + } + // m_task = nullptr; + *static_cast<task_base_type* volatile*>(&m_task) = nullptr; + } + + KOKKOS_INLINE_FUNCTION + bool is_null() const noexcept { return m_task == nullptr; } + + KOKKOS_INLINE_FUNCTION + bool is_ready() const noexcept { + return (m_task == nullptr) || m_task->wait_queue_is_consumed(); + } + + KOKKOS_INLINE_FUNCTION + const typename Impl::TaskResult<ValueType>::reference_type get() const { + KOKKOS_EXPECTS(is_ready()); + return static_cast<result_storage_type*>(m_task)->value_reference(); + // return Impl::TaskResult<ValueType>::get(m_task); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// OLD CODE +//////////////////////////////////////////////////////////////////////////////// + +template <typename ValueType, typename Scheduler> +class BasicFuture { + private: + template <typename, typename> + friend class BasicTaskScheduler; + template <typename, typename> + friend class BasicFuture; + friend class Impl::TaskBase; + template <typename, typename, typename> + friend class Impl::Task; + + //---------------------------------------- + + public: + //---------------------------------------- + + using scheduler_type = Scheduler; + using queue_type = typename scheduler_type::queue_type; + using execution_space = typename scheduler_type::execution_space; + using value_type = ValueType; + + 
//---------------------------------------- + + private: + //---------------------------------------- + + using task_base = Impl::TaskBase; + + task_base* m_task; + + KOKKOS_INLINE_FUNCTION explicit BasicFuture(task_base* task) + : m_task(nullptr) { + if (task) queue_type::assign(&m_task, task); + } + + //---------------------------------------- + + public: + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + bool is_null() const { return nullptr == m_task; } + + KOKKOS_INLINE_FUNCTION + int reference_count() const { + return nullptr != m_task ? m_task->reference_count() : 0; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + void clear() { + if (m_task) queue_type::assign(&m_task, nullptr); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + ~BasicFuture() { clear(); } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + BasicFuture() noexcept : m_task(nullptr) {} + + KOKKOS_INLINE_FUNCTION + BasicFuture(BasicFuture&& rhs) noexcept : m_task(rhs.m_task) { + rhs.m_task = nullptr; + } + + KOKKOS_INLINE_FUNCTION + BasicFuture(const BasicFuture& rhs) : m_task(nullptr) { + if (rhs.m_task) queue_type::assign(&m_task, rhs.m_task); + } + + KOKKOS_INLINE_FUNCTION + BasicFuture& operator=(BasicFuture&& rhs) noexcept { + clear(); + m_task = rhs.m_task; + rhs.m_task = nullptr; + return *this; + } + + KOKKOS_INLINE_FUNCTION + BasicFuture& operator=(BasicFuture const& rhs) { + if (m_task || rhs.m_task) queue_type::assign(&m_task, rhs.m_task); + return *this; + } + + //---------------------------------------- + + template <class T, class S> + KOKKOS_INLINE_FUNCTION BasicFuture( + BasicFuture<T, S>&& rhs) noexcept // NOLINT(google-explicit-constructor) + : m_task(rhs.m_task) { + static_assert(std::is_same<scheduler_type, void>::value || + std::is_same<scheduler_type, S>::value, + "Assigned Futures must have the same scheduler"); + + static_assert(std::is_same<value_type, 
void>::value || + std::is_same<value_type, T>::value, + "Assigned Futures must have the same value_type"); + + rhs.m_task = 0; + } + + template <class T, class S> + KOKKOS_INLINE_FUNCTION BasicFuture( + BasicFuture<T, S> const& rhs) // NOLINT(google-explicit-constructor) + : m_task(nullptr) { + static_assert(std::is_same<scheduler_type, void>::value || + std::is_same<scheduler_type, S>::value, + "Assigned Futures must have the same scheduler"); + + static_assert(std::is_same<value_type, void>::value || + std::is_same<value_type, T>::value, + "Assigned Futures must have the same value_type"); + + if (rhs.m_task) queue_type::assign(&m_task, rhs.m_task); + } + + template <class T, class S> + KOKKOS_INLINE_FUNCTION BasicFuture& operator=(BasicFuture<T, S> const& rhs) { + static_assert(std::is_same<scheduler_type, void>::value || + std::is_same<scheduler_type, S>::value, + "Assigned Futures must have the same scheduler"); + + static_assert(std::is_same<value_type, void>::value || + std::is_same<value_type, T>::value, + "Assigned Futures must have the same value_type"); + + if (m_task || rhs.m_task) queue_type::assign(&m_task, rhs.m_task); + return *this; + } + + template <class T, class S> + KOKKOS_INLINE_FUNCTION BasicFuture& operator=(BasicFuture<T, S>&& rhs) { + static_assert(std::is_same<scheduler_type, void>::value || + std::is_same<scheduler_type, S>::value, + "Assigned Futures must have the same scheduler"); + + static_assert(std::is_same<value_type, void>::value || + std::is_same<value_type, T>::value, + "Assigned Futures must have the same value_type"); + + clear(); + m_task = rhs.m_task; + rhs.m_task = 0; + return *this; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + int is_ready() const noexcept { + return (nullptr == m_task) || + (((task_base*)task_base::LockTag) == m_task->m_wait); + } + + KOKKOS_INLINE_FUNCTION + const typename Impl::TaskResult<ValueType>::reference_type get() const { + if (nullptr == m_task) { + 
Kokkos::abort("Kokkos:::Future::get ERROR: is_null()"); + } + return Impl::TaskResult<ValueType>::get(m_task); + } +}; + +// Is a Future with the given execution space +template <typename, typename ExecSpace = void> +struct is_future : public std::false_type {}; + +template <typename ValueType, typename Scheduler, typename ExecSpace> +struct is_future<BasicFuture<ValueType, Scheduler>, ExecSpace> + : std::integral_constant< + bool, + std::is_same<ExecSpace, typename Scheduler::execution_space>::value || + std::is_void<ExecSpace>::value> {}; + +//////////////////////////////////////////////////////////////////////////////// +// END OLD CODE +//////////////////////////////////////////////////////////////////////////////// + +namespace Impl { + +template <class Arg1, class Arg2> +class ResolveFutureArgOrder { + private: + enum { Arg1_is_space = Kokkos::is_space<Arg1>::value }; + enum { Arg2_is_space = Kokkos::is_space<Arg2>::value }; + enum { Arg1_is_value = !Arg1_is_space && !std::is_same<Arg1, void>::value }; + enum { Arg2_is_value = !Arg2_is_space && !std::is_same<Arg2, void>::value }; + + static_assert(!(Arg1_is_space && Arg2_is_space), + "Future cannot be given two spaces"); + + static_assert(!(Arg1_is_value && Arg2_is_value), + "Future cannot be given two value types"); + + using value_type = typename std::conditional< + Arg1_is_value, Arg1, + typename std::conditional<Arg2_is_value, Arg2, void>::type>::type; + + using execution_space = typename std::conditional< + Arg1_is_space, Arg1, + typename std::conditional<Arg2_is_space, Arg2, + void>::type>::type::execution_space; + + public: + using type = BasicFuture<value_type, TaskScheduler<execution_space>>; +}; + +} // end namespace Impl + +/** + * + * Future< space > // value_type == void + * Future< value > // space == Default + * Future< value , space > + * + */ +template <class Arg1 = void, class Arg2 = void> +using Future = typename Impl::ResolveFutureArgOrder<Arg1, Arg2>::type; + +} // namespace Kokkos + 
+//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_FUTURE */ diff --git a/packages/kokkos/core/src/Kokkos_Graph.hpp b/packages/kokkos/core/src/Kokkos_Graph.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ef6057ae8f00959e11783d6e382b64d76d487fd1 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Graph.hpp @@ -0,0 +1,191 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_GRAPH_HPP +#define KOKKOS_GRAPH_HPP + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_Error.hpp> // KOKKOS_EXPECTS + +#include <Kokkos_Graph_fwd.hpp> +#include <impl/Kokkos_GraphImpl_fwd.hpp> + +// GraphAccess needs to be defined, not just declared +#include <impl/Kokkos_GraphImpl.hpp> + +#include <impl/Kokkos_Utilities.hpp> // fold emulation + +#include <functional> +#include <memory> + +namespace Kokkos { +namespace Experimental { + +//============================================================================== +// <editor-fold desc="Graph"> {{{1 + +template <class ExecutionSpace> +struct KOKKOS_ATTRIBUTE_NODISCARD Graph { + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="public member types"> {{{2 + + using execution_space = ExecutionSpace; + using graph = Graph; + + // </editor-fold> end public member types }}}2 + //---------------------------------------------------------------------------- + + private: + //---------------------------------------------------------------------------- + // <editor-fold desc="friends"> {{{2 + + friend struct Kokkos::Impl::GraphAccess; + + // </editor-fold> end friends }}}2 + //---------------------------------------------------------------------------- + + 
//---------------------------------------------------------------------------- + // <editor-fold desc="private data members"> {{{2 + + using impl_t = Kokkos::Impl::GraphImpl<ExecutionSpace>; + std::shared_ptr<impl_t> m_impl_ptr = nullptr; + + // </editor-fold> end private data members }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="private ctors"> {{{2 + + // Note: only create_graph() uses this constructor, but we can't just make + // that a friend instead of GraphAccess because of the way that friend + // function template injection works. + explicit Graph(std::shared_ptr<impl_t> arg_impl_ptr) + : m_impl_ptr(std::move(arg_impl_ptr)) {} + + // </editor-fold> end private ctors }}}2 + //---------------------------------------------------------------------------- + + public: + ExecutionSpace const& get_execution_space() const { + return m_impl_ptr->get_execution_space(); + } + + void submit() const { + KOKKOS_EXPECTS(bool(m_impl_ptr)) + (*m_impl_ptr).submit(); + } +}; + +// </editor-fold> end Graph }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="when_all"> {{{1 + +template <class... PredecessorRefs> +// constraints (not intended for subsumption, though...) +// ((remove_cvref_t<PredecessorRefs> is a specialization of +// GraphNodeRef with get_root().get_graph_impl() as its GraphImpl) +// && ...) +auto when_all(PredecessorRefs&&... 
arg_pred_refs) { + // TODO @graph @desul-integration check the constraints and preconditions + // once we have folded conjunctions from + // desul + static_assert(sizeof...(PredecessorRefs) > 0, + "when_all() needs at least one predecessor."); + auto graph_ptr_impl = + Kokkos::Impl::GraphAccess::get_graph_weak_ptr( + std::get<0>(std::forward_as_tuple(arg_pred_refs...))) + .lock(); + auto node_ptr_impl = graph_ptr_impl->create_aggregate_ptr(arg_pred_refs...); + graph_ptr_impl->add_node(node_ptr_impl); + KOKKOS_IMPL_FOLD_COMMA_OPERATOR( + graph_ptr_impl->add_predecessor(node_ptr_impl, arg_pred_refs) /* ... */); + return Kokkos::Impl::GraphAccess::make_graph_node_ref( + std::move(graph_ptr_impl), std::move(node_ptr_impl)); +} + +// </editor-fold> end when_all }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="create_graph"> {{{1 + +template <class ExecutionSpace, class Closure> +Graph<ExecutionSpace> create_graph(ExecutionSpace ex, Closure&& arg_closure) { + // Create a shared pointer to the graph: + // We need an attorney class here so we have an implementation friend to + // create a Graph class without graph having public constructors. We can't + // just make `create_graph` itself a friend because of the way that friend + // function template injection works. 
+ auto rv = Kokkos::Impl::GraphAccess::construct_graph(std::move(ex)); + // Invoke the user's graph construction closure + ((Closure &&) arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); + // and given them back the graph + // KOKKOS_ENSURES(rv.m_impl_ptr.use_count() == 1) + return rv; +} + +template < + class ExecutionSpace = DefaultExecutionSpace, + class Closure = Kokkos::Impl::DoNotExplicitlySpecifyThisTemplateParameter> +Graph<ExecutionSpace> create_graph(Closure&& arg_closure) { + return create_graph(ExecutionSpace{}, (Closure &&) arg_closure); +} + +// </editor-fold> end create_graph }}}1 +//============================================================================== + +} // end namespace Experimental +} // namespace Kokkos + +// Even though these things are separable, include them here for now so that +// the user only needs to include Kokkos_Graph.hpp to get the whole facility. +#include <Kokkos_GraphNode.hpp> + +#include <impl/Kokkos_GraphNodeImpl.hpp> +#include <impl/Kokkos_Default_Graph_Impl.hpp> +#include <Cuda/Kokkos_Cuda_Graph_Impl.hpp> +#endif // KOKKOS_GRAPH_HPP diff --git a/packages/kokkos/core/src/Kokkos_GraphNode.hpp b/packages/kokkos/core/src/Kokkos_GraphNode.hpp new file mode 100644 index 0000000000000000000000000000000000000000..56e7d706f6a641467864ec9459660be21493de37 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_GraphNode.hpp @@ -0,0 +1,462 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_GRAPHNODE_HPP +#define KOKKOS_KOKKOS_GRAPHNODE_HPP + +#include <Kokkos_Macros.hpp> + +#include <impl/Kokkos_Error.hpp> // contract macros + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Graph_fwd.hpp> +#include <impl/Kokkos_GraphImpl_fwd.hpp> +#include <Kokkos_Parallel_Reduce.hpp> +#include <impl/Kokkos_GraphImpl_Utilities.hpp> +#include <impl/Kokkos_GraphImpl.hpp> // GraphAccess + +#include <memory> // std::shared_ptr + +namespace Kokkos { +namespace Experimental { + +template <class ExecutionSpace, class Kernel /*= TypeErasedTag*/, + class Predecessor /*= TypeErasedTag*/> +class GraphNodeRef { + //---------------------------------------------------------------------------- + // <editor-fold desc="template parameter constraints"> {{{2 + + // Note: because of these assertions, instantiating this class template is not + // intended to be SFINAE-safe, so do validation before you instantiate. 
+ +// WORKAROUND Could not get it to compile with IBM XL V16.1.1 +#ifndef KOKKOS_COMPILER_IBM + static_assert( + std::is_same<Predecessor, TypeErasedTag>::value || + Kokkos::Impl::is_specialization_of<Predecessor, GraphNodeRef>::value, + "Invalid predecessor template parameter given to GraphNodeRef"); +#endif + + static_assert( + Kokkos::is_execution_space<ExecutionSpace>::value, + "Invalid execution space template parameter given to GraphNodeRef"); + + static_assert(std::is_same<Predecessor, TypeErasedTag>::value || + Kokkos::Impl::is_graph_kernel<Kernel>::value, + "Invalid kernel template parameter given to GraphNodeRef"); + + static_assert(!Kokkos::Impl::is_more_type_erased<Kernel, Predecessor>::value, + "The kernel of a graph node can't be more type-erased than the " + "predecessor"); + + // </editor-fold> end template parameter constraints }}}2 + //---------------------------------------------------------------------------- + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="public member types"> {{{2 + + using execution_space = ExecutionSpace; + using graph_kernel = Kernel; + using graph_predecessor = Predecessor; + + // </editor-fold> end public member types }}}2 + //---------------------------------------------------------------------------- + + private: + //---------------------------------------------------------------------------- + // <editor-fold desc="Friends"> {{{2 + + template <class, class, class> + friend class GraphNodeRef; + friend struct Kokkos::Impl::GraphAccess; + + // </editor-fold> end Friends }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="Private Data Members"> {{{2 + + using graph_impl_t = Kokkos::Impl::GraphImpl<ExecutionSpace>; + std::weak_ptr<graph_impl_t> m_graph_impl; + + // TODO @graphs figure out if we can get away with a 
weak reference here? + // GraphNodeRef instances shouldn't be stored by users outside + // of the create_graph closure, and so if we restructure things + // slightly, we could make it so that the graph owns the + // node_impl_t instance and this only holds a std::weak_ptr to + // it. + using node_impl_t = + Kokkos::Impl::GraphNodeImpl<ExecutionSpace, Kernel, Predecessor>; + std::shared_ptr<node_impl_t> m_node_impl; + + // </editor-fold> end Private Data Members }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="Implementation detail accessors"> {{{2 + + // Internally, use shallow constness + node_impl_t& get_node_impl() const { return *m_node_impl.get(); } + std::shared_ptr<node_impl_t> const& get_node_ptr() const& { + return m_node_impl; + } + std::shared_ptr<node_impl_t> get_node_ptr() && { + return std::move(m_node_impl); + } + std::weak_ptr<graph_impl_t> get_graph_weak_ptr() const { + return m_graph_impl; + } + + // </editor-fold> end Implementation detail accessors }}}2 + //---------------------------------------------------------------------------- + + // TODO kernel name propagation and exposure + + template <class NextKernelDeduced> + auto _then_kernel(NextKernelDeduced&& arg_kernel) const { + // readability note: + // std::remove_cvref_t<NextKernelDeduced> is a specialization of + // Kokkos::Impl::GraphNodeKernelImpl: + static_assert(Kokkos::Impl::is_specialization_of< + Kokkos::Impl::remove_cvref_t<NextKernelDeduced>, + Kokkos::Impl::GraphNodeKernelImpl>::value, + "Kokkos internal error"); + + auto graph_ptr = m_graph_impl.lock(); + KOKKOS_EXPECTS(bool(graph_ptr)) + + using next_kernel_t = Kokkos::Impl::remove_cvref_t<NextKernelDeduced>; + + using return_t = GraphNodeRef<ExecutionSpace, next_kernel_t, GraphNodeRef>; + + auto rv = Kokkos::Impl::GraphAccess::make_graph_node_ref( + m_graph_impl, + 
Kokkos::Impl::GraphAccess::make_node_shared_ptr< + typename return_t::node_impl_t>( + m_node_impl->execution_space_instance(), + Kokkos::Impl::_graph_node_kernel_ctor_tag{}, + (NextKernelDeduced &&) arg_kernel, + // *this is the predecessor + Kokkos::Impl::_graph_node_predecessor_ctor_tag{}, *this)); + + // Add the node itself to the backend's graph data structure, now that + // everything is set up. + graph_ptr->add_node(rv.m_node_impl); + // Add the predecessaor we stored in the constructor above in the backend's + // data structure, now that everything is set up. + graph_ptr->add_predecessor(rv.m_node_impl, *this); + KOKKOS_ENSURES(bool(rv.m_node_impl)) + return rv; + } + + //---------------------------------------------------------------------------- + // <editor-fold desc="Private constructors"> {{{2 + + GraphNodeRef(std::weak_ptr<graph_impl_t> arg_graph_impl, + std::shared_ptr<node_impl_t> arg_node_impl) + : m_graph_impl(std::move(arg_graph_impl)), + m_node_impl(std::move(arg_node_impl)) {} + + // </editor-fold> end Private constructors }}}2 + //---------------------------------------------------------------------------- + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="Constructors, destructors, and assignment"> {{{2 + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // <editor-fold desc="rule of 6 ctors"> {{{3 + + // Copyable and movable (basically just shared_ptr semantics + GraphNodeRef() noexcept = default; + GraphNodeRef(GraphNodeRef const&) = default; + GraphNodeRef(GraphNodeRef&&) noexcept = default; + GraphNodeRef& operator=(GraphNodeRef const&) = default; + GraphNodeRef& operator=(GraphNodeRef&&) noexcept = default; + ~GraphNodeRef() = default; + + // </editor-fold> end rule of 6 ctors }}}3 + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + 
// <editor-fold desc="Type-erasing converting ctor and assignment"> {{{3 + + template < + class OtherKernel, class OtherPredecessor, + typename std::enable_if_t< + // Not a copy/move constructor + !std::is_same<GraphNodeRef, GraphNodeRef<execution_space, OtherKernel, + OtherPredecessor>>::value && + // must be an allowed type erasure of the kernel + Kokkos::Impl::is_compatible_type_erasure<OtherKernel, + graph_kernel>::value && + // must be an allowed type erasure of the predecessor + Kokkos::Impl::is_compatible_type_erasure< + OtherPredecessor, graph_predecessor>::value, + int> = 0> + /* implicit */ + GraphNodeRef( + GraphNodeRef<execution_space, OtherKernel, OtherPredecessor> const& other) + : m_graph_impl(other.m_graph_impl), m_node_impl(other.m_node_impl) {} + + // Note: because this is an implicit conversion (as is supposed to be the + // case with most type-erasing wrappers like this), we don't also need + // a converting assignment operator. + + // </editor-fold> end Type-erasing converting ctor and assignment }}}3 + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + // </editor-fold> end Constructors, destructors, and assignment }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="then_parallel_for"> {{{2 + + template < + class Policy, class Functor, + typename std::enable_if< + // equivalent to: + // requires Kokkos::ExecutionPolicy<remove_cvref_t<Policy>> + is_execution_policy<Kokkos::Impl::remove_cvref_t<Policy>>::value, + // -------------------- + int>::type = 0> + auto then_parallel_for(std::string arg_name, Policy&& arg_policy, + Functor&& functor) const { + //---------------------------------------- + KOKKOS_EXPECTS(!m_graph_impl.expired()) + KOKKOS_EXPECTS(bool(m_node_impl)) + // TODO @graph restore this expectation once we add comparability to space + // instances + // 
KOKKOS_EXPECTS( + // arg_policy.space() == m_graph_impl->get_execution_space()); + + // needs to static assert constraint: DataParallelFunctor<Functor> + + using policy_t = Kokkos::Impl::remove_cvref_t<Policy>; + // constraint check: same execution space type (or defaulted, maybe?) + static_assert( + std::is_same<typename policy_t::execution_space, + execution_space>::value, + // TODO @graph make defaulted execution space work + //|| policy_t::execution_space_is_defaulted, + "Execution Space mismatch between execution policy and graph"); + + auto policy = Experimental::require((Policy &&) arg_policy, + Kokkos::Impl::KernelInGraphProperty{}); + + using next_policy_t = decltype(policy); + using next_kernel_t = + Kokkos::Impl::GraphNodeKernelImpl<ExecutionSpace, next_policy_t, + std::decay_t<Functor>, + Kokkos::ParallelForTag>; + return this->_then_kernel(next_kernel_t{std::move(arg_name), policy.space(), + (Functor &&) functor, + (Policy &&) policy}); + } + + template < + class Policy, class Functor, + typename std::enable_if< + // equivalent to: + // requires Kokkos::ExecutionPolicy<remove_cvref_t<Policy>> + is_execution_policy<Kokkos::Impl::remove_cvref_t<Policy>>::value, + // -------------------- + int>::type = 0> + auto then_parallel_for(Policy&& policy, Functor&& functor) const { + // needs to static assert constraint: DataParallelFunctor<Functor> + return this->then_parallel_for("", (Policy &&) policy, + (Functor &&) functor); + } + + template <class Functor> + auto then_parallel_for(std::string name, std::size_t n, + Functor&& functor) const { + // needs to static assert constraint: DataParallelFunctor<Functor> + return this->then_parallel_for(std::move(name), + Kokkos::RangePolicy<execution_space>(0, n), + (Functor &&) functor); + } + + template <class Functor> + auto then_parallel_for(std::size_t n, Functor&& functor) const { + // needs to static assert constraint: DataParallelFunctor<Functor> + return this->then_parallel_for("", n, (Functor &&) functor); + 
} + + // </editor-fold> end then_parallel_for }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="then_parallel_reduce"> {{{2 + + template < + class Policy, class Functor, class ReturnType, + typename std::enable_if< + // equivalent to: + // requires Kokkos::ExecutionPolicy<remove_cvref_t<Policy>> + is_execution_policy<Kokkos::Impl::remove_cvref_t<Policy>>::value, + // -------------------- + int>::type = 0> + auto then_parallel_reduce(std::string arg_name, Policy&& arg_policy, + Functor&& functor, + ReturnType&& return_value) const { + auto graph_impl_ptr = m_graph_impl.lock(); + KOKKOS_EXPECTS(bool(graph_impl_ptr)) + KOKKOS_EXPECTS(bool(m_node_impl)) + // TODO @graph restore this expectation once we add comparability to space + // instances + // KOKKOS_EXPECTS( + // arg_policy.space() == m_graph_impl->get_execution_space()); + + // needs static assertion of constraint: + // DataParallelReductionFunctor<Functor, ReturnType> + + using policy_t = typename std::remove_cv< + typename std::remove_reference<Policy>::type>::type; + static_assert( + std::is_same<typename policy_t::execution_space, + execution_space>::value, + // TODO @graph make defaulted execution space work + // || policy_t::execution_space_is_defaulted, + "Execution Space mismatch between execution policy and graph"); + + // This is also just an expectation, but it's one that we expect the user + // to interact with (even in release mode), so we should throw an exception + // with an explanation rather than just doing a contract assertion. + // We can't static_assert this because of the way that Reducers store + // whether or not they point to a View as a runtime boolean rather than part + // of the type. 
+ if (Kokkos::Impl::parallel_reduce_needs_fence( + graph_impl_ptr->get_execution_space(), return_value)) { + Kokkos::Impl::throw_runtime_exception( + "Parallel reductions in graphs can't operate on Reducers that " + "reference a scalar because they can't complete synchronously. Use a " + "Kokkos::View instead and keep in mind the result will only be " + "available once the graph is submitted (or in tasks that depend on " + "this one)."); + } + + //---------------------------------------- + // This is a disaster, but I guess it's not a my disaster to fix right now + using return_type_remove_cvref = typename std::remove_cv< + typename std::remove_reference<ReturnType>::type>::type; + static_assert(Kokkos::is_view<return_type_remove_cvref>::value || + Kokkos::is_reducer<return_type_remove_cvref>::value, + "Output argument to parallel reduce in a graph must be a " + "View or a Reducer"); + using return_type = + // Yes, you do really have to do this... + std::conditional_t<Kokkos::is_reducer<return_type_remove_cvref>::value, + return_type_remove_cvref, + const return_type_remove_cvref>; + using functor_type = Kokkos::Impl::remove_cvref_t<Functor>; + // see Kokkos_Parallel_Reduce.hpp for how these details are used there; + // we're just doing the same thing here + using return_value_adapter = + Kokkos::Impl::ParallelReduceReturnValue<void, return_type, + functor_type>; + using functor_adaptor = Kokkos::Impl::ParallelReduceFunctorType< + functor_type, Policy, typename return_value_adapter::value_type, + execution_space>; + // End of Kokkos reducer disaster + //---------------------------------------- + + auto policy = Experimental::require((Policy &&) arg_policy, + Kokkos::Impl::KernelInGraphProperty{}); + + using next_policy_t = decltype(policy); + using next_kernel_t = Kokkos::Impl::GraphNodeKernelImpl< + ExecutionSpace, next_policy_t, typename functor_adaptor::functor_type, + Kokkos::ParallelReduceTag, typename return_value_adapter::reducer_type>; + + return 
this->_then_kernel(next_kernel_t{ + std::move(arg_name), graph_impl_ptr->get_execution_space(), + (Functor &&) functor, (Policy &&) policy, + return_value_adapter::return_value(return_value, functor)}); + } + + template < + class Policy, class Functor, class ReturnType, + typename std::enable_if< + // equivalent to: + // requires Kokkos::ExecutionPolicy<remove_cvref_t<Policy>> + is_execution_policy<Kokkos::Impl::remove_cvref_t<Policy>>::value, + // -------------------- + int>::type = 0> + auto then_parallel_reduce(Policy&& arg_policy, Functor&& functor, + ReturnType&& return_value) const { + return this->then_parallel_reduce("", (Policy &&) arg_policy, + (Functor &&) functor, + (ReturnType &&) return_value); + } + + template <class Functor, class ReturnType> + auto then_parallel_reduce(std::string label, + typename execution_space::size_type idx_end, + Functor&& functor, + ReturnType&& return_value) const { + return this->then_parallel_reduce( + std::move(label), Kokkos::RangePolicy<execution_space>{0, idx_end}, + (Functor &&) functor, (ReturnType &&) return_value); + } + + template <class Functor, class ReturnType> + auto then_parallel_reduce(typename execution_space::size_type idx_end, + Functor&& functor, + ReturnType&& return_value) const { + return this->then_parallel_reduce("", idx_end, (Functor &&) functor, + (ReturnType &&) return_value); + } + + // </editor-fold> end then_parallel_reduce }}}2 + //---------------------------------------------------------------------------- + + // TODO @graph parallel scan, deep copy, etc. 
+}; + +} // end namespace Experimental +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_GRAPHNODE_HPP diff --git a/packages/kokkos/core/src/Kokkos_Graph_fwd.hpp b/packages/kokkos/core/src/Kokkos_Graph_fwd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1ba58e4c8c74ac38d49d883f5795591846ed8488 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Graph_fwd.hpp @@ -0,0 +1,65 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_GRAPH_FWD_HPP +#define KOKKOS_KOKKOS_GRAPH_FWD_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { +namespace Experimental { + +struct TypeErasedTag {}; + +template <class ExecutionSpace> +struct Graph; + +template <class ExecutionSpace, class Kernel = TypeErasedTag, + class Predecessor = TypeErasedTag> +class GraphNodeRef; + +} // end namespace Experimental +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_GRAPH_FWD_HPP diff --git a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d0366b599cf8c80c92812e386ced90f6fa77eb93 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp @@ -0,0 +1,320 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HBWSPACE_HPP +#define KOKKOS_HBWSPACE_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_HBWSPACE + +#include <Kokkos_HostSpace.hpp> + +namespace Kokkos { + +namespace Experimental { + +namespace Impl { + +/// \brief Initialize lock array for arbitrary size atomics. +/// +/// Arbitrary atomics are implemented using a hash table of locks +/// where the hash value is derived from the address of the +/// object for which an atomic operation is performed. 
+/// This function initializes the locks to zero (unset). +void init_lock_array_hbw_space(); + +/// \brief Acquire a lock for the address +/// +/// This function tries to acquire the lock for the hash value derived +/// from the provided ptr. If the lock is successfully acquired the +/// function returns true. Otherwise it returns false. +bool lock_address_hbw_space(void* ptr); + +/// \brief Release lock for the address +/// +/// This function releases the lock for the hash value derived +/// from the provided ptr. This function should only be called +/// after previously successfully acquiring a lock with +/// lock_address. +void unlock_address_hbw_space(void* ptr); + +} // namespace Impl + +} // namespace Experimental + +} // namespace Kokkos + +namespace Kokkos { + +namespace Experimental { + +/// \class HBWSpace +/// \brief Memory management for host memory. +/// +/// HBWSpace is a memory space that governs host memory. "Host" +/// memory means the usual CPU-accessible memory. +class HBWSpace { + public: + //! Tag this class as a kokkos memory space + using memory_space = HBWSpace; + using size_type = size_t; + + /// \typedef execution_space + /// \brief Default execution space for this memory space. + /// + /// Every memory space has a default execution space. This is + /// useful for things like initializing a View (which happens in + /// parallel using the View's default execution space). + using execution_space = Kokkos::DefaultHostExecutionSpace; + + //! 
This memory space preferred device_type + using device_type = Kokkos::Device<execution_space, memory_space>; + + /**\brief Default memory space instance */ + HBWSpace(); + HBWSpace(const HBWSpace& rhs) = default; + HBWSpace& operator=(const HBWSpace&) = default; + ~HBWSpace() = default; + + /**\brief Non-default memory space instance to choose allocation mechansim, + * if available */ + + enum AllocationMechanism { + STD_MALLOC, + POSIX_MEMALIGN, + POSIX_MMAP, + INTEL_MM_ALLOC + }; + + explicit HBWSpace(const AllocationMechanism&); + + /**\brief Allocate untracked memory in the space */ + void* allocate(const size_t arg_alloc_size) const; + void* allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + /**\brief Deallocate untracked memory in the space */ + void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + private: + template <class, class, class, class> + friend class LogicalMemorySpace; + + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name() { return "HBW"; } + + private: + AllocationMechanism m_alloc_mech; + friend class Kokkos::Impl::SharedAllocationRecord< + Kokkos::Experimental::HBWSpace, void>; +}; + +} // namespace Experimental + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl 
{ + +template <> +class SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void> + : public SharedAllocationRecord<void, void> { + private: + friend Kokkos::Experimental::HBWSpace; + + using RecordBase = SharedAllocationRecord<void, void>; + + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + + static void deallocate(RecordBase*); + +#ifdef KOKKOS_ENABLE_DEBUG + /**\brief Root record for tracked allocations from this HBWSpace instance */ + static RecordBase s_root_record; +#endif + + const Kokkos::Experimental::HBWSpace m_space; + + protected: + ~SharedAllocationRecord() +#if defined( \ + KOKKOS_IMPL_INTEL_WORKAROUND_NOEXCEPT_SPECIFICATION_VIRTUAL_FUNCTION) + noexcept +#endif + ; + SharedAllocationRecord() = default; + + SharedAllocationRecord( + const Kokkos::Experimental::HBWSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &deallocate); + + public: + inline std::string get_label() const { + return std::string(RecordBase::head()->m_label); + } + + KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( + const Kokkos::Experimental::HBWSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size) { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); +#else + return (SharedAllocationRecord*)0; +#endif + } + + /**\brief Allocate tracked memory in the space */ + static void* allocate_tracked(const Kokkos::Experimental::HBWSpace& arg_space, + const std::string& arg_label, + const size_t arg_alloc_size); + + /**\brief Reallocate tracked memory in the space */ + static void* reallocate_tracked(void* const arg_alloc_ptr, + const size_t arg_alloc_size); + + /**\brief Deallocate tracked memory in the space */ + static void deallocate_tracked(void* const arg_alloc_ptr); + + static 
SharedAllocationRecord* get_record(void* arg_alloc_ptr); + + static void print_records(std::ostream&, + const Kokkos::Experimental::HBWSpace&, + bool detail = false); +}; + +} // namespace Impl + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::HBWSpace, + Kokkos::Experimental::HBWSpace>::assignable, + ""); + +template <> +struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::Experimental::HBWSpace> { + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::Experimental::HBWSpace, Kokkos::HostSpace> { + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +} // namespace Impl + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace, + ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } + + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence(); + memcpy(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } + + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence(); + memcpy(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } + + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + 
exec.fence(); + memcpy(dst, src, n); + } +}; + +} // namespace Impl + +} // namespace Kokkos + +#endif +#endif // #define KOKKOS_HBWSPACE_HPP diff --git a/packages/kokkos/core/src/Kokkos_HIP.hpp b/packages/kokkos/core/src/Kokkos_HIP.hpp new file mode 100644 index 0000000000000000000000000000000000000000..33cf8321c80282d5346c66afb5ee9b4be589576b --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_HIP.hpp @@ -0,0 +1,67 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_HPP +#define KOKKOS_HIP_HPP + +#include <Kokkos_Core_fwd.hpp> + +#if defined(KOKKOS_ENABLE_HIP) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include <Kokkos_HIP_Space.hpp> +#include <Kokkos_Parallel.hpp> +#include <impl/Kokkos_Tags.hpp> + +#include <HIP/Kokkos_HIP_Instance.hpp> +#include <HIP/Kokkos_HIP_MDRangePolicy.hpp> +#include <HIP/Kokkos_HIP_Parallel_Range.hpp> +#include <HIP/Kokkos_HIP_Parallel_MDRange.hpp> +#include <HIP/Kokkos_HIP_Parallel_Team.hpp> +#include <HIP/Kokkos_HIP_UniqueToken.hpp> + +#endif +#endif diff --git a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp new file mode 100644 index 0000000000000000000000000000000000000000..17bd681aa4b7b7aa8d98bb8253c86db81de6ce05 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp @@ -0,0 +1,644 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIPSPACE_HPP +#define KOKKOS_HIPSPACE_HPP + +#include <Kokkos_Core_fwd.hpp> + +#if defined(KOKKOS_ENABLE_HIP) + +#include <iosfwd> +#include <typeinfo> +#include <string> +#include <cstddef> +#include <iosfwd> + +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_ScratchSpace.hpp> + +#include <impl/Kokkos_Profiling_Interface.hpp> +#include <impl/Kokkos_ExecSpaceInitializer.hpp> +#include <impl/Kokkos_HostSharedPtr.hpp> + +#include <hip/hip_runtime_api.h> +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Experimental { +/** \brief HIP on-device memory management */ + +class HIPSpace { + public: + //! Tag this class as a kokkos memory space + using memory_space = HIPSpace; + using execution_space = Kokkos::Experimental::HIP; + using device_type = Kokkos::Device<execution_space, memory_space>; + + using size_type = unsigned int; + + /*--------------------------------*/ + + HIPSpace(); + HIPSpace(HIPSpace&& rhs) = default; + HIPSpace(const HIPSpace& rhs) = default; + HIPSpace& operator=(HIPSpace&& rhs) = default; + HIPSpace& operator=(const HIPSpace& rhs) = default; + ~HIPSpace() = default; + + /**\brief Allocate untracked memory in the hip space */ + void* allocate(const size_t arg_alloc_size) const; + void* allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + /**\brief Deallocate untracked memory in the hip space */ + void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + private: + template <class, class, class, class> + friend class LogicalMemorySpace; + void* impl_allocate(const char* arg_label, const size_t 
arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name() { return "HIP"; } + + /*--------------------------------*/ + /** \brief Error reporting for HostSpace attempt to access HIPSpace */ + KOKKOS_DEPRECATED static void access_error(); + KOKKOS_DEPRECATED static void access_error(const void* const); + + private: + int m_device; ///< Which HIP device + + friend class Kokkos::Impl::SharedAllocationRecord< + Kokkos::Experimental::HIPSpace, void>; +}; + +} // namespace Experimental +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Experimental { +/** \brief Host memory that is accessible to HIP execution space + * through HIP's host-pinned memory allocation. + */ +class HIPHostPinnedSpace { + public: + //! 
Tag this class as a kokkos memory space + /** \brief Memory is in HostSpace so use the HostSpace::execution_space */ + using execution_space = HostSpace::execution_space; + using memory_space = HIPHostPinnedSpace; + using device_type = Kokkos::Device<execution_space, memory_space>; + using size_type = unsigned int; + + /*--------------------------------*/ + + HIPHostPinnedSpace(); + HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; + HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace& operator=(const HIPHostPinnedSpace& rhs) = default; + ~HIPHostPinnedSpace() = default; + + /**\brief Allocate untracked memory in the space */ + void* allocate(const size_t arg_alloc_size) const; + void* allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + /**\brief Deallocate untracked memory in the space */ + void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + private: + template <class, class, class, class> + friend class LogicalMemorySpace; + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name() { return "HIPHostPinned"; } + + /*--------------------------------*/ +}; +} // namespace Experimental +} // namespace Kokkos + 
+/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIPSpace>::assignable, + ""); + +//---------------------------------------- + +template <> +struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::Experimental::HIPSpace> { + enum : bool { assignable = false }; + enum : bool { accessible = false }; + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::Experimental::HIPHostPinnedSpace> { + // HostSpace::execution_space == HIPHostPinnedSpace::execution_space + enum : bool { assignable = true }; + enum : bool { accessible = true }; + enum : bool { deepcopy = true }; +}; + +//---------------------------------------- + +template <> +struct MemorySpaceAccess<Kokkos::Experimental::HIPSpace, Kokkos::HostSpace> { + enum : bool { assignable = false }; + enum : bool { accessible = false }; + enum : bool { deepcopy = true }; +}; + +template <> +struct MemorySpaceAccess<Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIPHostPinnedSpace> { + // HIPSpace::execution_space != HIPHostPinnedSpace::execution_space + enum : bool { assignable = false }; + enum : bool { accessible = true }; // HIPSpace::execution_space + enum : bool { deepcopy = true }; +}; + +//---------------------------------------- +// HIPHostPinnedSpace::execution_space == HostSpace::execution_space +// HIPHostPinnedSpace accessible to both HIP and Host + +template <> +struct MemorySpaceAccess<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::HostSpace> { + enum : bool { assignable = false }; // Cannot access from HIP + enum : bool { accessible = true }; // HIPHostPinnedSpace::execution_space + enum : bool { deepcopy = true }; +}; + +template <> +struct 
MemorySpaceAccess<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPSpace> { + enum : bool { assignable = false }; // Cannot access from Host + enum : bool { accessible = false }; + enum : bool { deepcopy = true }; +}; + +}; // namespace Impl +//---------------------------------------- + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +void DeepCopyAsyncHIP(void* dst, const void* src, size_t n); + +template <> +struct DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIP> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src, + size_t); +}; + +template <> +struct DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, + Kokkos::Experimental::HIP> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src, + size_t); +}; + +template <> +struct DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIP> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src, + size_t); +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace, + ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>( + dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncHIP(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + 
(void)DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, + Kokkos::Experimental::HIP>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncHIP(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIP>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncHIP(dst, src, n); + } +}; + +template <> +struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIP> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src, + size_t); +}; + +template <> +struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace, + Kokkos::Experimental::HIP> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src, + size_t); +}; + +template <> +struct DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIP> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src, + size_t); +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIPHostPinnedSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, + Kokkos::Experimental::HIP>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncHIP(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct 
DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIP>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncHIP(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPHostPinnedSpace, ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIP>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncHIP(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace, + ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace, + Kokkos::Experimental::HIP>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncHIP(dst, src, n); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace, + ExecutionSpace> { + inline DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIP>(dst, src, n); + } + + inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, + size_t n) { + exec.fence(); + DeepCopyAsyncHIP(dst, src, n); + } +}; +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <> +class SharedAllocationRecord<Kokkos::Experimental::HIPSpace, void> + : public HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::HIPSpace> { + private: + friend class SharedAllocationRecordCommon<Kokkos::Experimental::HIPSpace>; + friend class HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::HIPSpace>; + using base_t = HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::HIPSpace>; + using RecordBase = SharedAllocationRecord<void, void>; + + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + +#ifdef KOKKOS_ENABLE_DEBUG + static RecordBase s_root_record; +#endif + + const Kokkos::Experimental::HIPSpace m_space; + + protected: + ~SharedAllocationRecord(); + + SharedAllocationRecord( + const Kokkos::Experimental::HIPSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &base_t::deallocate); +}; + +template <> +class SharedAllocationRecord<Kokkos::Experimental::HIPHostPinnedSpace, void> + : public SharedAllocationRecordCommon< + Kokkos::Experimental::HIPHostPinnedSpace> { + private: + friend class SharedAllocationRecordCommon< + Kokkos::Experimental::HIPHostPinnedSpace>; + using base_t = + SharedAllocationRecordCommon<Kokkos::Experimental::HIPHostPinnedSpace>; + using RecordBase = SharedAllocationRecord<void, void>; + + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + +#ifdef KOKKOS_ENABLE_DEBUG + static RecordBase s_root_record; +#endif + + const Kokkos::Experimental::HIPHostPinnedSpace m_space; + + protected: + ~SharedAllocationRecord(); + SharedAllocationRecord() = default; + + SharedAllocationRecord( + const 
Kokkos::Experimental::HIPHostPinnedSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &base_t::deallocate); +}; +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +namespace Impl { +class HIPInternal; +} +/// \class HIP +/// \brief Kokkos device for multicore processors in the host memory space. +class HIP { + public: + //------------------------------------ + //! \name Type declarations that all Kokkos devices must provide. + //@{ + + //! Tag this class as a kokkos execution space + using execution_space = HIP; + using memory_space = HIPSpace; + using device_type = Kokkos::Device<execution_space, memory_space>; + + using array_layout = LayoutLeft; + using size_type = HIPSpace::size_type; + + using scratch_memory_space = ScratchMemorySpace<HIP>; + + HIP(); + HIP(hipStream_t stream); + + //@} + //------------------------------------ + //! \name Functions that all Kokkos devices must implement. + //@{ + + KOKKOS_INLINE_FUNCTION static int in_parallel() { +#if defined(__HIP_ARCH__) + return true; +#else + return false; +#endif + } + + /** \brief Wait until all dispatched functors complete. + * + * The parallel_for or parallel_reduce dispatch of a functor may return + * asynchronously, before the functor completes. This method does not return + * until all dispatched functors on this device have completed. + */ + static void impl_static_fence(); + + void fence() const; + + hipStream_t hip_stream() const; + + /// \brief Print configuration information to the given output stream. + static void print_configuration(std::ostream&, const bool detail = false); + + /// \brief Free any resources being consumed by the device. + static void impl_finalize(); + + /** \brief Initialize the device. 
+ * + */ + struct SelectDevice { + int hip_device_id; + SelectDevice() : hip_device_id(0) {} + explicit SelectDevice(int id) : hip_device_id(id) {} + }; + + int hip_device() const; + static hipDeviceProp_t const& hip_device_prop(); + + static void impl_initialize(const SelectDevice = SelectDevice()); + + static int impl_is_initialized(); + + // static size_type device_arch(); + + static size_type detect_device_count(); + + static int concurrency(); + static const char* name(); + + inline Impl::HIPInternal* impl_internal_space_instance() const { + return m_space_instance.get(); + } + + uint32_t impl_instance_id() const noexcept { return 0; } + + private: + Kokkos::Impl::HostSharedPtr<Impl::HIPInternal> m_space_instance; +}; +} // namespace Experimental +namespace Tools { +namespace Experimental { +template <> +struct DeviceTypeTraits<Kokkos::Experimental::HIP> { + static constexpr DeviceType id = DeviceType::HIP; +}; +} // namespace Experimental +} // namespace Tools + +namespace Impl { + +class HIPSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase { + public: + HIPSpaceInitializer() = default; + ~HIPSpaceInitializer() = default; + void initialize(const InitArguments& args) final; + void finalize(const bool) final; + void fence() final; + void print_configuration(std::ostream& msg, const bool detail) final; +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +template <> +struct MemorySpaceAccess<Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIP::scratch_memory_space> { + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = false }; +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #if defined( KOKKOS_ENABLE_HIP ) */ +#endif /* #define KOKKOS_HIPSPACE_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_HPX.hpp b/packages/kokkos/core/src/Kokkos_HPX.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..2100b49c116cfaecd35205aa60708ed1535578ca --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_HPX.hpp @@ -0,0 +1,2683 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HPX_HPP +#define KOKKOS_HPX_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_HPX) + +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_HostSpace.hpp> +#include <cstddef> +#include <iosfwd> + +#ifdef KOKKOS_ENABLE_HBWSPACE +#include <Kokkos_HBWSpace.hpp> +#endif + +#include <HPX/Kokkos_HPX_ChunkedRoundRobinExecutor.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_ScratchSpace.hpp> +#include <Kokkos_TaskScheduler.hpp> +#include <impl/Kokkos_ConcurrentBitset.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> +#include <impl/Kokkos_FunctorAnalysis.hpp> +#include <impl/Kokkos_Tools.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_TaskQueue.hpp> +#include <impl/Kokkos_ExecSpaceInitializer.hpp> + +#include <KokkosExp_MDRangePolicy.hpp> + +#include <hpx/apply.hpp> +#include <hpx/hpx_start.hpp> +#include <hpx/include/util.hpp> +#include <hpx/lcos/local/barrier.hpp> +#include <hpx/lcos/local/latch.hpp> +#include <hpx/parallel/algorithms/for_loop.hpp> +#include <hpx/parallel/algorithms/reduce.hpp> +#include <hpx/parallel/executors/static_chunk_size.hpp> +#include <hpx/runtime.hpp> +#include <hpx/runtime/threads/run_as_hpx_thread.hpp> +#include <hpx/runtime/threads/threadmanager.hpp> +#include <hpx/runtime/thread_pool_helpers.hpp> + +#include <Kokkos_UniqueToken.hpp> + +#include <functional> +#include <iostream> +#include <memory> +#include <sstream> +#include <stdexcept> +#include <type_traits> +#include <vector> + +// There are currently two different implementations for the parallel dispatch +// functions: +// +// - 0: The HPX way. Unfortunately, this comes with unnecessary +// overheads at the moment, so there is +// - 1: The manual way. 
This way is more verbose and does not take advantage of +// e.g. parallel::for_loop in HPX but it is significantly faster in many +// benchmarks. +// - 2: Like 1, but spawn tasks using for_loop and a custom executor. +// +// In the long run 0 should be the preferred implementation, but until HPX is +// improved 1 will be the default. +#ifndef KOKKOS_HPX_IMPLEMENTATION +#define KOKKOS_HPX_IMPLEMENTATION 1 +#endif + +#if (KOKKOS_HPX_IMPLEMENTATION < 0) || (KOKKOS_HPX_IMPLEMENTATION > 2) +#error "You have chosen an invalid value for KOKKOS_HPX_IMPLEMENTATION" +#endif + +// [note 1] +// +// When using the asynchronous backend and independent instances, we explicitly +// reset the shared data at the end of a parallel task (execute_task). We do +// this to avoid circular references with shared pointers that would otherwise +// never be released. +// +// The HPX instance holds shared data for the instance in a shared_ptr. One of +// the pieces of shared data is the future that we use to sequence parallel +// dispatches. When a parallel task is launched, a copy of the closure +// (ParallelFor, ParallelReduce, etc.) is captured in the task. The closure +// also holds the policy, the policy holds the HPX instance, the instance holds +// the shared data (for use of buffers in the parallel task). When attaching a +// continuation to a future, the continuation is stored in the future (shared +// state). This means that there is a cycle future -> continuation -> closure +// -> policy -> HPX -> shared data -> future. We break this by releasing the +// shared data early, as (the pointer to) the shared data will not be used +// anymore by the closure at the end of execute_task. +// +// We also mark the shared instance data as mutable so that we can reset it +// from the const execute_task member function. 
namespace Kokkos {
namespace Impl {
// Per-thread scratch storage: one cache-line-padded slice of a single heap
// allocation per thread, so slices of different threads never share a cache
// line (avoids false sharing). The allocation only ever grows; resize()
// reuses the existing allocation when it is already large enough. Contents
// are NOT preserved or zeroed across resize() -- this is scratch space only.
class thread_buffer {
  static constexpr std::size_t m_cache_line_size = 64;

  std::size_t m_num_threads;      // number of per-thread slices
  std::size_t m_size_per_thread;  // slice size, padded to a cache-line multiple
  std::size_t m_size_total;       // capacity of m_data in bytes
  char *m_data;                   // single owning allocation (nullptr if none)

  // Round `size` up to the next multiple of the cache-line size.
  void pad_to_cache_line(std::size_t &size) {
    size = ((size + m_cache_line_size - 1) / m_cache_line_size) *
           m_cache_line_size;
  }

 public:
  thread_buffer()
      : m_num_threads(0),
        m_size_per_thread(0),
        m_size_total(0),
        m_data(nullptr) {}

  // BUG FIX: members must be initialized before delegating to resize().
  // Previously resize() read uninitialized m_size_total and executed
  // delete[] on an uninitialized m_data pointer -- undefined behavior.
  thread_buffer(const std::size_t num_threads,
                const std::size_t size_per_thread)
      : m_num_threads(0),
        m_size_per_thread(0),
        m_size_total(0),
        m_data(nullptr) {
    resize(num_threads, size_per_thread);
  }

  ~thread_buffer() { delete[] m_data; }

  thread_buffer(const thread_buffer &) = delete;
  thread_buffer(thread_buffer &&)      = delete;
  thread_buffer &operator=(const thread_buffer &) = delete;
  // BUG FIX: was `operator=(thread_buffer)` (a by-value overload, not the
  // move-assignment operator); delete the move assignment explicitly to
  // match the deleted move constructor.
  thread_buffer &operator=(thread_buffer &&) = delete;

  // Ensure capacity for `num_threads` slices of (cache-line-padded)
  // `size_per_thread` bytes each. Reuses the current allocation when it is
  // large enough; otherwise reallocates (old contents are discarded).
  void resize(const std::size_t num_threads,
              const std::size_t size_per_thread) {
    m_num_threads     = num_threads;
    m_size_per_thread = size_per_thread;

    pad_to_cache_line(m_size_per_thread);

    std::size_t size_total_new = m_num_threads * m_size_per_thread;

    if (m_size_total < size_total_new) {
      delete[] m_data;
      // Null out before allocating so the destructor cannot double-free if
      // the allocation throws.
      m_data       = nullptr;
      m_data       = new char[size_total_new];
      m_size_total = size_total_new;
    }
  }

  // Pointer to the slice owned by `thread_num`; nullptr when no storage has
  // been allocated yet.
  char *get(std::size_t thread_num) {
    assert(thread_num < m_num_threads);
    if (m_data == nullptr) {
      return nullptr;
    }
    return &m_data[thread_num * m_size_per_thread];
  }

  std::size_t size_per_thread() const noexcept { return m_size_per_thread; }
  std::size_t size_total() const noexcept { return m_size_total; }
};
}  // namespace Impl
}  // namespace Kokkos
instance_data { + instance_data() = default; + instance_data(hpx::shared_future<void> future) : m_future(future) {} + Kokkos::Impl::thread_buffer m_buffer; + hpx::shared_future<void> m_future = hpx::make_ready_future<void>(); + }; + + mutable std::shared_ptr<instance_data> m_independent_instance_data; + static instance_data m_global_instance_data; + + std::reference_wrapper<Kokkos::Impl::thread_buffer> m_buffer; + std::reference_wrapper<hpx::shared_future<void>> m_future; +#else + static Kokkos::Impl::thread_buffer m_global_buffer; +#endif + + public: + using execution_space = HPX; + using memory_space = HostSpace; + using device_type = Kokkos::Device<execution_space, memory_space>; + using array_layout = LayoutRight; + using size_type = memory_space::size_type; + using scratch_memory_space = ScratchMemorySpace<HPX>; + +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + HPX() + noexcept + : m_instance_id(0), + m_mode(instance_mode::global), + m_buffer(m_global_instance_data.m_buffer), + m_future(m_global_instance_data.m_future) {} + + HPX(instance_mode mode) + : m_instance_id(mode == instance_mode::independent ? m_next_instance_id++ + : 0), + m_mode(mode), + m_independent_instance_data(mode == instance_mode::independent + ? (new instance_data()) + : nullptr), + m_buffer(mode == instance_mode::independent + ? m_independent_instance_data->m_buffer + : m_global_instance_data.m_buffer), + m_future(mode == instance_mode::independent + ? 
m_independent_instance_data->m_future + : m_global_instance_data.m_future) {} + + HPX(hpx::shared_future<void> future) + : m_instance_id(m_next_instance_id++), + m_mode(instance_mode::independent), + + m_independent_instance_data(new instance_data(future)), + m_buffer(m_independent_instance_data->m_buffer), + m_future(m_independent_instance_data->m_future) {} + + HPX(const HPX &other) + : m_instance_id(other.m_instance_id), + m_mode(other.m_mode), + m_independent_instance_data(other.m_independent_instance_data), + m_buffer(other.m_buffer), + m_future(other.m_future) {} + + HPX &operator=(const HPX &other) { + m_instance_id = + other.m_mode == instance_mode::independent ? m_next_instance_id++ : 0; + m_mode = other.m_mode; + m_independent_instance_data = other.m_independent_instance_data; + m_buffer = m_mode == instance_mode::independent + ? m_independent_instance_data->m_buffer + : m_global_instance_data.m_buffer; + m_future = m_mode == instance_mode::independent + ? m_independent_instance_data->m_future + : m_global_instance_data.m_future; + return *this; + } +#else + HPX() noexcept {} +#endif + + static void print_configuration(std::ostream &, + const bool /* verbose */ = false) { + std::cout << "HPX backend" << std::endl; + } + uint32_t impl_instance_id() const noexcept { return m_instance_id; } + +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + static bool in_parallel(HPX const &instance = HPX()) noexcept { + return !instance.impl_get_future().is_ready(); + } +#else + static bool in_parallel(HPX const & = HPX()) noexcept { return false; } +#endif + +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + static void impl_decrement_active_parallel_region_count() { + --m_active_parallel_region_count; + } + + static void impl_increment_active_parallel_region_count() { + ++m_active_parallel_region_count; + } + + void impl_fence_instance() const { + if (hpx::threads::get_self_ptr() == nullptr) { + hpx::threads::run_as_hpx_thread([this]() { impl_get_future().wait(); }); + } 
else { + impl_get_future().wait(); + } + } + + void impl_fence_all_instances() const { + hpx::util::yield_while( + []() { return m_active_parallel_region_count.load() != 0; }); + } +#endif + + void fence() const { +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + if (m_mode == instance_mode::global) { + impl_fence_all_instances(); + } else { + impl_fence_instance(); + } +#endif + } + + static bool is_asynchronous(HPX const & = HPX()) noexcept { +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + return true; +#else + return false; +#endif + } + + static std::vector<HPX> partition(...) { + Kokkos::abort( + "Kokkos::Experimental::HPX::partition_master: can't partition an HPX " + "instance\n"); + return std::vector<HPX>(); + } + + template <typename F> + static void partition_master(F const &, int requested_num_partitions = 0, + int = 0) { + if (requested_num_partitions > 1) { + Kokkos::abort( + "Kokkos::Experimental::HPX::partition_master: can't partition an " + "HPX instance\n"); + } + } + + static int concurrency(); + static void impl_initialize(int thread_count); + static void impl_initialize(); + static bool impl_is_initialized() noexcept; + static void impl_finalize(); + + static int impl_thread_pool_size() noexcept { + hpx::runtime *rt = hpx::get_runtime_ptr(); + if (rt == nullptr) { + return 0; + } else { + if (hpx::threads::get_self_ptr() == nullptr) { + return hpx::resource::get_thread_pool(0).get_os_thread_count(); + } else { + return hpx::this_thread::get_pool()->get_os_thread_count(); + } + } + } + + static int impl_thread_pool_rank() noexcept { + hpx::runtime *rt = hpx::get_runtime_ptr(); + if (rt == nullptr) { + return 0; + } else { + if (hpx::threads::get_self_ptr() == nullptr) { + return 0; + } else { + return hpx::this_thread::get_pool()->get_pool_index(); + } + } + } + + static int impl_thread_pool_size(int depth) { + if (depth == 0) { + return impl_thread_pool_size(); + } else { + return 1; + } + } + + static int impl_max_hardware_threads() noexcept { 
+ return hpx::threads::hardware_concurrency(); + } + + static int impl_hardware_thread_id() noexcept { + return hpx::get_worker_thread_num(); + } + + Kokkos::Impl::thread_buffer &impl_get_buffer() const noexcept { +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + return m_buffer.get(); +#else + return m_global_buffer; +#endif + } + +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + hpx::shared_future<void> &impl_get_future() const noexcept { + return m_future; + } +#endif + +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + struct KOKKOS_ATTRIBUTE_NODISCARD reset_on_exit_parallel { + HPX const &m_space; + reset_on_exit_parallel(HPX const &space) : m_space(space) {} + ~reset_on_exit_parallel() { + // See [note 1] for an explanation. m_independent_instance_data is + // marked mutable. + m_space.m_independent_instance_data.reset(); + + HPX::impl_decrement_active_parallel_region_count(); + } + }; +#endif + + static constexpr const char *name() noexcept { return "HPX"; } +}; +} // namespace Experimental + +namespace Tools { +namespace Experimental { +template <> +struct DeviceTypeTraits<Kokkos::Experimental::HPX> { + static constexpr DeviceType id = DeviceType::HPX; +}; +} // namespace Experimental +} // namespace Tools + +namespace Impl { + +class HPXSpaceInitializer : public ExecSpaceInitializerBase { + public: + HPXSpaceInitializer() = default; + ~HPXSpaceInitializer() = default; + void initialize(const InitArguments &args) final; + void finalize(const bool) final; + void fence() final; + void print_configuration(std::ostream &msg, const bool detail) final; +}; + +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) +template <typename Closure> +inline void dispatch_execute_task(Closure *closure, + Kokkos::Experimental::HPX const &instance, + bool force_synchronous = false) { + Kokkos::Experimental::HPX::impl_increment_active_parallel_region_count(); + + if (hpx::threads::get_self_ptr() == nullptr) { + hpx::threads::run_as_hpx_thread([closure, &instance]() { + 
hpx::shared_future<void> &fut = instance.impl_get_future(); + Closure closure_copy = *closure; + fut = fut.then([closure_copy](hpx::shared_future<void> &&) { + closure_copy.execute_task(); + }); + }); + } else { + hpx::shared_future<void> &fut = instance.impl_get_future(); + Closure closure_copy = *closure; + fut = fut.then([closure_copy](hpx::shared_future<void> &&) { + closure_copy.execute_task(); + }); + } + + if (force_synchronous) { + instance.fence(); + } +} +#else +template <typename Closure> +inline void dispatch_execute_task(Closure *closure, + Kokkos::Experimental::HPX const &, + bool = false) { +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + Kokkos::Experimental::HPX::impl_increment_active_parallel_region_count(); +#endif + + if (hpx::threads::get_self_ptr() == nullptr) { + hpx::threads::run_as_hpx_thread([closure]() { closure->execute_task(); }); + } else { + closure->execute_task(); + } +} +#endif +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { +template <> +struct MemorySpaceAccess<Kokkos::Experimental::HPX::memory_space, + Kokkos::Experimental::HPX::scratch_memory_space> { + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = false }; +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Experimental { +template <> +class UniqueToken<HPX, UniqueTokenScope::Instance> { + private: + using buffer_type = Kokkos::View<uint32_t *, Kokkos::HostSpace>; + int m_count; + buffer_type m_buffer_view; + uint32_t volatile *m_buffer; + + public: + using execution_space = HPX; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken(execution_space const & = execution_space()) noexcept + : m_count(execution_space::impl_max_hardware_threads()), + m_buffer_view(buffer_type()), + m_buffer(nullptr) {} + + UniqueToken(size_type max_size, 
execution_space const & = execution_space()) + : m_count(max_size > execution_space::impl_max_hardware_threads() + ? execution_space::impl_max_hardware_threads() + : max_size), + m_buffer_view( + max_size > execution_space::impl_max_hardware_threads() + ? buffer_type() + : buffer_type("UniqueToken::m_buffer_view", + ::Kokkos::Impl::concurrent_bitset::buffer_bound( + m_count))), + m_buffer(m_buffer_view.data()) {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept { return m_count; } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + if (m_buffer == nullptr) { + return execution_space::impl_hardware_thread_id(); + } else { + const ::Kokkos::pair<int, int> result = + ::Kokkos::Impl::concurrent_bitset::acquire_bounded( + m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count); + + if (result.first < 0) { + ::Kokkos::abort( + "UniqueToken<HPX> failure to acquire tokens, no tokens " + "available"); + } + return result.first; + } +#else + return 0; +#endif + } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release(int i) const noexcept { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + if (m_buffer != nullptr) { + ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i); + } +#else + (void)i; +#endif + } +}; + +template <> +class UniqueToken<HPX, UniqueTokenScope::Global> { + public: + using execution_space = HPX; + using size_type = int; + UniqueToken(execution_space const & = execution_space()) noexcept {} + + // NOTE: Currently this assumes that there is no oversubscription. + // hpx::get_num_worker_threads can't be used directly because it may yield + // it's task (problematic if called after hpx::get_worker_thread_num). 
+ int size() const noexcept { return HPX::impl_max_hardware_threads(); } + int acquire() const noexcept { return HPX::impl_hardware_thread_id(); } + void release(int) const noexcept {} +}; +} // namespace Experimental +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +struct HPXTeamMember { + public: + using execution_space = Kokkos::Experimental::HPX; + using scratch_memory_space = + Kokkos::ScratchMemorySpace<Kokkos::Experimental::HPX>; + + private: + scratch_memory_space m_team_shared; + + int m_league_size; + int m_league_rank; + int m_team_size; + int m_team_rank; + + public: + KOKKOS_INLINE_FUNCTION + const scratch_memory_space &team_shmem() const { + return m_team_shared.set_team_thread_mode(0, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space &team_scratch(const int) const { + return m_team_shared.set_team_thread_mode(0, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space &thread_scratch(const int) const { + return m_team_shared.set_team_thread_mode(0, team_size(), team_rank()); + } + + KOKKOS_INLINE_FUNCTION int league_rank() const noexcept { + return m_league_rank; + } + + KOKKOS_INLINE_FUNCTION int league_size() const noexcept { + return m_league_size; + } + + KOKKOS_INLINE_FUNCTION int team_rank() const noexcept { return m_team_rank; } + KOKKOS_INLINE_FUNCTION int team_size() const noexcept { return m_team_size; } + + template <class... 
Properties> + constexpr KOKKOS_INLINE_FUNCTION HPXTeamMember( + const TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...> + &policy, + const int team_rank, const int league_rank, void *scratch, + int scratch_size) noexcept + : m_team_shared(scratch, scratch_size, scratch, scratch_size), + m_league_size(policy.league_size()), + m_league_rank(league_rank), + m_team_size(policy.team_size()), + m_team_rank(team_rank) {} + + KOKKOS_INLINE_FUNCTION + void team_barrier() const {} + + template <class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType &, const int &) const { + static_assert(std::is_trivially_default_constructible<ValueType>(), + "Only trivial constructible types can be broadcasted"); + } + + template <class Closure, class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure &, ValueType &, + const int &) const { + static_assert(std::is_trivially_default_constructible<ValueType>(), + "Only trivial constructible types can be broadcasted"); + } + + template <class ValueType, class JoinOp> + KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType &value, + const JoinOp &) const { + return value; + } + + template <class ReducerType> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<is_reducer<ReducerType>::value>::type + team_reduce(const ReducerType &) const {} + + template <typename Type> + KOKKOS_INLINE_FUNCTION Type + team_scan(const Type &value, Type *const global_accum = nullptr) const { + if (global_accum) { + Kokkos::atomic_fetch_add(global_accum, value); + } + + return 0; + } +}; + +template <class... Properties> +class TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...> + : public PolicyTraits<Properties...> { + using traits = PolicyTraits<Properties...>; + + int m_league_size; + int m_team_size; + std::size_t m_team_scratch_size[2]; + std::size_t m_thread_scratch_size[2]; + int m_chunk_size; + + public: + //! 
Tag this class as a kokkos execution policy + using execution_policy = TeamPolicyInternal; + + using member_type = HPXTeamMember; + + //! Execution space of this execution policy: + using execution_space = Kokkos::Experimental::HPX; + + // NOTE: Max size is 1 for simplicity. In most cases more than 1 is not + // necessary on CPU. Implement later if there is a need. + template <class FunctorType> + inline static int team_size_max(const FunctorType &) { + return 1; + } + + template <class FunctorType> + inline static int team_size_recommended(const FunctorType &) { + return 1; + } + + template <class FunctorType> + inline static int team_size_recommended(const FunctorType &, const int &) { + return 1; + } + + template <class FunctorType> + int team_size_max(const FunctorType &, const ParallelForTag &) const { + return 1; + } + + template <class FunctorType> + int team_size_max(const FunctorType &, const ParallelReduceTag &) const { + return 1; + } + + template <class FunctorType, class ReducerType> + int team_size_max(const FunctorType &, const ReducerType &, + const ParallelReduceTag &) const { + return 1; + } + + template <class FunctorType> + int team_size_recommended(const FunctorType &, const ParallelForTag &) const { + return 1; + } + + template <class FunctorType> + int team_size_recommended(const FunctorType &, + const ParallelReduceTag &) const { + return 1; + } + + template <class FunctorType, class ReducerType> + int team_size_recommended(const FunctorType &, const ReducerType &, + const ParallelReduceTag &) const { + return 1; + } + + static int vector_length_max() { return 1; } + + inline int impl_vector_length() noexcept { return 1; } + inline bool impl_auto_team_size() noexcept { return false; } + inline bool impl_auto_vector_length() noexcept { return false; } + inline void impl_set_vector_length(int) noexcept {} + inline void impl_set_team_size(int) noexcept {} + + private: + inline void init(const int league_size_request, const int 
team_size_request) { + m_league_size = league_size_request; + const int max_team_size = 1; // TODO: Can't use team_size_max(...) because + // it requires a functor as argument. + m_team_size = + team_size_request > max_team_size ? max_team_size : team_size_request; + + if (m_chunk_size > 0) { + if (!Impl::is_integral_power_of_two(m_chunk_size)) + Kokkos::abort("TeamPolicy blocking granularity must be power of two"); + } else { + int new_chunk_size = 1; + while (new_chunk_size * 4 * Kokkos::Experimental::HPX::concurrency() < + m_league_size) { + new_chunk_size *= 2; + } + + if (new_chunk_size < 128) { + new_chunk_size = 1; + while ((new_chunk_size * Kokkos::Experimental::HPX::concurrency() < + m_league_size) && + (new_chunk_size < 128)) + new_chunk_size *= 2; + } + + m_chunk_size = new_chunk_size; + } + } + + public: + inline int team_size() const { return m_team_size; } + inline int league_size() const { return m_league_size; } + + inline size_t scratch_size(const int &level, int team_size_ = -1) const { + if (team_size_ < 0) { + team_size_ = m_team_size; + } + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + inline static int scratch_size_max(int level) { + return (level == 0 ? 1024 * 32 : // Roughly L1 size + 20 * 1024 * 1024); // Limit to keep compatibility with CUDA + } + + public: + template <class ExecSpace, class... OtherProperties> + friend class TeamPolicyInternal; + + const typename traits::execution_space &space() const { + static typename traits::execution_space m_space; + return m_space; + } + + template <class... 
OtherProperties> + TeamPolicyInternal(const TeamPolicyInternal<Kokkos::Experimental::HPX, + OtherProperties...> &p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + } + + TeamPolicyInternal(const typename traits::execution_space &, + int league_size_request, int team_size_request, + int /* vector_length_request */ = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0) { + init(league_size_request, team_size_request); + } + + TeamPolicyInternal(const typename traits::execution_space &, + int league_size_request, const Kokkos::AUTO_t &, + int /* vector_length_request */ = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0) { + init(league_size_request, 1); + } + + TeamPolicyInternal(const typename traits::execution_space &space, + int league_size_request, + const Kokkos::AUTO_t &, /* team_size_request */ + const Kokkos::AUTO_t & /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0) { + init(league_size_request, 1); + } + + TeamPolicyInternal(const typename traits::execution_space &space, + int league_size_request, int team_size_request, + const Kokkos::AUTO_t & /* vector_length_request */ + ) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0) { + init(league_size_request, team_size_request); + } + + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t &, /* team_size_request */ + const Kokkos::AUTO_t & /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0) { + init(league_size_request, 1); + } + + TeamPolicyInternal(int league_size_request, int team_size_request, + const Kokkos::AUTO_t & 
/* vector_length_request */ + ) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0) { + init(league_size_request, team_size_request); + } + + TeamPolicyInternal(int league_size_request, int team_size_request, + int /* vector_length_request */ = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0) { + init(league_size_request, team_size_request); + } + + TeamPolicyInternal(int league_size_request, const Kokkos::AUTO_t &, + int /* vector_length_request */ = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0) { + init(league_size_request, 1); + } + + inline int chunk_size() const { return m_chunk_size; } + + inline TeamPolicyInternal &set_chunk_size( + typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + inline TeamPolicyInternal &set_scratch_size(const int &level, + const PerTeamValue &per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + inline TeamPolicyInternal &set_scratch_size( + const int &level, const PerThreadValue &per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + inline TeamPolicyInternal &set_scratch_size( + const int &level, const PerTeamValue &per_team, + const PerThreadValue &per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } +}; +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... 
Traits>
// parallel_for over a 1D RangePolicy on the HPX backend.  The index range is
// split into chunks of m_policy.chunk_size(); how the chunks are scheduled is
// selected at compile time by KOKKOS_HPX_IMPLEMENTATION (0, 1 or 2).
class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
                  Kokkos::Experimental::HPX> {
 private:
  using Policy    = Kokkos::RangePolicy<Traits...>;
  using WorkTag   = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member    = typename Policy::member_type;

  const FunctorType m_functor;
  const Policy m_policy;

  // Invoke the functor for a single index (untagged overload).
  template <class TagType>
  static typename std::enable_if<std::is_same<TagType, void>::value>::type
  execute_functor(const FunctorType &functor, const Member i) {
    functor(i);
  }

  // Invoke the functor for a single index, prepending the work tag.
  template <class TagType>
  static typename std::enable_if<!std::is_same<TagType, void>::value>::type
  execute_functor(const FunctorType &functor, const Member i) {
    const TagType t{};
    functor(t, i);
  }

  // Invoke the functor for every index in [i_begin, i_end) (untagged).
  template <class TagType>
  static typename std::enable_if<std::is_same<TagType, void>::value>::type
  execute_functor_range(const FunctorType &functor, const Member i_begin,
                        const Member i_end) {
    for (Member i = i_begin; i < i_end; ++i) {
      functor(i);
    }
  }

  // Invoke the functor for every index in [i_begin, i_end), with the work tag.
  template <class TagType>
  static typename std::enable_if<!std::is_same<TagType, void>::value>::type
  execute_functor_range(const FunctorType &functor, const Member i_begin,
                        const Member i_end) {
    const TagType t{};
    for (Member i = i_begin; i < i_end; ++i) {
      functor(t, i);
    }
  }

 public:
  // May dispatch asynchronously depending on the execution-space settings;
  // the actual work happens in execute_task().
  void execute() const {
    Kokkos::Impl::dispatch_execute_task(this, m_policy.space());
  }

  void execute_task() const {
    // See [note 1] for an explanation.
#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
        m_policy.space());
#endif

#if KOKKOS_HPX_IMPLEMENTATION == 0
    // Variant 0: plain HPX for_loop with static chunking.
    using hpx::parallel::for_loop;
    using hpx::parallel::execution::par;
    using hpx::parallel::execution::static_chunk_size;

    for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
             m_policy.begin(), m_policy.end(), [this](const Member i) {
               execute_functor<WorkTag>(m_functor, i);
             });

#elif KOKKOS_HPX_IMPLEMENTATION == 1
    // Variant 1: one fire-and-forget task per chunk; a latch counts the
    // outstanding chunks down to zero before returning.
    using hpx::apply;
    using hpx::lcos::local::latch;

    const int num_tasks =
        (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
        m_policy.chunk_size();
    latch num_tasks_remaining(num_tasks);
    ChunkedRoundRobinExecutor exec(num_tasks);

    for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
         i_begin += m_policy.chunk_size()) {
      apply(exec, [this, &num_tasks_remaining, i_begin]() {
        const Member i_end =
            (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
        execute_functor_range<WorkTag>(m_functor, i_begin, i_end);

        num_tasks_remaining.count_down(1);
      });
    }

    num_tasks_remaining.wait();

#elif KOKKOS_HPX_IMPLEMENTATION == 2
    // Variant 2: strided for_loop — each iteration handles one chunk,
    // scheduled on the round-robin executor.
    using hpx::parallel::for_loop_strided;
    using hpx::parallel::execution::par;
    using hpx::parallel::execution::static_chunk_size;

    const int num_tasks =
        (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
        m_policy.chunk_size();
    ChunkedRoundRobinExecutor exec(num_tasks);

    for_loop_strided(
        par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
        m_policy.end(), m_policy.chunk_size(), [this](const Member i_begin) {
          const Member i_end =
              (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
          execute_functor_range<WorkTag>(m_functor, i_begin, i_end);
        });
#endif
  }

  inline ParallelFor(const FunctorType &arg_functor, Policy arg_policy)
      : m_functor(arg_functor), m_policy(arg_policy) {}
};

template <class FunctorType, class...
Traits>
// parallel_for over an MDRangePolicy on the HPX backend.  The tile space is
// flattened into a 1D range (m_policy over [0, m_num_tiles)); HostIterateTile
// expands each flat tile index back into the multi-dimensional loop nest.
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                  Kokkos::Experimental::HPX> {
 private:
  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
  using Policy        = typename MDRangePolicy::impl_range_policy;
  using WorkTag       = typename MDRangePolicy::work_tag;
  using WorkRange     = typename Policy::WorkRange;
  using Member        = typename Policy::member_type;
  using iterate_type =
      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
                                             WorkTag, void>;

  const FunctorType m_functor;
  const MDRangePolicy m_mdr_policy;
  const Policy m_policy;  // flattened 1D policy over the tiles

 public:
  void execute() const { dispatch_execute_task(this, m_mdr_policy.space()); }

  inline void execute_task() const {
    // See [note 1] for an explanation.
#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
        m_mdr_policy.space());
#endif

#if KOKKOS_HPX_IMPLEMENTATION == 0
    // Variant 0: plain HPX for_loop with static chunking over tile indices.
    using hpx::parallel::for_loop;
    using hpx::parallel::execution::par;
    using hpx::parallel::execution::static_chunk_size;

    for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
             m_policy.begin(), m_policy.end(), [this](const Member i) {
               iterate_type(m_mdr_policy, m_functor)(i);
             });

#elif KOKKOS_HPX_IMPLEMENTATION == 1
    // Variant 1: one fire-and-forget task per chunk of tiles, synchronized
    // with a latch.
    using hpx::apply;
    using hpx::lcos::local::latch;

    const int num_tasks =
        (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
        m_policy.chunk_size();
    latch num_tasks_remaining(num_tasks);
    ChunkedRoundRobinExecutor exec(num_tasks);

    for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
         i_begin += m_policy.chunk_size()) {
      apply(exec, [this, &num_tasks_remaining, i_begin]() {
        const Member i_end =
            (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
        for (Member i = i_begin; i < i_end; ++i) {
          iterate_type(m_mdr_policy, m_functor)(i);
        }

        num_tasks_remaining.count_down(1);
      });
    }

    num_tasks_remaining.wait();

#elif KOKKOS_HPX_IMPLEMENTATION == 2
    // Variant 2: strided for_loop, one chunk of tiles per iteration.
    using hpx::parallel::for_loop_strided;
    using hpx::parallel::execution::par;
    using hpx::parallel::execution::static_chunk_size;

    const int num_tasks =
        (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
        m_policy.chunk_size();
    ChunkedRoundRobinExecutor exec(num_tasks);

    for_loop_strided(
        par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
        m_policy.end(), m_policy.chunk_size(), [this](const Member i_begin) {
          const Member i_end =
              (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
          for (Member i = i_begin; i < i_end; ++i) {
            iterate_type(m_mdr_policy, m_functor)(i);
          }
        });
#endif
  }

  inline ParallelFor(const FunctorType &arg_functor, MDRangePolicy arg_policy)
      : m_functor(arg_functor),
        m_mdr_policy(arg_policy),
        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
  template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy &, const Functor &) {
    /**
     * 1024 here is just our guess for a reasonable max tile size,
     * it isn't a hardware constraint. If people see a use for larger
     * tile size products, we're happy to change this.
     */
    return 1024;
  }
};
}  // namespace Impl
}  // namespace Kokkos

namespace Kokkos {
namespace Impl {
template <class FunctorType, class ReducerType, class...
Traits>
// parallel_reduce over a 1D RangePolicy on the HPX backend.  Each of the
// three KOKKOS_HPX_IMPLEMENTATION variants accumulates per-thread (or
// per-chunk) partial results and joins them into a single value, which is
// finalized and copied into m_result_ptr.
class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                     Kokkos::Experimental::HPX> {
 private:
  using Policy    = Kokkos::RangePolicy<Traits...>;
  using WorkTag   = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member    = typename Policy::member_type;
  using Analysis =
      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
  // When no explicit reducer is given (ReducerType == InvalidType), the
  // functor itself supplies init/join/final and the work tag is forwarded.
  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  using WorkTagFwd =
      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                                  WorkTag, void>::type;
  using ValueInit  = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
  using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>;
  using ValueJoin  = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
  using ValueOps   = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
  using value_type     = typename Analysis::value_type;
  using pointer_type   = typename Analysis::pointer_type;
  using reference_type = typename Analysis::reference_type;

  const FunctorType m_functor;
  const Policy m_policy;
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;

  // Forces synchronous execution when the result view does not own its
  // memory (no tracking record) — see the constructors below.
  bool m_force_synchronous;

  // Invoke the functor for a single index (untagged overload).
  template <class TagType>
  inline static
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      execute_functor(const FunctorType &functor, const Member i,
                      reference_type update) {
    functor(i, update);
  }

  // Invoke the functor for a single index, prepending the work tag.
  template <class TagType>
  inline static
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      execute_functor(const FunctorType &functor, const Member i,
                      reference_type update) {
    const TagType t{};
    functor(t, i, update);
  }

  // Accumulate over [i_begin, i_end) into update (untagged overload).
  template <class TagType>
  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
  execute_functor_range(reference_type update, const Member i_begin,
                        const Member i_end) const {
    for (Member i = i_begin; i < i_end; ++i) {
      m_functor(i, update);
    }
  }

  // Accumulate over [i_begin, i_end) into update, with the work tag.
  template <class TagType>
  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
  execute_functor_range(reference_type update, const Member i_begin,
                        const Member i_end) const {
    const TagType t{};

    for (Member i = i_begin; i < i_end; ++i) {
      m_functor(t, i, update);
    }
  }

  // Owning byte buffer for one reduction value whose size is only known at
  // run time.  It gives HPX's reduction() helper a regular copyable/movable
  // value type to work with (used by implementation variant 0 only).
  class value_type_wrapper {
   private:
    std::size_t m_value_size;
    char *m_value_buffer;

   public:
    value_type_wrapper() : m_value_size(0), m_value_buffer(nullptr) {}

    value_type_wrapper(const std::size_t value_size)
        : m_value_size(value_size), m_value_buffer(new char[m_value_size]) {}

    value_type_wrapper(const value_type_wrapper &other)
        : m_value_size(0), m_value_buffer(nullptr) {
      if (this != &other) {
        m_value_buffer = new char[other.m_value_size];
        m_value_size   = other.m_value_size;

        std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
                  m_value_buffer);
      }
    }

    ~value_type_wrapper() { delete[] m_value_buffer; }

    value_type_wrapper(value_type_wrapper &&other)
        : m_value_size(0), m_value_buffer(nullptr) {
      if (this != &other) {
        m_value_buffer = other.m_value_buffer;
        m_value_size   = other.m_value_size;

        other.m_value_buffer = nullptr;
        other.m_value_size   = 0;
      }
    }

    value_type_wrapper &operator=(const value_type_wrapper &other) {
      if (this != &other) {
        delete[] m_value_buffer;
        m_value_buffer = new char[other.m_value_size];
        m_value_size   = other.m_value_size;

        std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
                  m_value_buffer);
      }

      return *this;
    }

    value_type_wrapper &operator=(value_type_wrapper &&other) {
      if (this != &other) {
        delete[] m_value_buffer;
        m_value_buffer = other.m_value_buffer;
        m_value_size   = other.m_value_size;

        other.m_value_buffer = nullptr;
        other.m_value_size   = 0;
      }

      return *this;
    }

    pointer_type pointer() const {
      return reinterpret_cast<pointer_type>(m_value_buffer);
    }

    reference_type reference() const {
      return ValueOps::reference(
          reinterpret_cast<pointer_type>(m_value_buffer));
    }
  };

 public:
  void execute() const {
    if (m_policy.end() <= m_policy.begin()) {
      // Empty range: the result is still the identity, initialized and
      // finalized, written straight to the destination.
      if (m_result_ptr) {
        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                        m_result_ptr);
        ValueFinal::final(ReducerConditional::select(m_functor, m_reducer),
                          m_result_ptr);
      }
      return;
    }
    dispatch_execute_task(this, m_policy.space(), m_force_synchronous);
  }

  inline void execute_task() const {
    // See [note 1] for an explanation.
#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
        m_policy.space());
#endif

    const std::size_t value_size =
        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));

#if KOKKOS_HPX_IMPLEMENTATION == 0
    // NOTE: This version makes the most use of HPX functionality, but
    // requires the struct value_type_wrapper to handle different
    // reference_types. It is also significantly slower than the version
    // below due to not reusing the buffer used by other functions.
    using hpx::parallel::for_loop;
    using hpx::parallel::reduction;
    using hpx::parallel::execution::par;
    using hpx::parallel::execution::static_chunk_size;

    value_type_wrapper final_value(value_size);
    value_type_wrapper identity(value_size);

    ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                    final_value.pointer());
    ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                    identity.pointer());

    for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
             m_policy.begin(), m_policy.end(),
             reduction(final_value, identity,
                       [this](value_type_wrapper &a,
                              value_type_wrapper &b) -> value_type_wrapper & {
                         ValueJoin::join(
                             ReducerConditional::select(m_functor, m_reducer),
                             a.pointer(), b.pointer());
                         return a;
                       }),
             [this](Member i, value_type_wrapper &update) {
               execute_functor<WorkTag>(m_functor, i, update.reference());
             });

    pointer_type final_value_ptr = final_value.pointer();

#elif KOKKOS_HPX_IMPLEMENTATION == 1
    // Variant 1: one partial-result slot per worker thread in the shared
    // thread_buffer; chunks accumulate into the slot of whichever thread
    // runs them, then slots are joined serially into slot 0.
    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();

    thread_buffer &buffer = m_policy.space().impl_get_buffer();
    buffer.resize(num_worker_threads, value_size);

    using hpx::apply;
    using hpx::lcos::local::latch;

    {
      // Initialize every thread's slot with the identity before any work.
      latch num_tasks_remaining(num_worker_threads);
      ChunkedRoundRobinExecutor exec(num_worker_threads);

      for (int t = 0; t < num_worker_threads; ++t) {
        apply(exec, [this, &num_tasks_remaining, &buffer, t]() {
          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                          reinterpret_cast<pointer_type>(buffer.get(t)));

          num_tasks_remaining.count_down(1);
        });
      }

      num_tasks_remaining.wait();
    }

    const int num_tasks =
        (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
        m_policy.chunk_size();
    latch num_tasks_remaining(num_tasks);
    ChunkedRoundRobinExecutor exec(num_tasks);

    for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
         i_begin += m_policy.chunk_size()) {
      apply(exec, [this, &num_tasks_remaining, &buffer, i_begin]() {
        reference_type update =
            ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
                Kokkos::Experimental::HPX::impl_hardware_thread_id())));
        const Member i_end =
            (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
        execute_functor_range<WorkTag>(update, i_begin, i_end);

        num_tasks_remaining.count_down(1);
      });
    }

    num_tasks_remaining.wait();

    // Serial join of all per-thread partials into slot 0.
    for (int i = 1; i < num_worker_threads; ++i) {
      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
                      reinterpret_cast<pointer_type>(buffer.get(0)),
                      reinterpret_cast<pointer_type>(buffer.get(i)));
    }

    pointer_type final_value_ptr =
        reinterpret_cast<pointer_type>(buffer.get(0));

#elif KOKKOS_HPX_IMPLEMENTATION == 2
    // Variant 2: same per-thread-slot scheme as variant 1, but implemented
    // with for_loop / for_loop_strided instead of apply + latch.
    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();

    thread_buffer &buffer = m_policy.space().impl_get_buffer();
    buffer.resize(num_worker_threads, value_size);

    using hpx::parallel::for_loop;
    using hpx::parallel::for_loop_strided;
    using hpx::parallel::execution::par;
    using hpx::parallel::execution::static_chunk_size;

    {
      ChunkedRoundRobinExecutor exec(num_worker_threads);

      for_loop(par.on(exec).with(static_chunk_size(1)), std::size_t(0),
               num_worker_threads, [this, &buffer](const std::size_t t) {
                 ValueInit::init(
                     ReducerConditional::select(m_functor, m_reducer),
                     reinterpret_cast<pointer_type>(buffer.get(t)));
               });
    }

    const int num_tasks =
        (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
        m_policy.chunk_size();
    ChunkedRoundRobinExecutor exec(num_tasks);

    for_loop_strided(
        par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
        m_policy.end(), m_policy.chunk_size(),
        [this, &buffer](const Member i_begin) {
          reference_type update =
              ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
                  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
          const Member i_end =
              (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
          execute_functor_range<WorkTag>(update, i_begin, i_end);
        });

    // Serial join of all per-thread partials into slot 0.
    for (int i = 1; i < num_worker_threads; ++i) {
      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
                      reinterpret_cast<pointer_type>(buffer.get(0)),
                      reinterpret_cast<pointer_type>(buffer.get(i)));
    }

    pointer_type final_value_ptr =
        reinterpret_cast<pointer_type>(buffer.get(0));
#endif

    // Finalize the joined result and copy it element-wise to the caller's
    // destination.
    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
        ReducerConditional::select(m_functor, m_reducer), final_value_ptr);

    if (m_result_ptr != nullptr) {
      const int n = Analysis::value_count(
          ReducerConditional::select(m_functor, m_reducer));

      for (int j = 0; j < n; ++j) {
        m_result_ptr[j] = final_value_ptr[j];
      }
    }
  }

  // Result delivered into a View; synchronous if the view is untracked.
  template <class ViewType>
  inline ParallelReduce(
      const FunctorType &arg_functor, Policy arg_policy,
      const ViewType &arg_view,
      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void *>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_view.data()),
        m_force_synchronous(!arg_view.impl_track().has_record()) {}

  // Result delivered through an explicit reducer object.
  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
                        const ReducerType &reducer)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()),
        m_force_synchronous(!reducer.view().impl_track().has_record()) {}
};

template <class FunctorType, class ReducerType, class...
Traits>
// parallel_reduce over an MDRangePolicy on the HPX backend.  The tile space
// is flattened into a 1D range; every variant keeps one partial-result slot
// per worker thread in the shared thread_buffer and joins them serially at
// the end.
class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                     Kokkos::Experimental::HPX> {
 private:
  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
  using Policy        = typename MDRangePolicy::impl_range_policy;
  using WorkTag       = typename MDRangePolicy::work_tag;
  using WorkRange     = typename Policy::WorkRange;
  using Member        = typename Policy::member_type;
  using Analysis      = FunctorAnalysis<FunctorPatternInterface::REDUCE,
                                   MDRangePolicy, FunctorType>;
  // When no explicit reducer is given (ReducerType == InvalidType), the
  // functor itself supplies init/join/final and the work tag is forwarded.
  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  using WorkTagFwd =
      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                                  WorkTag, void>::type;
  using ValueInit  = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
  using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>;
  using ValueJoin  = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
  using ValueOps   = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
  using pointer_type   = typename Analysis::pointer_type;
  using value_type     = typename Analysis::value_type;
  using reference_type = typename Analysis::reference_type;
  using iterate_type =
      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
                                             WorkTag, reference_type>;

  const FunctorType m_functor;
  const MDRangePolicy m_mdr_policy;
  const Policy m_policy;  // flattened 1D policy over the tiles
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;

  // Forces synchronous execution when the result view does not own its
  // memory (no tracking record) — see the constructors below.
  bool m_force_synchronous;

 public:
  void execute() const {
    dispatch_execute_task(this, m_mdr_policy.space(), m_force_synchronous);
  }

  inline void execute_task() const {
    // See [note 1] for an explanation.
#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
        m_mdr_policy.space());
#endif

    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
    const std::size_t value_size =
        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));

    // One reduction-value slot per worker thread.
    thread_buffer &buffer = m_mdr_policy.space().impl_get_buffer();
    buffer.resize(num_worker_threads, value_size);

#if KOKKOS_HPX_IMPLEMENTATION == 0
    // Variant 0: two plain for_loops — identity-init of the slots, then the
    // chunked accumulation over tile indices.
    using hpx::parallel::for_loop;
    using hpx::parallel::execution::par;
    using hpx::parallel::execution::static_chunk_size;

    for_loop(par, 0, num_worker_threads, [this, &buffer](std::size_t t) {
      ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                      reinterpret_cast<pointer_type>(buffer.get(t)));
    });

    for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
             m_policy.begin(), m_policy.end(), [this, &buffer](const Member i) {
               reference_type update = ValueOps::reference(
                   reinterpret_cast<pointer_type>(buffer.get(
                       Kokkos::Experimental::HPX::impl_hardware_thread_id())));
               iterate_type(m_mdr_policy, m_functor, update)(i);
             });

#elif KOKKOS_HPX_IMPLEMENTATION == 1
    // Variant 1: apply + latch; init the slots first, then one task per
    // chunk of tiles.
    using hpx::apply;
    using hpx::lcos::local::latch;

    {
      latch num_tasks_remaining(num_worker_threads);
      ChunkedRoundRobinExecutor exec(num_worker_threads);

      for (int t = 0; t < num_worker_threads; ++t) {
        apply(exec, [this, &buffer, &num_tasks_remaining, t]() {
          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                          reinterpret_cast<pointer_type>(buffer.get(t)));

          num_tasks_remaining.count_down(1);
        });
      }

      num_tasks_remaining.wait();
    }

    const int num_tasks =
        (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
        m_policy.chunk_size();
    latch num_tasks_remaining(num_tasks);
    ChunkedRoundRobinExecutor exec(num_tasks);

    for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
         i_begin += m_policy.chunk_size()) {
      apply(exec, [this, &num_tasks_remaining, &buffer, i_begin]() {
        reference_type update =
            ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
                Kokkos::Experimental::HPX::impl_hardware_thread_id())));
        const Member i_end =
            (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());

        for (Member i = i_begin; i < i_end; ++i) {
          iterate_type(m_mdr_policy, m_functor, update)(i);
        }

        num_tasks_remaining.count_down(1);
      });
    }

    num_tasks_remaining.wait();

#elif KOKKOS_HPX_IMPLEMENTATION == 2
    // Variant 2: for_loop for slot init, for_loop_strided for the chunked
    // accumulation.
    using hpx::parallel::for_loop;
    using hpx::parallel::for_loop_strided;
    using hpx::parallel::execution::par;
    using hpx::parallel::execution::static_chunk_size;

    {
      ChunkedRoundRobinExecutor exec(num_worker_threads);

      for_loop(par.on(exec).with(static_chunk_size(1)), std::size_t(0),
               num_worker_threads, [this, &buffer](const std::size_t t) {
                 ValueInit::init(
                     ReducerConditional::select(m_functor, m_reducer),
                     reinterpret_cast<pointer_type>(buffer.get(t)));
               });
    }

    const int num_tasks =
        (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
        m_policy.chunk_size();
    ChunkedRoundRobinExecutor exec(num_tasks);

    for_loop_strided(
        par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
        m_policy.end(), m_policy.chunk_size(),
        [this, &buffer](const Member i_begin) {
          reference_type update =
              ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
                  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
          const Member i_end =
              (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());

          for (Member i = i_begin; i < i_end; ++i) {
            iterate_type(m_mdr_policy, m_functor, update)(i);
          }
        });
#endif

    // Serial join of all per-thread partials into slot 0, then finalize and
    // copy to the caller's destination.
    for (int i = 1; i < num_worker_threads; ++i) {
      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
                      reinterpret_cast<pointer_type>(buffer.get(0)),
                      reinterpret_cast<pointer_type>(buffer.get(i)));
    }

    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
        ReducerConditional::select(m_functor, m_reducer),
        reinterpret_cast<pointer_type>(buffer.get(0)));

    if (m_result_ptr != nullptr) {
      const int n = Analysis::value_count(
          ReducerConditional::select(m_functor, m_reducer));

      for (int j = 0; j < n; ++j) {
        m_result_ptr[j] = reinterpret_cast<pointer_type>(buffer.get(0))[j];
      }
    }
  }

  // Result delivered into a View; synchronous if the view is untracked.
  template <class ViewType>
  inline ParallelReduce(
      const FunctorType &arg_functor, MDRangePolicy arg_policy,
      const ViewType &arg_view,
      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void *>::type = nullptr)
      : m_functor(arg_functor),
        m_mdr_policy(arg_policy),
        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
        m_reducer(InvalidType()),
        m_result_ptr(arg_view.data()),
        m_force_synchronous(!arg_view.impl_track().has_record()) {}

  // Result delivered through an explicit reducer object.
  inline ParallelReduce(const FunctorType &arg_functor,
                        MDRangePolicy arg_policy, const ReducerType &reducer)
      : m_functor(arg_functor),
        m_mdr_policy(arg_policy),
        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()),
        m_force_synchronous(!reducer.view().impl_track().has_record()) {}
  template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy &, const Functor &) {
    /**
     * 1024 here is just our guess for a reasonable max tile size,
     * it isn't a hardware constraint. If people see a use for larger
     * tile size products, we're happy to change this.
     */
    return 1024;
  }
};
}  // namespace Impl
}  // namespace Kokkos

namespace Kokkos {
namespace Impl {

template <class FunctorType, class...
Traits>
// parallel_scan over a 1D RangePolicy on the HPX backend.  Classic two-pass
// shared-memory scan: each thread reduces its contiguous sub-range into a
// partial sum, thread 0 serially combines those partials into per-thread
// starting offsets, and a second pass re-runs the functor with final == true.
class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                   Kokkos::Experimental::HPX> {
 private:
  using Policy    = Kokkos::RangePolicy<Traits...>;
  using WorkTag   = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member    = typename Policy::member_type;
  using Analysis =
      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
  using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
  using ValueOps  = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
  using pointer_type   = typename Analysis::pointer_type;
  using reference_type = typename Analysis::reference_type;
  using value_type     = typename Analysis::value_type;

  const FunctorType m_functor;
  const Policy m_policy;

  // Run the functor over [i_begin, i_end); 'final' selects the scan pass
  // (false: accumulate only, true: write results).  Untagged overload.
  template <class TagType>
  inline static
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      execute_functor_range(const FunctorType &functor, const Member i_begin,
                            const Member i_end, reference_type update,
                            const bool final) {
    for (Member i = i_begin; i < i_end; ++i) {
      functor(i, update, final);
    }
  }

  // Same as above, prepending the work tag.
  template <class TagType>
  inline static
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      execute_functor_range(const FunctorType &functor, const Member i_begin,
                            const Member i_end, reference_type update,
                            const bool final) {
    const TagType t{};
    for (Member i = i_begin; i < i_end; ++i) {
      functor(t, i, update, final);
    }
  }

 public:
  void execute() const { dispatch_execute_task(this, m_policy.space()); }

  inline void execute_task() const {
    // See [note 1] for an explanation.
#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
        m_policy.space());
#endif

    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
    const int value_count        = Analysis::value_count(m_functor);
    const std::size_t value_size = Analysis::value_size(m_functor);

    // Two slots per thread: [0, value_size) holds the thread's partial sum,
    // [value_size, 2*value_size) its starting offset for the final pass.
    thread_buffer &buffer = m_policy.space().impl_get_buffer();
    buffer.resize(num_worker_threads, 2 * value_size);

    using hpx::apply;
    using hpx::lcos::local::barrier;
    using hpx::lcos::local::latch;

    barrier bar(num_worker_threads);
    latch num_tasks_remaining(num_worker_threads);
    ChunkedRoundRobinExecutor exec(num_worker_threads);

    for (int t = 0; t < num_worker_threads; ++t) {
      apply(exec, [this, &bar, &buffer, &num_tasks_remaining,
                   num_worker_threads, value_count, value_size, t]() {
        // Pass 1: reduce this thread's sub-range into its partial-sum slot.
        reference_type update_sum = ValueInit::init(
            m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));

        const WorkRange range(m_policy, t, num_worker_threads);
        execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
                                       update_sum, false);

        bar.wait();

        if (t == 0) {
          // Serial combine: thread i's offset slot becomes the join of all
          // partial sums of threads 0..i-1 (identity for thread 0).
          ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
                                         buffer.get(0) + value_size));

          for (int i = 1; i < num_worker_threads; ++i) {
            pointer_type ptr_1_prev =
                reinterpret_cast<pointer_type>(buffer.get(i - 1));
            pointer_type ptr_2_prev =
                reinterpret_cast<pointer_type>(buffer.get(i - 1) + value_size);
            pointer_type ptr_2 =
                reinterpret_cast<pointer_type>(buffer.get(i) + value_size);

            for (int j = 0; j < value_count; ++j) {
              ptr_2[j] = ptr_2_prev[j];
            }

            ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
          }
        }

        bar.wait();

        // Pass 2: re-run the sub-range starting from the computed offset,
        // letting the functor emit final scan values.
        reference_type update_base = ValueOps::reference(
            reinterpret_cast<pointer_type>(buffer.get(t) + value_size));

        execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
                                       update_base, true);

        num_tasks_remaining.count_down(1);
      });
    }

    num_tasks_remaining.wait();
  }

  inline ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
      : m_functor(arg_functor), m_policy(arg_policy) {}
};

// Same algorithm as ParallelScan above, but additionally reports the total
// (the last thread's final accumulator) through a caller-provided reference.
template <class FunctorType, class ReturnType, class... Traits>
class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                            ReturnType, Kokkos::Experimental::HPX> {
 private:
  using Policy    = Kokkos::RangePolicy<Traits...>;
  using WorkTag   = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member    = typename Policy::member_type;
  using Analysis =
      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
  using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
  using ValueOps  = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
  using pointer_type   = typename Analysis::pointer_type;
  using reference_type = typename Analysis::reference_type;
  using value_type     = typename Analysis::value_type;

  const FunctorType m_functor;
  const Policy m_policy;
  ReturnType &m_returnvalue;  // receives the scan total after execution

  // Run the functor over [i_begin, i_end); 'final' selects the scan pass.
  // Untagged overload.
  template <class TagType>
  inline static
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      execute_functor_range(const FunctorType &functor, const Member i_begin,
                            const Member i_end, reference_type update,
                            const bool final) {
    for (Member i = i_begin; i < i_end; ++i) {
      functor(i, update, final);
    }
  }

  // Same as above, prepending the work tag.
  template <class TagType>
  inline static
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      execute_functor_range(const FunctorType &functor, const Member i_begin,
                            const Member i_end, reference_type update,
                            const bool final) {
    const TagType t{};
    for (Member i = i_begin; i < i_end; ++i) {
      functor(t, i, update, final);
    }
  }

 public:
  void execute() const { dispatch_execute_task(this, m_policy.space()); }

  inline void execute_task() const {
    // See [note 1] for an explanation.
#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
    Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
        m_policy.space());
#endif

    const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
    const int value_count        = Analysis::value_count(m_functor);
    const std::size_t value_size = Analysis::value_size(m_functor);

    // Two slots per thread: [0, value_size) holds the thread's partial sum,
    // [value_size, 2*value_size) its starting offset for the final pass.
    thread_buffer &buffer = m_policy.space().impl_get_buffer();
    buffer.resize(num_worker_threads, 2 * value_size);

    using hpx::apply;
    using hpx::lcos::local::barrier;
    using hpx::lcos::local::latch;

    barrier bar(num_worker_threads);
    latch num_tasks_remaining(num_worker_threads);
    ChunkedRoundRobinExecutor exec(num_worker_threads);

    for (int t = 0; t < num_worker_threads; ++t) {
      apply(exec, [this, &bar, &buffer, &num_tasks_remaining,
                   num_worker_threads, value_count, value_size, t]() {
        // Pass 1: reduce this thread's sub-range into its partial-sum slot.
        reference_type update_sum = ValueInit::init(
            m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));

        const WorkRange range(m_policy, t, num_worker_threads);
        execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
                                       update_sum, false);

        bar.wait();

        if (t == 0) {
          // Serial combine: thread i's offset slot becomes the join of all
          // partial sums of threads 0..i-1 (identity for thread 0).
          ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
                                         buffer.get(0) + value_size));

          for (int i = 1; i < num_worker_threads; ++i) {
            pointer_type ptr_1_prev =
                reinterpret_cast<pointer_type>(buffer.get(i - 1));
            pointer_type ptr_2_prev =
                reinterpret_cast<pointer_type>(buffer.get(i - 1) + value_size);
            pointer_type ptr_2 =
                reinterpret_cast<pointer_type>(buffer.get(i) + value_size);

            for (int j = 0; j < value_count; ++j) {
              ptr_2[j] = ptr_2_prev[j];
            }

            ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
          }
        }

        bar.wait();

        // Pass 2: re-run the sub-range starting from the computed offset,
        // letting the functor emit final scan values.
        reference_type update_base = ValueOps::reference(
            reinterpret_cast<pointer_type>(buffer.get(t) + value_size));

        execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
                                       update_base, true);

        // The last thread's accumulator after the final pass is the total.
        if (t == num_worker_threads - 1) {
          m_returnvalue = update_base;
        }

        num_tasks_remaining.count_down(1);
      });
    }

    num_tasks_remaining.wait();
  }

  inline ParallelScanWithTotal(const FunctorType &arg_functor,
                               const Policy &arg_policy,
                               ReturnType &arg_returnvalue)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_returnvalue(arg_returnvalue) {}
};
}  // namespace Impl
}  // namespace Kokkos

namespace Kokkos {
namespace Impl {
// parallel_for over a TeamPolicy on the HPX backend.  Each league rank is run
// by a single-thread "team"; per-team scratch space comes from the shared
// thread_buffer (m_shared bytes per worker thread).
template <class FunctorType, class... Properties>
class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                  Kokkos::Experimental::HPX> {
 private:
  using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
  using WorkTag      = typename Policy::work_tag;
  using Member       = typename Policy::member_type;
  using memory_space = Kokkos::HostSpace;

  const FunctorType m_functor;
  const Policy m_policy;
  const int m_league;
  const std::size_t m_shared;

  // Invoke the functor for one league rank, constructing the team member
  // handle over the given scratch buffer.  Untagged overload.
  template <class TagType>
  inline static
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      execute_functor(const FunctorType &functor, const Policy &policy,
                      const int league_rank, char *local_buffer,
                      const std::size_t local_buffer_size) {
    functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
  }

  // Same as above, prepending the work tag.
  template <class TagType>
  inline static
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      execute_functor(const FunctorType &functor, const Policy &policy,
                      const int league_rank, char *local_buffer,
                      const std::size_t local_buffer_size) {
    const TagType t{};
    functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size));
  }

  // Invoke the functor for a half-open range of league ranks (untagged).
  template <class TagType>
  inline static
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      execute_functor_range(const FunctorType &functor, const Policy &policy,
                            const int league_rank_begin,
                            const int league_rank_end, char *local_buffer,
                            const std::size_t local_buffer_size) {
    for (int league_rank = league_rank_begin; league_rank < league_rank_end;
         ++league_rank) {
functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size)); + } + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + execute_functor_range(const FunctorType &functor, const Policy &policy, + const int league_rank_begin, + const int league_rank_end, char *local_buffer, + const std::size_t local_buffer_size) { + const TagType t{}; + for (int league_rank = league_rank_begin; league_rank < league_rank_end; + ++league_rank) { + functor(t, + Member(policy, 0, league_rank, local_buffer, local_buffer_size)); + } + } + + public: + void execute() const { dispatch_execute_task(this, m_policy.space()); } + + inline void execute_task() const { + // See [note 1] for an explanation. +#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( + m_policy.space()); +#endif + + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + + thread_buffer &buffer = m_policy.space().impl_get_buffer(); + buffer.resize(num_worker_threads, m_shared); + +#if KOKKOS_HPX_IMPLEMENTATION == 0 + using hpx::parallel::for_loop; + using hpx::parallel::execution::par; + using hpx::parallel::execution::static_chunk_size; + + for_loop( + par.with(static_chunk_size(m_policy.chunk_size())), 0, + m_policy.league_size(), [this, &buffer](const int league_rank) { + execute_functor<WorkTag>( + m_functor, m_policy, league_rank, + buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()), + m_shared); + }); + +#elif KOKKOS_HPX_IMPLEMENTATION == 1 + using hpx::apply; + using hpx::lcos::local::latch; + + const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) / + m_policy.chunk_size(); + latch num_tasks_remaining(num_tasks); + ChunkedRoundRobinExecutor exec(num_tasks); + + for (int league_rank_begin = 0; league_rank_begin < m_policy.league_size(); + league_rank_begin += m_policy.chunk_size()) { + apply(exec, [this, &buffer, 
&num_tasks_remaining, league_rank_begin]() { + const int league_rank_end = (std::min)( + league_rank_begin + m_policy.chunk_size(), m_policy.league_size()); + execute_functor_range<WorkTag>( + m_functor, m_policy, league_rank_begin, league_rank_end, + buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()), + m_shared); + + num_tasks_remaining.count_down(1); + }); + } + + num_tasks_remaining.wait(); + +#elif KOKKOS_HPX_IMPLEMENTATION == 2 + using hpx::parallel::for_loop_strided; + using hpx::parallel::execution::par; + using hpx::parallel::execution::static_chunk_size; + + const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) / + m_policy.chunk_size(); + ChunkedRoundRobinExecutor exec(num_tasks); + + for_loop_strided( + par.on(exec).with(static_chunk_size(1)), 0, m_policy.league_size(), + m_policy.chunk_size(), [this, &buffer](const int league_rank_begin) { + const int league_rank_end = + (std::min)(league_rank_begin + m_policy.chunk_size(), + m_policy.league_size()); + execute_functor_range<WorkTag>( + m_functor, m_policy, league_rank_begin, league_rank_end, + buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()), + m_shared); + }); +#endif + } + + ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_league(arg_policy.league_size()), + m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + arg_functor, arg_policy.team_size())) {} +}; + +template <class FunctorType, class ReducerType, class... 
Properties> +class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>, + ReducerType, Kokkos::Experimental::HPX> { + private: + using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>; + using Analysis = + FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>; + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>::type; + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>; + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + using value_type = typename Analysis::value_type; + + const FunctorType m_functor; + const int m_league; + const Policy m_policy; + const ReducerType m_reducer; + pointer_type m_result_ptr; + const std::size_t m_shared; + + bool m_force_synchronous; + + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + execute_functor(const FunctorType &functor, const Policy &policy, + const int league_rank, char *local_buffer, + const std::size_t local_buffer_size, + reference_type update) { + functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size), + update); + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + execute_functor(const FunctorType &functor, const Policy &policy, + const int league_rank, char *local_buffer, 
+ const std::size_t local_buffer_size, + reference_type update) { + const TagType t{}; + functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size), + update); + } + + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + execute_functor_range(const FunctorType &functor, const Policy &policy, + const int league_rank_begin, + const int league_rank_end, char *local_buffer, + const std::size_t local_buffer_size, + reference_type update) { + for (int league_rank = league_rank_begin; league_rank < league_rank_end; + ++league_rank) { + functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size), + update); + } + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + execute_functor_range(const FunctorType &functor, const Policy &policy, + const int league_rank_begin, + const int league_rank_end, char *local_buffer, + const std::size_t local_buffer_size, + reference_type update) { + const TagType t{}; + for (int league_rank = league_rank_begin; league_rank < league_rank_end; + ++league_rank) { + functor(t, + Member(policy, 0, league_rank, local_buffer, local_buffer_size), + update); + } + } + + public: + void execute() const { + if (m_policy.league_size() * m_policy.team_size() == 0) { + if (m_result_ptr) { + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + ValueFinal::final(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + } + return; + } + dispatch_execute_task(this, m_policy.space()); + } + + inline void execute_task() const { + // See [note 1] for an explanation. 
+#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH) + Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit( + m_policy.space()); +#endif + + const int num_worker_threads = Kokkos::Experimental::HPX::concurrency(); + const std::size_t value_size = + Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + + thread_buffer &buffer = m_policy.space().impl_get_buffer(); + buffer.resize(num_worker_threads, value_size + m_shared); + +#if KOKKOS_HPX_IMPLEMENTATION == 0 + using hpx::parallel::for_loop; + using hpx::parallel::execution::par; + + for_loop(par, 0, num_worker_threads, [this, &buffer](const std::size_t t) { + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + reinterpret_cast<pointer_type>(buffer.get(t))); + }); + + using hpx::parallel::execution::static_chunk_size; + + hpx::parallel::for_loop( + par.with(static_chunk_size(m_policy.chunk_size())), 0, + m_policy.league_size(), + [this, &buffer, value_size](const int league_rank) { + std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id(); + reference_type update = ValueOps::reference( + reinterpret_cast<pointer_type>(buffer.get(t))); + + execute_functor<WorkTag>(m_functor, m_policy, league_rank, + buffer.get(t) + value_size, m_shared, + update); + }); + +#elif KOKKOS_HPX_IMPLEMENTATION == 1 + using hpx::apply; + using hpx::lcos::local::latch; + + { + latch num_tasks_remaining(num_worker_threads); + ChunkedRoundRobinExecutor exec(num_worker_threads); + + for (int t = 0; t < num_worker_threads; ++t) { + apply(exec, [this, &buffer, &num_tasks_remaining, t]() { + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + reinterpret_cast<pointer_type>(buffer.get(t))); + + num_tasks_remaining.count_down(1); + }); + } + + num_tasks_remaining.wait(); + } + + const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) / + m_policy.chunk_size(); + latch num_tasks_remaining(num_tasks); + ChunkedRoundRobinExecutor exec(num_tasks); + + for (int 
league_rank_begin = 0; league_rank_begin < m_policy.league_size(); + league_rank_begin += m_policy.chunk_size()) { + apply(exec, [this, &buffer, &num_tasks_remaining, league_rank_begin, + value_size]() { + std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id(); + reference_type update = + ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(t))); + const int league_rank_end = (std::min)( + league_rank_begin + m_policy.chunk_size(), m_policy.league_size()); + execute_functor_range<WorkTag>( + m_functor, m_policy, league_rank_begin, league_rank_end, + buffer.get(t) + value_size, m_shared, update); + + num_tasks_remaining.count_down(1); + }); + } + + num_tasks_remaining.wait(); + +#elif KOKKOS_HPX_IMPLEMENTATION == 2 + using hpx::parallel::for_loop; + using hpx::parallel::for_loop_strided; + using hpx::parallel::execution::par; + using hpx::parallel::execution::static_chunk_size; + + { + ChunkedRoundRobinExecutor exec(num_worker_threads); + + for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads, + [this, &buffer](std::size_t const t) { + ValueInit::init( + ReducerConditional::select(m_functor, m_reducer), + reinterpret_cast<pointer_type>(buffer.get(t))); + }); + } + + const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) / + m_policy.chunk_size(); + ChunkedRoundRobinExecutor exec(num_tasks); + + for_loop_strided( + par.on(exec).with(static_chunk_size(1)), 0, m_policy.league_size(), + m_policy.chunk_size(), + [this, &buffer, value_size](int const league_rank_begin) { + std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id(); + reference_type update = ValueOps::reference( + reinterpret_cast<pointer_type>(buffer.get(t))); + const int league_rank_end = + (std::min)(league_rank_begin + m_policy.chunk_size(), + m_policy.league_size()); + execute_functor_range<WorkTag>( + m_functor, m_policy, league_rank_begin, league_rank_end, + buffer.get(t) + value_size, m_shared, update); + }); +#endif + + const 
pointer_type ptr = reinterpret_cast<pointer_type>(buffer.get(0)); + for (int t = 1; t < num_worker_threads; ++t) { + ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr, + reinterpret_cast<pointer_type>(buffer.get(t))); + } + + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), ptr); + + if (m_result_ptr) { + const int n = Analysis::value_count( + ReducerConditional::select(m_functor, m_reducer)); + + for (int j = 0; j < n; ++j) { + m_result_ptr[j] = ptr[j]; + } + } + } + + template <class ViewType> + ParallelReduce( + const FunctorType &arg_functor, const Policy &arg_policy, + const ViewType &arg_result, + typename std::enable_if<Kokkos::is_view<ViewType>::value && + !Kokkos::is_reducer_type<ReducerType>::value, + void *>::type = nullptr) + : m_functor(arg_functor), + m_league(arg_policy.league_size()), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result.data()), + m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + m_functor, arg_policy.team_size())), + m_force_synchronous(!arg_result.impl_track().has_record()) {} + + inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy, + const ReducerType &reducer) + : m_functor(arg_functor), + m_league(arg_policy.league_size()), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + arg_functor, arg_policy.team_size())), + m_force_synchronous(!reducer.view().impl_track().has_record()) {} +}; +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> + TeamThreadRange(const Impl::HPXTeamMember &thread, const iType &count) { + return 
Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember> +TeamThreadRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin, + const iType2 &i_end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>( + thread, iType(i_begin), iType(i_end)); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> + TeamVectorRange(const Impl::HPXTeamMember &thread, const iType &count) { + return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember> +TeamVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin, + const iType2 &i_end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>( + thread, iType(i_begin), iType(i_end)); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember> + ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType &count) { + return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember> +ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin, + const iType2 &i_end) { + using iType = typename std::common_type<iType1, iType2>::type; + return 
Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>( + thread, iType(i_begin), iType(i_end)); +} + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct<Impl::HPXTeamMember> PerTeam( + const Impl::HPXTeamMember &thread) { + return Impl::ThreadSingleStruct<Impl::HPXTeamMember>(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct<Impl::HPXTeamMember> PerThread( + const Impl::HPXTeamMember &thread) { + return Impl::VectorSingleStruct<Impl::HPXTeamMember>(thread); +} + +/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each + * i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. + */ +template <typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> + &loop_boundaries, + const Lambda &lambda) { + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) + lambda(i); +} + +/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team + * and a summation of val is performed and put into result. + */ +template <typename iType, class Lambda, typename ValueType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> + &loop_boundaries, + const Lambda &lambda, ValueType &result) { + result = ValueType(); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, result); + } +} + +/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each + * i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread. 
+ */ +template <typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember> + &loop_boundaries, + const Lambda &lambda) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i); + } +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread + * and a summation of val is performed and put into result. + */ +template <typename iType, class Lambda, typename ValueType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember> + &loop_boundaries, + const Lambda &lambda, ValueType &result) { + result = ValueType(); +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, result); + } +} + +template <typename iType, class Lambda, typename ReducerType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> + &loop_boundaries, + const Lambda &lambda, const ReducerType &reducer) { + reducer.init(reducer.reference()); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, reducer.reference()); + } +} + +template <typename iType, class Lambda, typename ReducerType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember> + &loop_boundaries, + const Lambda &lambda, const ReducerType &reducer) { + reducer.init(reducer.reference()); +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += 
loop_boundaries.increment) { + lambda(i, reducer.reference()); + } +} + +template <typename iType, class FunctorType> +KOKKOS_INLINE_FUNCTION void parallel_scan( + Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> const + &loop_boundaries, + const FunctorType &lambda) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, + FunctorType>::value_type; + + value_type scan_val = value_type(); + + // Intra-member scan + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, scan_val, false); + } + + // 'scan_val' output is the exclusive prefix sum + scan_val = loop_boundaries.thread.team_scan(scan_val); + + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, scan_val, true); + } +} + +/** \brief Intra-thread vector parallel exclusive prefix sum. Executes + * lambda(iType i, ValueType & val, bool final) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan + * operation is performed. Depending on the target execution space the operator + * might be called twice: once with final=false and once with final=true. When + * final==true val contains the prefix sum value. The contribution of this "i" + * needs to be added to val no matter whether final==true or not. In a serial + * execution (i.e. team_size==1) the operator is only called once with + * final==true. 
Scan_val will be set to the final sum value over all vector + */ +template <typename iType, class FunctorType> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember> + &loop_boundaries, + const FunctorType &lambda) { + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>; + using value_type = typename ValueTraits::value_type; + + value_type scan_val = value_type(); + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, scan_val, true); + } +} + +/** \brief Intra-thread vector parallel scan with reducer + * + */ +template <typename iType, class FunctorType, typename ReducerType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type + parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::HPXTeamMember> &loop_boundaries, + const FunctorType &lambda, const ReducerType &reducer) { + typename ReducerType::value_type scan_val; + reducer.init(scan_val); + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, scan_val, true); + } +} + +template <class FunctorType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct<Impl::HPXTeamMember> &, + const FunctorType &lambda) { + lambda(); +} + +template <class FunctorType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &, + const FunctorType &lambda) { + lambda(); +} + +template <class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct<Impl::HPXTeamMember> &, + const FunctorType &lambda, ValueType &val) { + lambda(val); +} + +template <class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION void single( + const 
Impl::ThreadSingleStruct<Impl::HPXTeamMember> &, + const FunctorType &lambda, ValueType &val) { + lambda(val); +} + +} // namespace Kokkos + +#include <HPX/Kokkos_HPX_Task.hpp> + +#endif /* #if defined( KOKKOS_ENABLE_HPX ) */ +#endif /* #ifndef KOKKOS_HPX_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Half.hpp b/packages/kokkos/core/src/Kokkos_Half.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e4b351381afbd6f7bac2e14d340ca1888bead0be --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Half.hpp @@ -0,0 +1,119 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HALF_HPP_ +#define KOKKOS_HALF_HPP_ + +#include <type_traits> +#include <Kokkos_Macros.hpp> + +// Include special backend specific versions here +#include <Cuda/Kokkos_Cuda_Half.hpp> + +// Potentially include special compiler specific versions here +// e.g. for Intel + +// If none of the above actually did anything and defined a half precision type +// define a fallback implementation here using float +#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED +#define KOKKOS_IMPL_HALF_TYPE_DEFINED +#define KOKKOS_HALF_T_IS_FLOAT true +namespace Kokkos { +namespace Impl { +struct half_impl_t { + using type = float; +}; +} // namespace Impl +namespace Experimental { + +using half_t = Kokkos::Impl::half_impl_t::type; + +// cast_to_half +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(float val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(bool val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(double val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(short val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned short val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(int val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned int val) { return 
half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(long long val) { return half_t(val); } +KOKKOS_INLINE_FUNCTION +half_t cast_to_half(unsigned long long val) { return half_t(val); } + +// cast_from_half +// Using an explicit list here too, since the other ones are explicit and for +// example don't include char +template <class T> +KOKKOS_INLINE_FUNCTION std::enable_if_t< + std::is_same<T, float>::value || std::is_same<T, bool>::value || + std::is_same<T, double>::value || std::is_same<T, short>::value || + std::is_same<T, unsigned short>::value || std::is_same<T, int>::value || + std::is_same<T, unsigned int>::value || std::is_same<T, long>::value || + std::is_same<T, unsigned long>::value || + std::is_same<T, long long>::value || + std::is_same<T, unsigned long long>::value, + T> +cast_from_half(half_t val) { + return T(val); +} + +} // namespace Experimental +} // namespace Kokkos + +#else +#define KOKKOS_HALF_T_IS_FLOAT false +#endif // KOKKOS_IMPL_HALF_TYPE_DEFINED +#endif // KOKKOS_HALF_HPP_ diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ba69fbad393ee391eff2b59c34d4ae526fa7af29 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp @@ -0,0 +1,319 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HOSTSPACE_HPP +#define KOKKOS_HOSTSPACE_HPP + +#include <cstring> +#include <string> +#include <iosfwd> +#include <typeinfo> + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Concepts.hpp> +#include <Kokkos_MemoryTraits.hpp> + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_SharedAlloc.hpp> +#include <impl/Kokkos_Tools.hpp> + +#include "impl/Kokkos_HostSpace_deepcopy.hpp" + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +namespace Impl { + +/// \brief Initialize lock array for arbitrary size atomics. +/// +/// Arbitrary atomics are implemented using a hash table of locks +/// where the hash value is derived from the address of the +/// object for which an atomic operation is performed. +/// This function initializes the locks to zero (unset). +void init_lock_array_host_space(); + +/// \brief Acquire a lock for the address +/// +/// This function tries to acquire the lock for the hash value derived +/// from the provided ptr. If the lock is successfully acquired the +/// function returns true. Otherwise it returns false. +bool lock_address_host_space(void* ptr); + +/// \brief Release lock for the address +/// +/// This function releases the lock for the hash value derived +/// from the provided ptr. This function should only be called +/// after previously successfully acquiring a lock with +/// lock_address. +void unlock_address_host_space(void* ptr); + +} // namespace Impl + +} // namespace Kokkos + +namespace Kokkos { +/// \class HostSpace +/// \brief Memory management for host memory. +/// +/// HostSpace is a memory space that governs host memory. "Host" +/// memory means the usual CPU-accessible memory. +class HostSpace { + public: + //! 
Tag this class as a kokkos memory space + using memory_space = HostSpace; + using size_type = size_t; + + /// \typedef execution_space + /// \brief Default execution space for this memory space. + /// + /// Every memory space has a default execution space. This is + /// useful for things like initializing a View (which happens in + /// parallel using the View's default execution space). +#if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) + using execution_space = Kokkos::OpenMP; +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS) + using execution_space = Kokkos::Threads; +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HPX) + using execution_space = Kokkos::Experimental::HPX; +#elif defined(KOKKOS_ENABLE_OPENMP) + using execution_space = Kokkos::OpenMP; +#elif defined(KOKKOS_ENABLE_THREADS) + using execution_space = Kokkos::Threads; +#elif defined(KOKKOS_ENABLE_HPX) + using execution_space = Kokkos::Experimental::HPX; +#elif defined(KOKKOS_ENABLE_SERIAL) + using execution_space = Kokkos::Serial; +#else +#error \ + "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, or Kokkos::Serial. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices." +#endif + + //! 
This memory space preferred device_type + using device_type = Kokkos::Device<execution_space, memory_space>; + + /**\brief Default memory space instance */ + HostSpace(); + HostSpace(HostSpace&& rhs) = default; + HostSpace(const HostSpace& rhs) = default; + HostSpace& operator=(HostSpace&&) = default; + HostSpace& operator=(const HostSpace&) = default; + ~HostSpace() = default; + + /**\brief Non-default memory space instance to choose allocation mechansim, + * if available */ + + enum AllocationMechanism { + STD_MALLOC, + POSIX_MEMALIGN, + POSIX_MMAP, + INTEL_MM_ALLOC + }; + + explicit HostSpace(const AllocationMechanism&); + + /**\brief Allocate untracked memory in the space */ + void* allocate(const size_t arg_alloc_size) const; + void* allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + /**\brief Deallocate untracked memory in the space */ + void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; + void deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; + + private: + template <class, class, class, class> + friend class Kokkos::Experimental::LogicalMemorySpace; + + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle = + Kokkos::Tools::make_space_handle(name())) const; + + public: + /**\brief Return Name of the MemorySpace */ + static constexpr const char* name() { return m_name; } + + private: + AllocationMechanism m_alloc_mech; + static constexpr const char* m_name = "Host"; + friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>; +}; + +} // namespace Kokkos + 
+//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::HostSpace>::assignable, + ""); + +template <typename S> +struct HostMirror { + private: + // If input execution space can access HostSpace then keep it. + // Example: Kokkos::OpenMP can access, Kokkos::Cuda cannot + enum { + keep_exe = Kokkos::Impl::MemorySpaceAccess< + typename S::execution_space::memory_space, + Kokkos::HostSpace>::accessible + }; + + // If HostSpace can access memory space then keep it. + // Example: Cannot access Kokkos::CudaSpace, can access Kokkos::CudaUVMSpace + enum { + keep_mem = + Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + typename S::memory_space>::accessible + }; + + public: + using Space = typename std::conditional< + keep_exe && keep_mem, S, + typename std::conditional< + keep_mem, + Kokkos::Device<Kokkos::HostSpace::execution_space, + typename S::memory_space>, + Kokkos::HostSpace>::type>::type; +}; + +} // namespace Impl + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +template <> +class SharedAllocationRecord<Kokkos::HostSpace, void> + : public SharedAllocationRecordCommon<Kokkos::HostSpace> { + private: + friend Kokkos::HostSpace; + friend class SharedAllocationRecordCommon<Kokkos::HostSpace>; + + using base_t = SharedAllocationRecordCommon<Kokkos::HostSpace>; + using RecordBase = SharedAllocationRecord<void, void>; + + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + +#ifdef KOKKOS_ENABLE_DEBUG + /**\brief Root record for tracked allocations from this HostSpace instance */ + static RecordBase s_root_record; +#endif + + const Kokkos::HostSpace m_space; + + protected: + ~SharedAllocationRecord() +#if defined( \ + 
KOKKOS_IMPL_INTEL_WORKAROUND_NOEXCEPT_SPECIFICATION_VIRTUAL_FUNCTION) + noexcept +#endif + ; + SharedAllocationRecord() = default; + + SharedAllocationRecord( + const Kokkos::HostSpace& arg_space, const std::string& arg_label, + const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &deallocate); + + public: + KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( + const Kokkos::HostSpace& arg_space, const std::string& arg_label, + const size_t arg_alloc_size) { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); +#else + (void)arg_space; + (void)arg_label; + (void)arg_alloc_size; + return (SharedAllocationRecord*)0; +#endif + } +}; + +} // namespace Impl + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +template <class ExecutionSpace> +struct DeepCopy<HostSpace, HostSpace, ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + hostspace_parallel_deepcopy(dst, src, n); + } + + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence(); + hostspace_parallel_deepcopy(dst, src, n); + exec.fence(); + } +}; + +} // namespace Impl + +} // namespace Kokkos + +#endif // #define KOKKOS_HOSTSPACE_HPP diff --git a/packages/kokkos/core/src/Kokkos_Layout.hpp b/packages/kokkos/core/src/Kokkos_Layout.hpp new file mode 100644 index 0000000000000000000000000000000000000000..778b4f08109a5b2d617c2ff89298c9e92dbccb61 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Layout.hpp @@ -0,0 +1,344 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Layout.hpp +/// \brief Declaration of various \c MemoryLayout options. 
+ +#ifndef KOKKOS_LAYOUT_HPP +#define KOKKOS_LAYOUT_HPP + +#include <cstddef> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Tags.hpp> + +namespace Kokkos { + +enum { ARRAY_LAYOUT_MAX_RANK = 8 }; + +//---------------------------------------------------------------------------- +/// \struct LayoutLeft +/// \brief Memory layout tag indicating left-to-right (Fortran scheme) +/// striding of multi-indices. +/// +/// This is an example of a \c MemoryLayout template parameter of +/// View. The memory layout describes how View maps from a +/// multi-index (i0, i1, ..., ik) to a memory location. +/// +/// "Layout left" indicates a mapping where the leftmost index i0 +/// refers to contiguous access, and strides increase for dimensions +/// going right from there (i1, i2, ...). This layout imitates how +/// Fortran stores multi-dimensional arrays. For the special case of +/// a two-dimensional array, "layout left" is also called "column +/// major." +struct LayoutLeft { + //! Tag this class as a kokkos array layout + using array_layout = LayoutLeft; + + size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + + enum : bool { is_extent_constructible = true }; + + LayoutLeft(LayoutLeft const&) = default; + LayoutLeft(LayoutLeft&&) = default; + LayoutLeft& operator=(LayoutLeft const&) = default; + LayoutLeft& operator=(LayoutLeft&&) = default; + + KOKKOS_INLINE_FUNCTION + explicit constexpr LayoutLeft(size_t N0 = 0, size_t N1 = 0, size_t N2 = 0, + size_t N3 = 0, size_t N4 = 0, size_t N5 = 0, + size_t N6 = 0, size_t N7 = 0) + : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} +}; + +//---------------------------------------------------------------------------- +/// \struct LayoutRight +/// \brief Memory layout tag indicating right-to-left (C or +/// lexigraphical scheme) striding of multi-indices. +/// +/// This is an example of a \c MemoryLayout template parameter of +/// View. The memory layout describes how View maps from a +/// multi-index (i0, i1, ..., ik) to a memory location. 
+/// +/// "Right layout" indicates a mapping where the rightmost index ik +/// refers to contiguous access, and strides increase for dimensions +/// going left from there. This layout imitates how C stores +/// multi-dimensional arrays. For the special case of a +/// two-dimensional array, "layout right" is also called "row major." +struct LayoutRight { + //! Tag this class as a kokkos array layout + using array_layout = LayoutRight; + + size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + + enum : bool { is_extent_constructible = true }; + + LayoutRight(LayoutRight const&) = default; + LayoutRight(LayoutRight&&) = default; + LayoutRight& operator=(LayoutRight const&) = default; + LayoutRight& operator=(LayoutRight&&) = default; + + KOKKOS_INLINE_FUNCTION + explicit constexpr LayoutRight(size_t N0 = 0, size_t N1 = 0, size_t N2 = 0, + size_t N3 = 0, size_t N4 = 0, size_t N5 = 0, + size_t N6 = 0, size_t N7 = 0) + : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} +}; + +//---------------------------------------------------------------------------- +/// \struct LayoutStride +/// \brief Memory layout tag indicated arbitrarily strided +/// multi-index mapping into contiguous memory. +struct LayoutStride { + //! Tag this class as a kokkos array layout + using array_layout = LayoutStride; + + size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + size_t stride[ARRAY_LAYOUT_MAX_RANK]; + + enum : bool { is_extent_constructible = false }; + + LayoutStride(LayoutStride const&) = default; + LayoutStride(LayoutStride&&) = default; + LayoutStride& operator=(LayoutStride const&) = default; + LayoutStride& operator=(LayoutStride&&) = default; + + /** \brief Compute strides from ordered dimensions. + * + * Values of order uniquely form the set [0..rank) + * and specify ordering of the dimensions. 
+ * Order = {0,1,2,...} is LayoutLeft + * Order = {...,2,1,0} is LayoutRight + */ + template <typename iTypeOrder, typename iTypeDimen> + KOKKOS_INLINE_FUNCTION static LayoutStride order_dimensions( + int const rank, iTypeOrder const* const order, + iTypeDimen const* const dimen) { + LayoutStride tmp; + // Verify valid rank order: + int check_input = ARRAY_LAYOUT_MAX_RANK < rank ? 0 : int(1 << rank) - 1; + for (int r = 0; r < ARRAY_LAYOUT_MAX_RANK; ++r) { + tmp.dimension[r] = 0; + tmp.stride[r] = 0; + } + for (int r = 0; r < rank; ++r) { + check_input &= ~int(1 << order[r]); + } + if (0 == check_input) { + size_t n = 1; + for (int r = 0; r < rank; ++r) { + tmp.stride[order[r]] = n; + n *= (dimen[order[r]]); + tmp.dimension[r] = dimen[r]; + } + } + return tmp; + } + + KOKKOS_INLINE_FUNCTION + explicit constexpr LayoutStride(size_t N0 = 0, size_t S0 = 0, size_t N1 = 0, + size_t S1 = 0, size_t N2 = 0, size_t S2 = 0, + size_t N3 = 0, size_t S3 = 0, size_t N4 = 0, + size_t S4 = 0, size_t N5 = 0, size_t S5 = 0, + size_t N6 = 0, size_t S6 = 0, size_t N7 = 0, + size_t S7 = 0) + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3, + S4, S5, S6, S7} {} +}; + +// =================================================================================== + +////////////////////////////////////////////////////////////////////////////////////// + +enum class Iterate { + Default, + Left, // Left indices stride fastest + Right // Right indices stride fastest +}; + +// To check for LayoutTiled +// This is to hide extra compile-time 'identifier' info within the LayoutTiled +// class by not relying on template specialization to include the ArgN*'s +template <typename LayoutTiledCheck, class Enable = void> +struct is_layouttiled : std::false_type {}; + +template <typename LayoutTiledCheck> +struct is_layouttiled< + LayoutTiledCheck, + typename std::enable_if<LayoutTiledCheck::is_array_layout_tiled>::type> + : std::true_type {}; + +namespace Experimental { + +/// LayoutTiled +// 
Must have Rank >= 2 +template < + Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0, + unsigned ArgN1, unsigned ArgN2 = 0, unsigned ArgN3 = 0, unsigned ArgN4 = 0, + unsigned ArgN5 = 0, unsigned ArgN6 = 0, unsigned ArgN7 = 0, + bool IsPowerOfTwo = + (Kokkos::Impl::is_integral_power_of_two(ArgN0) && + Kokkos::Impl::is_integral_power_of_two(ArgN1) && + (Kokkos::Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0)) && + (Kokkos::Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0)) && + (Kokkos::Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0)) && + (Kokkos::Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0)) && + (Kokkos::Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0)) && + (Kokkos::Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0)))> +struct LayoutTiled { + static_assert(IsPowerOfTwo, + "LayoutTiled must be given power-of-two tile dimensions"); + +#if 0 + static_assert( (Impl::is_integral_power_of_two(ArgN0) ) && + (Impl::is_integral_power_of_two(ArgN1) ) && + (Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) && + (Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) && + (Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) && + (Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) && + (Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) && + (Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) ) + , "LayoutTiled must be given power-of-two tile dimensions" ); +#endif + + using array_layout = LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, + ArgN4, ArgN5, ArgN6, ArgN7, IsPowerOfTwo>; + static constexpr Iterate outer_pattern = OuterP; + static constexpr Iterate inner_pattern = InnerP; + + enum { N0 = ArgN0 }; + enum { N1 = ArgN1 }; + enum { N2 = ArgN2 }; + enum { N3 = ArgN3 }; + enum { N4 = ArgN4 }; + enum { N5 = ArgN5 }; + enum { N6 = ArgN6 }; + enum { N7 = ArgN7 }; + + size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + + enum : bool { is_extent_constructible = true }; + + LayoutTiled(LayoutTiled const&) = 
default; + LayoutTiled(LayoutTiled&&) = default; + LayoutTiled& operator=(LayoutTiled const&) = default; + LayoutTiled& operator=(LayoutTiled&&) = default; + + KOKKOS_INLINE_FUNCTION + explicit constexpr LayoutTiled(size_t argN0 = 0, size_t argN1 = 0, + size_t argN2 = 0, size_t argN3 = 0, + size_t argN4 = 0, size_t argN5 = 0, + size_t argN6 = 0, size_t argN7 = 0) + : dimension{argN0, argN1, argN2, argN3, argN4, argN5, argN6, argN7} {} +}; + +} // namespace Experimental + +// For use with view_copy +template <typename... Layout> +struct layout_iterate_type_selector { + static const Kokkos::Iterate outer_iteration_pattern = + Kokkos::Iterate::Default; + static const Kokkos::Iterate inner_iteration_pattern = + Kokkos::Iterate::Default; +}; + +template <> +struct layout_iterate_type_selector<Kokkos::LayoutRight> { + static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; + static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; +}; + +template <> +struct layout_iterate_type_selector<Kokkos::LayoutLeft> { + static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; + static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; +}; + +template <> +struct layout_iterate_type_selector<Kokkos::LayoutStride> { + static const Kokkos::Iterate outer_iteration_pattern = + Kokkos::Iterate::Default; + static const Kokkos::Iterate inner_iteration_pattern = + Kokkos::Iterate::Default; +}; + +template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, + unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7> +struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled< + Kokkos::Iterate::Left, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, + ArgN4, ArgN5, ArgN6, ArgN7, true> > { + static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; + static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; +}; + +template <unsigned ArgN0, 
unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, + unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7> +struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled< + Kokkos::Iterate::Right, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, + ArgN4, ArgN5, ArgN6, ArgN7, true> > { + static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; + static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left; +}; + +template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, + unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7> +struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled< + Kokkos::Iterate::Left, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3, + ArgN4, ArgN5, ArgN6, ArgN7, true> > { + static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left; + static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; +}; + +template <unsigned ArgN0, unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, + unsigned ArgN4, unsigned ArgN5, unsigned ArgN6, unsigned ArgN7> +struct layout_iterate_type_selector<Kokkos::Experimental::LayoutTiled< + Kokkos::Iterate::Right, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3, + ArgN4, ArgN5, ArgN6, ArgN7, true> > { + static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right; + static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right; +}; + +} // namespace Kokkos + +#endif // #ifndef KOKKOS_LAYOUT_HPP diff --git a/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp b/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp new file mode 100644 index 0000000000000000000000000000000000000000..caa41b79b096dd2e7f2697f164d2cc3819834fc2 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp @@ -0,0 +1,428 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_LOGICALSPACES_HPP +#define KOKKOS_LOGICALSPACES_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_ScratchSpace.hpp> +#include <impl/Kokkos_MemorySpace.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_SharedAlloc.hpp> +#include <impl/Kokkos_Profiling.hpp> +#include <cstring> +namespace Kokkos { +namespace Experimental { +struct DefaultMemorySpaceNamer { + static constexpr const char* get_name() { + return "DefaultLogicalMemorySpaceName"; + } +}; + +struct LogicalSpaceSharesAccess { + struct shared_access {}; + struct no_shared_access {}; +}; + +/// \class LogicalMemorySpace +/// \brief +/// +/// LogicalMemorySpace is a space that is identical to another space, +/// but differentiable by name and template argument +template <class BaseSpace, class DefaultBaseExecutionSpace = void, + class Namer = DefaultMemorySpaceNamer, + class SharesAccessWithBase = LogicalSpaceSharesAccess::shared_access> +class LogicalMemorySpace { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + // [DZP] For some reason I don't yet know, using LogicalMemorySpaces + // inside an OpenMPTarget build causes errors in the + // SharedAllocationRecords of other types. This is my way of erroring + // a build if we instantiate a LogicalMemSpace in an OMPTarget build + static_assert(!std::is_same<BaseSpace, BaseSpace>::value, + "Can't use LogicalMemorySpaces in an OpenMPTarget build, we're " + "debugging memory issues"); +#endif + public: + //! Tag this class as a kokkos memory space + using memory_space = LogicalMemorySpace<BaseSpace, DefaultBaseExecutionSpace, + Namer, SharesAccessWithBase>; + using size_type = typename BaseSpace::size_type; + + /// \typedef execution_space + /// \brief Default execution space for this memory space. + /// + /// Every memory space has a default execution space. 
This is + /// useful for things like initializing a View (which happens in + /// parallel using the View's default execution space). + + using execution_space = + typename std::conditional<std::is_void<DefaultBaseExecutionSpace>::value, + typename BaseSpace::execution_space, + DefaultBaseExecutionSpace>::type; + + using device_type = Kokkos::Device<execution_space, memory_space>; + + LogicalMemorySpace() = default; + + template <typename... Args> + LogicalMemorySpace(Args&&... args) : underlying_space((Args &&) args...) {} + + /**\brief Allocate untracked memory in the space */ + void* allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); + } + void* allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); + } + + /**\brief Deallocate untracked memory in the space */ + void deallocate(void* const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); + } + void deallocate(const char* arg_label, void* const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); + } + + /**\brief Return Name of the MemorySpace */ + constexpr static const char* name() { return Namer::get_name(); } + + private: + BaseSpace underlying_space; + template <class, class, class, class> + friend class LogicalMemorySpace; + friend class Kokkos::Impl::SharedAllocationRecord<memory_space, void>; + + void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + Kokkos::Tools::SpaceHandle arg_handle = + Kokkos::Tools::make_space_handle(name())) const { + return underlying_space.impl_allocate(arg_label, arg_alloc_size, + arg_logical_size, arg_handle); + } + void impl_deallocate(const char* arg_label, void* const 
arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0, + const Kokkos::Tools::SpaceHandle arg_handle = + Kokkos::Tools::make_space_handle(name())) const { + underlying_space.impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, + arg_logical_size, arg_handle); + } +}; +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +template <typename BaseSpace, typename DefaultBaseExecutionSpace, class Namer, + typename OtherSpace> +struct MemorySpaceAccess< + Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, + Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, + OtherSpace> { + enum { assignable = MemorySpaceAccess<BaseSpace, OtherSpace>::assignable }; + enum { accessible = MemorySpaceAccess<BaseSpace, OtherSpace>::accessible }; + enum { deepcopy = MemorySpaceAccess<BaseSpace, OtherSpace>::deepcopy }; +}; + +template <typename BaseSpace, typename DefaultBaseExecutionSpace, class Namer, + typename OtherSpace> +struct MemorySpaceAccess< + OtherSpace, + Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, + Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { + enum { assignable = MemorySpaceAccess<OtherSpace, BaseSpace>::assignable }; + enum { accessible = MemorySpaceAccess<OtherSpace, BaseSpace>::accessible }; + enum { deepcopy = MemorySpaceAccess<OtherSpace, BaseSpace>::deepcopy }; +}; + +template <typename BaseSpace, typename DefaultBaseExecutionSpace, class Namer> +struct MemorySpaceAccess< + Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, + Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, + Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, + Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { + enum { assignable 
= true }; + enum { accessible = true }; + enum { deepcopy = true }; +}; + +} // namespace Impl + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { +template <class BaseSpace, class DefaultBaseExecutionSpace, class Namer, + class SharesAccessSemanticsWithBase> +class SharedAllocationRecord<Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, + SharesAccessSemanticsWithBase>, + void> : public SharedAllocationRecord<void, void> { + private: + using SpaceType = + Kokkos::Experimental::LogicalMemorySpace<BaseSpace, + DefaultBaseExecutionSpace, Namer, + SharesAccessSemanticsWithBase>; + using RecordBase = SharedAllocationRecord<void, void>; + + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + + static void deallocate(RecordBase* arg_rec) { + delete static_cast<SharedAllocationRecord*>(arg_rec); + } + +#ifdef KOKKOS_ENABLE_DEBUG + /**\brief Root record for tracked allocations from this + * LogicalMemorySpace instance */ + static RecordBase s_root_record; +#endif + + const SpaceType m_space; + + protected: + ~SharedAllocationRecord() { + m_space.deallocate(RecordBase::m_alloc_ptr->m_label, + SharedAllocationRecord<void, void>::m_alloc_ptr, + SharedAllocationRecord<void, void>::m_alloc_size, + (SharedAllocationRecord<void, void>::m_alloc_size - + sizeof(SharedAllocationHeader))); + } + SharedAllocationRecord() = default; + + SharedAllocationRecord( + const SpaceType& arg_space, const std::string& arg_label, + const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &deallocate) + : SharedAllocationRecord<void, void>( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord<SpaceType, void>::s_root_record, +#endif + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + 
arg_alloc_size, arg_dealloc), + m_space(arg_space) { + // Fill in the Header information + RecordBase::m_alloc_ptr->m_record = + static_cast<SharedAllocationRecord<void, void>*>(this); + + strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), + SharedAllocationHeader::maximum_label_length - 1); + // Set last element zero, in case c_str is too long + RecordBase::m_alloc_ptr + ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; + } + + public: + inline std::string get_label() const { + return std::string(RecordBase::head()->m_label); + } + KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( + const SpaceType& arg_space, const std::string& arg_label, + const size_t arg_alloc_size) { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); +#else + (void)arg_space; + (void)arg_label; + (void)arg_alloc_size; + return (SharedAllocationRecord*)nullptr; +#endif + } + + /**\brief Allocate tracked memory in the space */ + static void* allocate_tracked(const SpaceType& arg_space, + const std::string& arg_label, + const size_t arg_alloc_size) { + if (!arg_alloc_size) return (void*)nullptr; + + SharedAllocationRecord* const r = + allocate(arg_space, arg_label, arg_alloc_size); + + RecordBase::increment(r); + + return r->data(); + } + + /**\brief Reallocate tracked memory in the space */ + static void* reallocate_tracked(void* const arg_alloc_ptr, + const size_t arg_alloc_size) { + SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); + SharedAllocationRecord* const r_new = + allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); + + Kokkos::Impl::DeepCopy<SpaceType, SpaceType>( + r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); + + RecordBase::increment(r_new); + RecordBase::decrement(r_old); + + return r_new->data(); + } + /**\brief Deallocate tracked memory in the space */ + static void deallocate_tracked(void* const 
arg_alloc_ptr) { + if (arg_alloc_ptr != nullptr) { + SharedAllocationRecord* const r = get_record(arg_alloc_ptr); + + RecordBase::decrement(r); + } + } + + static SharedAllocationRecord* get_record(void* alloc_ptr) { + using Header = SharedAllocationHeader; + using RecordHost = SharedAllocationRecord<SpaceType, void>; + + SharedAllocationHeader const* const head = + alloc_ptr ? Header::get_header(alloc_ptr) + : (SharedAllocationHeader*)nullptr; + RecordHost* const record = + head ? static_cast<RecordHost*>(head->m_record) : (RecordHost*)nullptr; + + if (!alloc_ptr || record->m_alloc_ptr != head) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::SharedAllocationRecord< LogicalMemorySpace<> , " + "void >::get_record ERROR")); + } + + return record; + } +#ifdef KOKKOS_ENABLE_DEBUG + static void print_records(std::ostream& s, const SpaceType&, + bool detail = false) { + SharedAllocationRecord<void, void>::print_host_accessible_records( + s, "HostSpace", &s_root_record, detail); + } +#else + static void print_records(std::ostream&, const SpaceType&, + bool detail = false) { + (void)detail; + throw_runtime_exception( + "SharedAllocationRecord<HostSpace>::print_records only works " + "with KOKKOS_ENABLE_DEBUG enabled"); + } +#endif +}; +#ifdef KOKKOS_ENABLE_DEBUG +/**\brief Root record for tracked allocations from this LogicalSpace + * instance */ +template <class BaseSpace, class DefaultBaseExecutionSpace, class Namer, + class SharesAccessSemanticsWithBase> +SharedAllocationRecord<void, void> + SharedAllocationRecord<Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, + SharesAccessSemanticsWithBase>, + void>::s_root_record; +#endif + +} // namespace Impl + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +template <class Namer, class BaseSpace, class DefaultBaseExecutionSpace, + class SharesAccess, class 
ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, + Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, + ExecutionSpace> { + DeepCopy(void* dst, void* src, size_t n) { + DeepCopy<BaseSpace, BaseSpace, ExecutionSpace>(dst, src, n); + } + DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { + DeepCopy<BaseSpace, BaseSpace, ExecutionSpace>(exec, dst, src, n); + } +}; + +template <class Namer, class BaseSpace, class DefaultBaseExecutionSpace, + class SharesAccess, class ExecutionSpace, class SourceSpace> +struct DeepCopy<SourceSpace, + Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, + ExecutionSpace> { + DeepCopy(void* dst, void* src, size_t n) { + DeepCopy<SourceSpace, BaseSpace, ExecutionSpace>(dst, src, n); + } + DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { + DeepCopy<SourceSpace, BaseSpace, ExecutionSpace>(exec, dst, src, n); + } +}; + +template <class Namer, class BaseSpace, class DefaultBaseExecutionSpace, + class SharesAccess, class ExecutionSpace, class DestinationSpace> +struct DeepCopy<Kokkos::Experimental::LogicalMemorySpace< + BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, + DestinationSpace, ExecutionSpace> { + DeepCopy(void* dst, void* src, size_t n) { + DeepCopy<BaseSpace, DestinationSpace, ExecutionSpace>(dst, src, n); + } + DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { + DeepCopy<BaseSpace, DestinationSpace, ExecutionSpace>(exec, dst, src, n); + } +}; +} // namespace Impl + +} // namespace Kokkos +#endif // KOKKOS_LOGICALSPACES_HPP diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0d0185346540bf929b4305d6ad496b2f02e39c69 --- /dev/null +++ 
b/packages/kokkos/core/src/Kokkos_Macros.hpp @@ -0,0 +1,572 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MACROS_HPP +#define KOKKOS_MACROS_HPP + +//---------------------------------------------------------------------------- +/** Pick up configure / build options via #define macros: + * + * KOKKOS_ENABLE_CUDA Kokkos::Cuda execution and memory spaces + * KOKKOS_ENABLE_THREADS Kokkos::Threads execution space + * KOKKOS_ENABLE_HPX Kokkos::Experimental::HPX execution space + * KOKKOS_ENABLE_OPENMP Kokkos::OpenMP execution space + * KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget + * execution space KOKKOS_ENABLE_HWLOC HWLOC library is available. + * KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive! + * KOKKOS_ENABLE_MPI Negotiate MPI/execution space + * interactions. KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory + * space. + */ + +#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H +#include <KokkosCore_config.h> +#endif + +//---------------------------------------------------------------------------- +/** Pick up compiler specific #define macros: + * + * Macros for known compilers evaluate to an integral version value + * + * KOKKOS_COMPILER_NVCC + * KOKKOS_COMPILER_GNU + * KOKKOS_COMPILER_INTEL + * KOKKOS_COMPILER_IBM + * KOKKOS_COMPILER_CRAYC + * KOKKOS_COMPILER_APPLECC + * KOKKOS_COMPILER_CLANG + * KOKKOS_COMPILER_PGI + * KOKKOS_COMPILER_MSVC + * + * Macros for which compiler extension to use for atomics on intrinsice types + * + * KOKKOS_ENABLE_CUDA_ATOMICS + * KOKKOS_ENABLE_GNU_ATOMICS + * KOKKOS_ENABLE_INTEL_ATOMICS + * KOKKOS_ENABLE_OPENMP_ATOMICS + * + * A suite of 'KOKKOS_ENABLE_PRAGMA_...' are defined for internal use. + * + * Macros for marking functions to run in an execution space: + * + * KOKKOS_FUNCTION + * KOKKOS_INLINE_FUNCTION request compiler to inline + * KOKKOS_FORCEINLINE_FUNCTION force compiler to inline, use with care! 
+ */ + +//---------------------------------------------------------------------------- + +#if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \ + !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_HPX) && \ + !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) +#define KOKKOS_INTERNAL_NOT_PARALLEL +#endif + +#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + +#include <KokkosCore_Config_SetupBackend.hpp> + +//---------------------------------------------------------------------------- +// Mapping compiler built-ins to KOKKOS_COMPILER_*** macros + +#if defined(__NVCC__) +// NVIDIA compiler is being used. +// Code is parsed and separated into host and device code. +// Host code is compiled again with another compiler. +// Device code is compile to 'ptx'. +#define KOKKOS_COMPILER_NVCC __NVCC__ +#endif // #if defined( __NVCC__ ) + +#if !defined(KOKKOS_LAMBDA) +#define KOKKOS_LAMBDA [=] +#endif + +#if (defined(KOKKOS_ENABLE_CXX17) || defined(KOKKOS_ENABLE_CXX20)) && \ + !defined(KOKKOS_CLASS_LAMBDA) +#define KOKKOS_CLASS_LAMBDA [ =, *this ] +#endif + +//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. + +// Intel compiler for host code. 
+ +#if defined(__INTEL_COMPILER) +#define KOKKOS_COMPILER_INTEL __INTEL_COMPILER +#elif defined(__INTEL_LLVM_COMPILER) +#define KOKKOS_COMPILER_INTEL __INTEL_LLVM_COMPILER +#elif defined(__ICC) +// Old define +#define KOKKOS_COMPILER_INTEL __ICC +#elif defined(__ECC) +// Very old define +#define KOKKOS_COMPILER_INTEL __ECC +#endif + +// CRAY compiler for host code +#if defined(_CRAYC) +#define KOKKOS_COMPILER_CRAYC _CRAYC +#endif + +#if defined(__IBMCPP__) +// IBM C++ +#define KOKKOS_COMPILER_IBM __IBMCPP__ +#elif defined(__IBMC__) +#define KOKKOS_COMPILER_IBM __IBMC__ +#elif defined(__ibmxl_vrm__) // xlclang++ +#define KOKKOS_COMPILER_IBM __ibmxl_vrm__ +#endif + +#if defined(__APPLE_CC__) +#define KOKKOS_COMPILER_APPLECC __APPLE_CC__ +#endif + +#if defined(__clang__) && !defined(KOKKOS_COMPILER_INTEL) && \ + !defined(KOKKOS_COMPILER_IBM) +#define KOKKOS_COMPILER_CLANG \ + __clang_major__ * 100 + __clang_minor__ * 10 + __clang_patchlevel__ +#endif + +#if !defined(__clang__) && !defined(KOKKOS_COMPILER_INTEL) && defined(__GNUC__) +#define KOKKOS_COMPILER_GNU \ + __GNUC__ * 100 + __GNUC_MINOR__ * 10 + __GNUC_PATCHLEVEL__ + +#if (530 > KOKKOS_COMPILER_GNU) +#error "Compiling with GCC version earlier than 5.3.0 is not supported." +#endif +#endif + +#if defined(__PGIC__) +#define KOKKOS_COMPILER_PGI \ + __PGIC__ * 100 + __PGIC_MINOR__ * 10 + __PGIC_PATCHLEVEL__ + +#if (1740 > KOKKOS_COMPILER_PGI) +#error "Compiling with PGI version earlier than 17.4 is not supported." +#endif +#endif + +#if defined(_MSC_VER) && !defined(KOKKOS_COMPILER_INTEL) +#define KOKKOS_COMPILER_MSVC _MSC_VER +#endif + +#if defined(_OPENMP) +// Compiling with OpenMP. +// The value of _OPENMP is an integer value YYYYMM +// where YYYY and MM are the year and month designation +// of the supported OpenMP API version. 
+#endif // #if defined( _OPENMP ) + +//---------------------------------------------------------------------------- +// Intel compiler macros + +#if defined(KOKKOS_COMPILER_INTEL) +// FIXME_SYCL +#if !defined(KOKKOS_ENABLE_SYCL) +#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +#endif +#if (1800 > KOKKOS_COMPILER_INTEL) +#define KOKKOS_ENABLE_PRAGMA_SIMD 1 +#endif + +// FIXME_SYCL +#if !defined(KOKKOS_ENABLE_SYCL) +#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +#endif + +#if !defined(KOKKOS_MEMORY_ALIGNMENT) +#define KOKKOS_MEMORY_ALIGNMENT 64 +#endif + +#define KOKKOS_RESTRICT __restrict__ + +#ifndef KOKKOS_IMPL_ALIGN_PTR +#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size))) +#endif + +#if (1700 > KOKKOS_COMPILER_INTEL) +#error "Compiling with Intel version earlier than 17.0 is not supported." +#endif + +#if !defined(KOKKOS_ENABLE_ASM) && !defined(_WIN32) +#define KOKKOS_ENABLE_ASM 1 +#endif + +#if !defined(KOKKOS_IMPL_FORCEINLINE_FUNCTION) +#if !defined(_WIN32) +#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) +#define KOKKOS_IMPL_FORCEINLINE __attribute__((always_inline)) +#else +#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline +#endif +#endif + +#if defined(KOKKOS_ARCH_AVX512MIC) +#define KOKKOS_ENABLE_RFO_PREFETCH 1 +#if (KOKKOS_COMPILER_INTEL < 1800) && !defined(KOKKOS_KNL_USE_ASM_WORKAROUND) +#define KOKKOS_KNL_USE_ASM_WORKAROUND 1 +#endif +#endif + +#if (1800 > KOKKOS_COMPILER_INTEL) +#define KOKKOS_IMPL_INTEL_WORKAROUND_NOEXCEPT_SPECIFICATION_VIRTUAL_FUNCTION +#endif + +#if defined(__MIC__) +// Compiling for Xeon Phi +#endif +#endif + +//---------------------------------------------------------------------------- +// Cray compiler macros + +#if defined(KOKKOS_COMPILER_CRAYC) +#endif + +//---------------------------------------------------------------------------- +// IBM Compiler macros + +#if defined(KOKKOS_COMPILER_IBM) +#define 
KOKKOS_ENABLE_PRAGMA_UNROLL 1 +//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +//#define KOKKOS_ENABLE_PRAGMA_SIMD 1 + +#if !defined(KOKKOS_ENABLE_ASM) +#define KOKKOS_ENABLE_ASM 1 +#endif +#endif + +//---------------------------------------------------------------------------- +// CLANG compiler macros + +#if defined(KOKKOS_COMPILER_CLANG) +//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +//#define KOKKOS_ENABLE_PRAGMA_SIMD 1 + +#if !defined(KOKKOS_IMPL_FORCEINLINE_FUNCTION) +#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) +#define KOKKOS_IMPL_FORCEINLINE __attribute__((always_inline)) +#endif + +#if !defined(KOKKOS_IMPL_ALIGN_PTR) +#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((aligned(size))) +#endif + +#endif + +//---------------------------------------------------------------------------- +// GNU Compiler macros + +#if defined(KOKKOS_COMPILER_GNU) +//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +//#define KOKKOS_ENABLE_PRAGMA_SIMD 1 + +#if defined(KOKKOS_ARCH_AVX512MIC) +#define KOKKOS_ENABLE_RFO_PREFETCH 1 +#endif + +#if !defined(KOKKOS_IMPL_FORCEINLINE_FUNCTION) +#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) +#define KOKKOS_IMPL_FORCEINLINE __attribute__((always_inline)) +#endif + +#define KOKKOS_RESTRICT __restrict__ + +#if !defined(KOKKOS_ENABLE_ASM) && !defined(__PGIC__) && \ + (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__PPC64__)) +#define KOKKOS_ENABLE_ASM 1 +#endif +#endif + +//---------------------------------------------------------------------------- + +#if defined(KOKKOS_COMPILER_PGI) +#define KOKKOS_ENABLE_PRAGMA_UNROLL 
1 +#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +//#define KOKKOS_ENABLE_PRAGMA_SIMD 1 +#endif + +//---------------------------------------------------------------------------- + +#if defined(KOKKOS_COMPILER_NVCC) +#if defined(__CUDA_ARCH__) +#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +#endif +#endif + +//---------------------------------------------------------------------------- +// Define function marking macros if compiler specific macros are undefined: + +#if !defined(KOKKOS_IMPL_FORCEINLINE_FUNCTION) +#define KOKKOS_IMPL_FORCEINLINE_FUNCTION inline +#endif + +#if !defined(KOKKOS_IMPL_FORCEINLINE) +#define KOKKOS_IMPL_FORCEINLINE inline +#endif + +#if !defined(KOKKOS_IMPL_INLINE_FUNCTION) +#define KOKKOS_IMPL_INLINE_FUNCTION inline +#endif + +#if !defined(KOKKOS_IMPL_FUNCTION) +#define KOKKOS_IMPL_FUNCTION /**/ +#endif + +#if !defined(KOKKOS_INLINE_FUNCTION_DELETED) +#define KOKKOS_INLINE_FUNCTION_DELETED inline +#endif + +#if !defined(KOKKOS_DEFAULTED_FUNCTION) +#define KOKKOS_DEFAULTED_FUNCTION inline +#endif + +#if !defined(KOKKOS_IMPL_HOST_FUNCTION) +#define KOKKOS_IMPL_HOST_FUNCTION +#endif + +#if !defined(KOKKOS_IMPL_DEVICE_FUNCTION) +#define KOKKOS_IMPL_DEVICE_FUNCTION +#endif + +// Temporary solution for SYCL not supporting printf in kernels. +// Might disappear at any point once we have found another solution. +#if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF) +#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) printf(__VA_ARGS__) +#endif + +//---------------------------------------------------------------------------- +// Define final version of functions. 
This is so that clang tidy can find these +// macros more easily +#if defined(__clang_analyzer__) +#define KOKKOS_FUNCTION \ + KOKKOS_IMPL_FUNCTION __attribute__((annotate("KOKKOS_FUNCTION"))) +#define KOKKOS_INLINE_FUNCTION \ + KOKKOS_IMPL_INLINE_FUNCTION \ + __attribute__((annotate("KOKKOS_INLINE_FUNCTION"))) +#define KOKKOS_FORCEINLINE_FUNCTION \ + KOKKOS_IMPL_FORCEINLINE_FUNCTION \ + __attribute__((annotate("KOKKOS_FORCEINLINE_FUNCTION"))) +#else +#define KOKKOS_FUNCTION KOKKOS_IMPL_FUNCTION +#define KOKKOS_INLINE_FUNCTION KOKKOS_IMPL_INLINE_FUNCTION +#define KOKKOS_FORCEINLINE_FUNCTION KOKKOS_IMPL_FORCEINLINE_FUNCTION +#endif + +//---------------------------------------------------------------------------- +// Define empty macro for restrict if necessary: + +#if !defined(KOKKOS_RESTRICT) +#define KOKKOS_RESTRICT +#endif + +//---------------------------------------------------------------------------- +// Define Macro for alignment: + +#if !defined(KOKKOS_MEMORY_ALIGNMENT) +#define KOKKOS_MEMORY_ALIGNMENT 64 +#endif + +#if !defined(KOKKOS_MEMORY_ALIGNMENT_THRESHOLD) +#define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 1 +#endif + +#if !defined(KOKKOS_IMPL_ALIGN_PTR) +#define KOKKOS_IMPL_ALIGN_PTR(size) /* */ +#endif + +//---------------------------------------------------------------------------- +// Determine the default execution space for parallel dispatch. +// There is zero or one default execution space specified. + +#if 1 < ((defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HPX) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL) ? 
1 : 0)) +#error "More than one KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_* specified." +#endif + +// If default is not specified then chose from enabled execution spaces. +// Priority: CUDA, HIP, SYCL, OPENMPTARGET, OPENMP, THREADS, HPX, SERIAL +#if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA) +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET) +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS) +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HPX) +#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL) +#elif defined(KOKKOS_ENABLE_CUDA) +#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA +#elif defined(KOKKOS_ENABLE_HIP) +#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP +#if defined(__HIP__) +// mark that HIP-clang can use __host__ and __device__ +// as valid overload criteria +#define KOKKOS_IMPL_ENABLE_OVERLOAD_HOST_DEVICE +#endif +#elif defined(KOKKOS_ENABLE_SYCL) +#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL +#elif defined(KOKKOS_ENABLE_OPENMPTARGET) +#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET +#elif defined(KOKKOS_ENABLE_OPENMP) +#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP +#elif defined(KOKKOS_ENABLE_THREADS) +#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS +#elif defined(KOKKOS_ENABLE_HPX) +#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HPX +#else +#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL +#endif + +//---------------------------------------------------------------------------- +// Determine for what space the code is being compiled: + +#if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) +#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA +#elif defined(__SYCL_DEVICE_ONLY__) && defined(KOKKOS_ENABLE_SYCL) +#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL +#elif defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && \ + 
defined(KOKKOS_ENABLE_HIP) +#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU +#else +#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#endif + +//---------------------------------------------------------------------------- + +#if (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L) || \ + (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE >= 600) +#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN) +#define KOKKOS_ENABLE_POSIX_MEMALIGN 1 +#endif +#endif + +//---------------------------------------------------------------------------- +// If compiling with CUDA, we must use relocateable device code +// to enable the task policy. + +#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) +#define KOKKOS_ENABLE_TASKDAG +#endif +#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) +#define KOKKOS_ENABLE_TASKDAG +#endif + +#if defined(KOKKOS_ENABLE_CUDA) +#define KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND +#if (__CUDA_ARCH__) +#define KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK +#endif +#endif + +#define KOKKOS_INVALID_INDEX (~std::size_t(0)) + +#define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX + +#define KOKKOS_CONSTEXPR_14 constexpr +#define KOKKOS_DEPRECATED [[deprecated]] +#define KOKKOS_DEPRECATED_TRAILING_ATTRIBUTE + +// DJS 05/28/2019: Bugfix: Issue 2155 +// Use KOKKOS_ENABLE_CUDA_LDG_INTRINSIC to avoid memory leak in RandomAccess +// View +#if defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC) +#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC +#endif + +#if defined(KOKKOS_ENABLE_CXX17) || defined(KOKKOS_ENABLE_CXX20) +#define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] +#else +#define KOKKOS_ATTRIBUTE_NODISCARD +#endif + +#if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ + defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_PGI)) && \ + !defined(KOKKOS_COMPILER_MSVC) +#define KOKKOS_IMPL_ENABLE_STACKTRACE +#define KOKKOS_IMPL_ENABLE_CXXABI +#endif + +// WORKAROUND for AMD aomp 
which apparently defines CUDA_ARCH when building for +// AMD GPUs with OpenMP Target ??? +#if defined(__CUDA_ARCH__) && !defined(__CUDACC__) && \ + !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_CUDA) +#undef __CUDA_ARCH__ +#endif + +#if defined(KOKKOS_COMPILER_MSVC) && !defined(KOKKOS_COMPILER_CLANG) +#define KOKKOS_THREAD_LOCAL __declspec(thread) +#else +#define KOKKOS_THREAD_LOCAL __thread +#endif + +#if (defined(KOKKOS_IMPL_WINDOWS_CUDA) || defined(KOKKOS_COMPILER_MSVC)) && \ + !defined(KOKKOS_COMPILER_CLANG) +// MSVC (as of 16.5.5 at least) does not do empty base class optimization by +// default when there are multiple bases, even though the standard requires it +// for standard layout types. +#define KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION __declspec(empty_bases) +#else +#define KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION +#endif + +#endif // #ifndef KOKKOS_MACROS_HPP diff --git a/packages/kokkos/core/src/Kokkos_MasterLock.hpp b/packages/kokkos/core/src/Kokkos_MasterLock.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3c45e131a0fba6e39f3f97ef2fd67451b9aef76c --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_MasterLock.hpp @@ -0,0 +1,75 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MASTER_LOCK_HPP +#define KOKKOS_MASTER_LOCK_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { +namespace Experimental { + +// my be used to coordinate work between master instances +// SHOULD NOT be used within a parallel algorithm +// +// This lock should be used with with a scoped lock guard +// i.e. 
std::unique_lock<Lock>, std::lock_guard +// +// cannot be copied or moved +// has the following functions available +// +// Lock() +// ~Lock() +// +// void lock() +// void unlock() +// bool try_lock() +// +template <typename ExecutionSpace> +class MasterLock; + +} // namespace Experimental +} // namespace Kokkos + +#endif // KOKKOS_MASTER_LOCK_HPP diff --git a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..50223651e7d189e07cd94f9bf48eb6c5dcaa62d2 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp @@ -0,0 +1,233 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_HPP +#define KOKKOS_MATHEMATICAL_FUNCTIONS_HPP + +#include <Kokkos_Macros.hpp> +#include <cmath> +#include <algorithm> +#include <type_traits> + +#ifdef KOKKOS_ENABLE_SYCL +#include <CL/sycl.hpp> +#endif + +namespace Kokkos { +namespace Experimental { + +#if defined(KOKKOS_ENABLE_SYCL) +#define NAMESPACE_MATH_FUNCTIONS sycl +#else +#define NAMESPACE_MATH_FUNCTIONS std +#endif + +#define KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, RETURNTYPE, ARGTYPE) \ + KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(ARGTYPE x) { \ + using NAMESPACE_MATH_FUNCTIONS::FUNC; \ + return FUNC(x); \ + } + +#define KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, RETURNTYPE) \ + template <typename Integer, \ + typename = std::enable_if_t<std::is_integral<Integer>::value>> \ + KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(Integer x) { \ + return Kokkos::Experimental::FUNC(static_cast<double>(x)); \ + } + +#define KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, TYPE) \ + KOKKOS_INLINE_FUNCTION TYPE FUNC(TYPE x, TYPE y) { \ + using NAMESPACE_MATH_FUNCTIONS::FUNC; \ + return FUNC(x, y); \ + } + +// NOTE long double overloads are not available on the device +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || 
defined(KOKKOS_ENABLE_OPENMPTARGET) + +#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC) \ + template <typename Arithmetic1, typename Arithmetic2, \ + typename = std::enable_if_t< \ + std::is_arithmetic<Arithmetic1>::value && \ + std::is_arithmetic<Arithmetic2>::value && \ + !std::is_same<Arithmetic1, long double>::value && \ + !std::is_same<Arithmetic2, long double>::value>> \ + KOKKOS_INLINE_FUNCTION double FUNC(Arithmetic1 x, Arithmetic2 y) { \ + return Kokkos::Experimental::FUNC( \ + static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \ + double, Arithmetic1>>(x), \ + static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \ + double, Arithmetic2>>(y)); \ + } + +#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double) \ + KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double) + +#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double) \ + KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool) + +#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC) \ + KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float) \ + KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double) \ + KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC) + +#define KOKKOS_IMPL_MATH_NAN() \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*) + +#else // long double overloads are available + +#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC) \ + template <typename Arithmetic1, typename Arithmetic2, \ + typename = \ + std::enable_if_t<std::is_arithmetic<Arithmetic1>::value && \ + std::is_arithmetic<Arithmetic2>::value>, \ + typename Promoted = std::conditional_t< \ + std::is_same<Arithmetic1, long double>::value || \ + std::is_same<Arithmetic2, long 
double>::value, \ + long double, double>> \ + KOKKOS_INLINE_FUNCTION Promoted FUNC(Arithmetic1 x, Arithmetic2 y) { \ + return Kokkos::Experimental::FUNC( \ + static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \ + double, Arithmetic1>>(x), \ + static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \ + double, Arithmetic2>>(y)); \ + } + +#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, long double, long double) \ + KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double) + +#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, long double) \ + KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool) + +#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC) \ + KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float) \ + KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double) \ + KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, long double) \ + KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC) + +#define KOKKOS_IMPL_MATH_NAN() \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*) \ + KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanl, long double, char const*) + +#endif + +// Basic operations +KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs) +KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod) +KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder) +KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmin) +KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmax) +KOKKOS_IMPL_MATH_BINARY_FUNCTION(fdim) +#ifndef KOKKOS_ENABLE_SYCL +KOKKOS_IMPL_MATH_NAN() +#endif +// Power functions +KOKKOS_IMPL_MATH_BINARY_FUNCTION(pow) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(sqrt) 
+KOKKOS_IMPL_MATH_UNARY_FUNCTION(cbrt) +KOKKOS_IMPL_MATH_BINARY_FUNCTION(hypot) +// Exponential functions +KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp2) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(expm1) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(log) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(log10) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(log2) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(log1p) +// Trigonometric functions +KOKKOS_IMPL_MATH_UNARY_FUNCTION(sin) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(cos) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(tan) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(asin) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(acos) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(atan) +KOKKOS_IMPL_MATH_BINARY_FUNCTION(atan2) +// Hyperbolic functions +KOKKOS_IMPL_MATH_UNARY_FUNCTION(sinh) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(cosh) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(tanh) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(asinh) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(acosh) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(atanh) +// Error and gamma functions +KOKKOS_IMPL_MATH_UNARY_FUNCTION(erf) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(erfc) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(tgamma) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(lgamma) +// Nearest integer floating point operations +KOKKOS_IMPL_MATH_UNARY_FUNCTION(ceil) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(floor) +KOKKOS_IMPL_MATH_UNARY_FUNCTION(trunc) +#ifndef KOKKOS_ENABLE_SYCL +KOKKOS_IMPL_MATH_UNARY_FUNCTION(nearbyint) +#endif +// Classification and comparison +KOKKOS_IMPL_MATH_UNARY_PREDICATE(isfinite) +KOKKOS_IMPL_MATH_UNARY_PREDICATE(isinf) +KOKKOS_IMPL_MATH_UNARY_PREDICATE(isnan) + +#undef KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT +#undef KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL +#undef KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT +#undef KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC +#undef KOKKOS_IMPL_MATH_UNARY_FUNCTION +#undef KOKKOS_IMPL_MATH_UNARY_PREDICATE +#undef KOKKOS_IMPL_MATH_BINARY_FUNCTION +#undef KOKKOS_IMPL_MATH_NAN +} // namespace Experimental +} // namespace Kokkos + +#endif diff --git 
a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2cafac1aea462ec29fe1d1cb853cb374ea7e8109 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -0,0 +1,834 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MEMORYPOOL_HPP +#define KOKKOS_MEMORYPOOL_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Atomic.hpp> +#include <impl/Kokkos_ConcurrentBitset.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_SharedAlloc.hpp> + +#include <iostream> + +namespace Kokkos { +namespace Impl { +/* Report violation of size constraints: + * min_block_alloc_size <= max_block_alloc_size + * max_block_alloc_size <= min_superblock_size + * min_superblock_size <= max_superblock_size + * min_superblock_size <= min_total_alloc_size + * min_superblock_size <= min_block_alloc_size * + * max_block_per_superblock + */ +void memory_pool_bounds_verification(size_t min_block_alloc_size, + size_t max_block_alloc_size, + size_t min_superblock_size, + size_t max_superblock_size, + size_t max_block_per_superblock, + size_t min_total_alloc_size); +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +namespace Impl { + +void _print_memory_pool_state(std::ostream &s, uint32_t const *sb_state_ptr, + int32_t sb_count, uint32_t sb_size_lg2, + uint32_t sb_state_size, uint32_t state_shift, + uint32_t state_used_mask); + +} // end namespace Impl + +template <typename DeviceType> +class MemoryPool { + private: + using CB = 
Kokkos::Impl::concurrent_bitset; + + enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 }; + enum : uint32_t { state_shift = CB::state_shift }; + enum : uint32_t { state_used_mask = CB::state_used_mask }; + enum : uint32_t { state_header_mask = CB::state_header_mask }; + enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 }; + enum : uint32_t { max_bit_count = CB::max_bit_count }; + + enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 }; + + /* Each superblock has a concurrent bitset state + * which is an array of uint32_t integers. + * [ { block_count_lg2 : state_shift bits + * , used_block_count : ( 32 - state_shift ) bits + * } + * , { block allocation bit set }* ] + * + * As superblocks are assigned (allocated) to a block size + * and released (deallocated) back to empty the superblock state + * is concurrently updated. + */ + + /* Mapping between block_size <-> block_state + * + * block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift + * block_size = m_sb_size_lg2 - ( block_state >> state_shift ) + * + * Thus A_block_size < B_block_size <=> A_block_state > B_block_state + */ + + using base_memory_space = typename DeviceType::memory_space; + + enum { + accessible = Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + base_memory_space>::accessible + }; + + using Tracker = Kokkos::Impl::SharedAllocationTracker; + using Record = Kokkos::Impl::SharedAllocationRecord<base_memory_space>; + + Tracker m_tracker; + uint32_t *m_sb_state_array; + uint32_t m_sb_state_size; + uint32_t m_sb_size_lg2; + uint32_t m_max_block_size_lg2; + uint32_t m_min_block_size_lg2; + int32_t m_sb_count; + int32_t m_hint_offset; // Offset to K * #block_size array of hints + int32_t m_data_offset; // Offset to 0th superblock data + int32_t m_unused_padding; + + public: + using memory_space = typename DeviceType::memory_space; + + /**\brief The maximum size of a superblock and block */ + enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ }; + enum : uint32_t { 
max_block_per_superblock = max_bit_count }; + + //-------------------------------------------------------------------------- + + KOKKOS_INLINE_FUNCTION + bool operator==(MemoryPool const &other) const { + return m_sb_state_array == other.m_sb_state_array; + } + + KOKKOS_INLINE_FUNCTION + size_t capacity() const noexcept { + return size_t(m_sb_count) << m_sb_size_lg2; + } + + KOKKOS_INLINE_FUNCTION + size_t min_block_size() const noexcept { + return (1LU << m_min_block_size_lg2); + } + + KOKKOS_INLINE_FUNCTION + size_t max_block_size() const noexcept { + return (1LU << m_max_block_size_lg2); + } + + struct usage_statistics { + size_t capacity_bytes; ///< Capacity in bytes + size_t superblock_bytes; ///< Superblock size in bytes + size_t max_block_bytes; ///< Maximum block size in bytes + size_t min_block_bytes; ///< Minimum block size in bytes + size_t capacity_superblocks; ///< Number of superblocks + size_t consumed_superblocks; ///< Superblocks assigned to allocations + size_t consumed_blocks; ///< Number of allocations + size_t consumed_bytes; ///< Bytes allocated + size_t reserved_blocks; ///< Unallocated blocks in assigned superblocks + size_t reserved_bytes; ///< Unallocated bytes in assigned superblocks + }; + + void get_usage_statistics(usage_statistics &stats) const { + Kokkos::HostSpace host; + + const size_t alloc_size = m_hint_offset * sizeof(uint32_t); + + uint32_t *const sb_state_array = + accessible ? 
m_sb_state_array : (uint32_t *)host.allocate(alloc_size); + + if (!accessible) { + Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>( + sb_state_array, m_sb_state_array, alloc_size); + } + + stats.superblock_bytes = (1LU << m_sb_size_lg2); + stats.max_block_bytes = (1LU << m_max_block_size_lg2); + stats.min_block_bytes = (1LU << m_min_block_size_lg2); + stats.capacity_bytes = stats.superblock_bytes * m_sb_count; + stats.capacity_superblocks = m_sb_count; + stats.consumed_superblocks = 0; + stats.consumed_blocks = 0; + stats.consumed_bytes = 0; + stats.reserved_blocks = 0; + stats.reserved_bytes = 0; + + const uint32_t *sb_state_ptr = sb_state_array; + + for (int32_t i = 0; i < m_sb_count; ++i, sb_state_ptr += m_sb_state_size) { + const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift; + + if (block_count_lg2) { + const uint32_t block_count = 1u << block_count_lg2; + const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2; + const uint32_t block_size = 1u << block_size_lg2; + const uint32_t block_used = (*sb_state_ptr) & state_used_mask; + + stats.consumed_superblocks++; + stats.consumed_blocks += block_used; + stats.consumed_bytes += block_used * block_size; + stats.reserved_blocks += block_count - block_used; + stats.reserved_bytes += (block_count - block_used) * block_size; + } + } + + if (!accessible) { + host.deallocate(sb_state_array, alloc_size); + } + } + + void print_state(std::ostream &s) const { + Kokkos::HostSpace host; + + const size_t alloc_size = m_hint_offset * sizeof(uint32_t); + + uint32_t *const sb_state_array = + accessible ? 
m_sb_state_array : (uint32_t *)host.allocate(alloc_size); + + if (!accessible) { + Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>( + sb_state_array, m_sb_state_array, alloc_size); + } + + Impl::_print_memory_pool_state(s, sb_state_array, m_sb_count, m_sb_size_lg2, + m_sb_state_size, state_shift, + state_used_mask); + + if (!accessible) { + host.deallocate(sb_state_array, alloc_size); + } + } + + //-------------------------------------------------------------------------- + + KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default; + + KOKKOS_INLINE_FUNCTION MemoryPool() + : m_tracker(), + m_sb_state_array(nullptr), + m_sb_state_size(0), + m_sb_size_lg2(0), + m_max_block_size_lg2(0), + m_min_block_size_lg2(0), + m_sb_count(0), + m_hint_offset(0), + m_data_offset(0), + m_unused_padding(0) {} + + /**\brief Allocate a memory pool from 'memspace'. + * + * The memory pool will have at least 'min_total_alloc_size' bytes + * of memory to allocate divided among superblocks of at least + * 'min_superblock_size' bytes. A single allocation must fit + * within a single superblock, so 'min_superblock_size' must be + * at least as large as the maximum single allocation. + * Both 'min_total_alloc_size' and 'min_superblock_size' + * are rounded up to the smallest power-of-two value that + * contains the corresponding sizes. + * Individual allocations will always consume a block of memory that + * is also a power-of-two. These roundings are made to enable + * significant runtime performance improvements. 
+ */ + MemoryPool(const base_memory_space &memspace, + const size_t min_total_alloc_size, size_t min_block_alloc_size = 0, + size_t max_block_alloc_size = 0, size_t min_superblock_size = 0) + : m_tracker(), + m_sb_state_array(nullptr), + m_sb_state_size(0), + m_sb_size_lg2(0), + m_max_block_size_lg2(0), + m_min_block_size_lg2(0), + m_sb_count(0), + m_hint_offset(0), + m_data_offset(0), + m_unused_padding(0) { + const uint32_t int_align_lg2 = 3; /* align as int[8] */ + const uint32_t int_align_mask = (1u << int_align_lg2) - 1; + const uint32_t default_min_block_size = 1u << 6; /* 64 bytes */ + const uint32_t default_max_block_size = 1u << 12; /* 4k bytes */ + const uint32_t default_min_superblock_size = 1u << 20; /* 1M bytes */ + + //-------------------------------------------------- + // Default block and superblock sizes: + + if (0 == min_block_alloc_size) { + // Default all sizes: + + min_superblock_size = + std::min(size_t(default_min_superblock_size), min_total_alloc_size); + + min_block_alloc_size = + std::min(size_t(default_min_block_size), min_superblock_size); + + max_block_alloc_size = + std::min(size_t(default_max_block_size), min_superblock_size); + } else if (0 == min_superblock_size) { + // Choose superblock size as minimum of: + // max_block_per_superblock * min_block_size + // max_superblock_size + // min_total_alloc_size + + const size_t max_superblock = + min_block_alloc_size * max_block_per_superblock; + + min_superblock_size = + std::min(max_superblock, + std::min(size_t(max_superblock_size), min_total_alloc_size)); + } + + if (0 == max_block_alloc_size) { + max_block_alloc_size = min_superblock_size; + } + + //-------------------------------------------------- + + /* Enforce size constraints: + * min_block_alloc_size <= max_block_alloc_size + * max_block_alloc_size <= min_superblock_size + * min_superblock_size <= max_superblock_size + * min_superblock_size <= min_total_alloc_size + * min_superblock_size <= min_block_alloc_size * + * 
max_block_per_superblock + */ + + Kokkos::Impl::memory_pool_bounds_verification( + min_block_alloc_size, max_block_alloc_size, min_superblock_size, + max_superblock_size, max_block_per_superblock, min_total_alloc_size); + + //-------------------------------------------------- + // Block and superblock size is power of two: + // Maximum value is 'max_superblock_size' + + m_min_block_size_lg2 = + Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size); + + m_max_block_size_lg2 = + Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size); + + m_sb_size_lg2 = + Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size); + + { + // number of superblocks is multiple of superblock size that + // can hold min_total_alloc_size. + + const uint64_t sb_size_mask = (1LU << m_sb_size_lg2) - 1; + + m_sb_count = (min_total_alloc_size + sb_size_mask) >> m_sb_size_lg2; + } + + { + // Any superblock can be assigned to the smallest size block + // Size the block bitset to maximum number of blocks + + const uint32_t max_block_count_lg2 = m_sb_size_lg2 - m_min_block_size_lg2; + + m_sb_state_size = + (CB::buffer_bound_lg2(max_block_count_lg2) + int_align_mask) & + ~int_align_mask; + } + + // Array of all superblock states + + const size_t all_sb_state_size = + (m_sb_count * m_sb_state_size + int_align_mask) & ~int_align_mask; + + // Number of block sizes + + const int32_t number_block_sizes = + 1 + m_max_block_size_lg2 - m_min_block_size_lg2; + + // Array length for possible block sizes + // Hint array is one uint32_t per block size + + const int32_t block_size_array_size = + (number_block_sizes + int_align_mask) & ~int_align_mask; + + m_hint_offset = all_sb_state_size; + m_data_offset = m_hint_offset + block_size_array_size * HINT_PER_BLOCK_SIZE; + + // Allocation: + + const size_t header_size = m_data_offset * sizeof(uint32_t); + const size_t alloc_size = + header_size + (size_t(m_sb_count) << m_sb_size_lg2); + + Record *rec = 
Record::allocate(memspace, "Kokkos::MemoryPool", alloc_size); + + m_tracker.assign_allocated_record_to_uninitialized(rec); + + m_sb_state_array = (uint32_t *)rec->data(); + + Kokkos::HostSpace host; + + uint32_t *const sb_state_array = + accessible ? m_sb_state_array : (uint32_t *)host.allocate(header_size); + + for (int32_t i = 0; i < m_data_offset; ++i) sb_state_array[i] = 0; + + // Initial assignment of empty superblocks to block sizes: + + for (int32_t i = 0; i < number_block_sizes; ++i) { + const uint32_t block_size_lg2 = i + m_min_block_size_lg2; + const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2; + const uint32_t block_state = block_count_lg2 << state_shift; + const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE; + + // for block size index 'i': + // sb_id_hint = sb_state_array[ hint_begin ]; + // sb_id_begin = sb_state_array[ hint_begin + 1 ]; + + const int32_t jbeg = (i * m_sb_count) / number_block_sizes; + const int32_t jend = ((i + 1) * m_sb_count) / number_block_sizes; + + sb_state_array[hint_begin] = uint32_t(jbeg); + sb_state_array[hint_begin + 1] = uint32_t(jbeg); + + for (int32_t j = jbeg; j < jend; ++j) { + sb_state_array[j * m_sb_state_size] = block_state; + } + } + + // Write out initialized state: + + if (!accessible) { + Kokkos::Impl::DeepCopy<base_memory_space, Kokkos::HostSpace>( + m_sb_state_array, sb_state_array, header_size); + + host.deallocate(sb_state_array, header_size); + } else { + Kokkos::memory_fence(); + } + } + + //-------------------------------------------------------------------------- + + private: + /* Given a size 'n' get the block size in which it can be allocated. + * Restrict lower bound to minimum block size. + */ + KOKKOS_FORCEINLINE_FUNCTION + uint32_t get_block_size_lg2(uint32_t n) const noexcept { + const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains(n); + + return i < m_min_block_size_lg2 ? 
m_min_block_size_lg2 : i; + } + + public: + /* Return 0 for invalid block size */ + KOKKOS_INLINE_FUNCTION + uint32_t allocate_block_size(uint64_t alloc_size) const noexcept { + return alloc_size <= (1UL << m_max_block_size_lg2) + ? (1UL << get_block_size_lg2(uint32_t(alloc_size))) + : 0; + } + + //-------------------------------------------------------------------------- + /**\brief Allocate a block of memory that is at least 'alloc_size' + * + * The block of memory is aligned to the minimum block size, + * currently is 64 bytes, will never be less than 32 bytes. + * + * If concurrent allocations and deallocations are taking place + * then a single allocation attempt may fail due to lack of available space. + * The allocation attempt will try up to 'attempt_limit' times. + */ + KOKKOS_FUNCTION + void *allocate(size_t alloc_size, int32_t attempt_limit = 1) const noexcept { + if (size_t(1LU << m_max_block_size_lg2) < alloc_size) { + Kokkos::abort( + "Kokkos MemoryPool allocation request exceeded specified maximum " + "allocation size"); + } + + if (0 == alloc_size) return nullptr; + + void *p = nullptr; + + const uint32_t block_size_lg2 = get_block_size_lg2(alloc_size); + + // Allocation will fit within a superblock + // that has block sizes ( 1 << block_size_lg2 ) + + const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2; + const uint32_t block_state = block_count_lg2 << state_shift; + const uint32_t block_count = 1u << block_count_lg2; + + // Superblock hints for this block size: + // hint_sb_id_ptr[0] is the dynamically changing hint + // hint_sb_id_ptr[1] is the static start point + + volatile uint32_t *const hint_sb_id_ptr = + m_sb_state_array /* memory pool state array */ + + m_hint_offset /* offset to hint portion of array */ + + HINT_PER_BLOCK_SIZE /* number of hints per block size */ + * (block_size_lg2 - m_min_block_size_lg2); /* block size id */ + + const int32_t sb_id_begin = int32_t(hint_sb_id_ptr[1]); + + // Fast query clock register 'tic' to 
pseudo-randomize + // the guess for which block within a superblock should + // be claimed. If not available then a search occurs. +#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GEN) + const uint32_t block_id_hint = alloc_size; +#else + const uint32_t block_id_hint = + (uint32_t)(Kokkos::Impl::clock_tic() +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) + // Spread out potentially concurrent access + // by threads within a warp or thread block. + + (threadIdx.x + blockDim.x * threadIdx.y) +#endif + ); +#endif + + // expected state of superblock for allocation + uint32_t sb_state = block_state; + + int32_t sb_id = -1; + + volatile uint32_t *sb_state_array = nullptr; + + while (attempt_limit) { + int32_t hint_sb_id = -1; + + if (sb_id < 0) { + // No superblock specified, try the hint for this block size + + sb_id = hint_sb_id = int32_t(*hint_sb_id_ptr); + + sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size); + } + + // Require: + // 0 <= sb_id + // sb_state_array == m_sb_state_array + m_sb_state_size * sb_id + + if (sb_state == (state_header_mask & *sb_state_array)) { + // This superblock state is as expected, for the moment. + // Attempt to claim a bit. The attempt updates the state + // so have already made sure the state header is as expected. + + const uint32_t count_lg2 = sb_state >> state_shift; + const uint32_t mask = (1u << count_lg2) - 1; + + const Kokkos::pair<int, int> result = CB::acquire_bounded_lg2( + sb_state_array, count_lg2, block_id_hint & mask, sb_state); + + // If result.first < 0 then failed to acquire + // due to either full or buffer was wrong state. + // Could be wrong state if a deallocation raced the + // superblock to empty before the acquire could succeed. 
+ + if (0 <= result.first) { // acquired a bit + + const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2; + + // Set the allocated block pointer + + p = ((char *)(m_sb_state_array + m_data_offset)) + + (uint64_t(sb_id) << m_sb_size_lg2) // superblock memory + + (uint64_t(result.first) << size_lg2); // block memory + +#if 0 + printf( " MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n" + , (uintptr_t)m_sb_state_array + , (uintptr_t)p + , alloc_size + , sb_id + , sb_state + , (1u << size_lg2) + , (1u << count_lg2) + , result.first + , result.second ); +#endif + + break; // Success + } + } + //------------------------------------------------------------------ + // Arrive here if failed to acquire a block. + // Must find a new superblock. + + // Start searching at designated index for this block size. + // Look for superblock that, in preferential order, + // 1) part-full superblock of this block size + // 2) empty superblock to claim for this block size + // 3) part-full superblock of the next larger block size + + sb_state = block_state; // Expect to find the desired state + sb_id = -1; + + bool update_hint = false; + int32_t sb_id_empty = -1; + int32_t sb_id_large = -1; + uint32_t sb_state_large = 0; + + sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size; + + for (int32_t i = 0, id = sb_id_begin; i < m_sb_count; ++i) { + // Query state of the candidate superblock. + // Note that the state may change at any moment + // as concurrent allocations and deallocations occur. + + const uint32_t full_state = *sb_state_array; + const uint32_t used = full_state & state_used_mask; + const uint32_t state = full_state & state_header_mask; + + if (state == block_state) { + // Superblock is assigned to this block size + + if (used < block_count) { + // There is room to allocate one block + + sb_id = id; + + // Is there room to allocate more than one block? 
+ + update_hint = used + 1 < block_count; + + break; + } + } else if (0 == used) { + // Superblock is empty + + if (-1 == sb_id_empty) { + // Superblock is not assigned to this block size + // and is the first empty superblock encountered. + // Save this id to use if a partfull superblock is not found. + + sb_id_empty = id; + } + } else if ((-1 == sb_id_empty /* have not found an empty */) && + (-1 == sb_id_large /* have not found a larger */) && + (state < block_state /* a larger block */) && + // is not full: + (used < (1u << (state >> state_shift)))) { + // First superblock encountered that is + // larger than this block size and + // has room for an allocation. + // Save this id to use of partfull or empty superblock not found + sb_id_large = id; + sb_state_large = state; + } + + // Iterate around the superblock array: + + if (++id < m_sb_count) { + sb_state_array += m_sb_state_size; + } else { + id = 0; + sb_state_array = m_sb_state_array; + } + } + + // printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) + // sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large); + + if (sb_id < 0) { + // Did not find a partfull superblock for this block size. + + if (0 <= sb_id_empty) { + // Found first empty superblock following designated superblock + // Attempt to claim it for this block size. + // If the claim fails assume that another thread claimed it + // for this block size and try to use it anyway, + // but do not update hint. + + sb_id = sb_id_empty; + + sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size); + + // If successfully changed assignment of empty superblock 'sb_id' + // to this block_size then update the hint. 
+ + const uint32_t state_empty = state_header_mask & *sb_state_array; + + // If this thread claims the empty block then update the hint + update_hint = + state_empty == Kokkos::atomic_compare_exchange( + sb_state_array, state_empty, block_state); + } else if (0 <= sb_id_large) { + // Found a larger superblock with space available + + sb_id = sb_id_large; + sb_state = sb_state_large; + + sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size); + } else { + // Did not find a potentially usable superblock + --attempt_limit; + } + } + + if (update_hint) { + Kokkos::atomic_compare_exchange(hint_sb_id_ptr, uint32_t(hint_sb_id), + uint32_t(sb_id)); + } + } // end allocation attempt loop + //-------------------------------------------------------------------- + + return p; + } + // end allocate + //-------------------------------------------------------------------------- + + /**\brief Return an allocated block of memory to the pool. + * + * Requires: p is return value from allocate( alloc_size ); + * + * For now the alloc_size is ignored. + */ + KOKKOS_INLINE_FUNCTION + void deallocate(void *p, size_t /* alloc_size */) const noexcept { + if (nullptr == p) return; + + // Determine which superblock and block + const ptrdiff_t d = + ((char *)p) - ((char *)(m_sb_state_array + m_data_offset)); + + // Verify contained within the memory pool's superblocks: + const int ok_contains = + (0 <= d) && (size_t(d) < (size_t(m_sb_count) << m_sb_size_lg2)); + + int ok_block_aligned = 0; + int ok_dealloc_once = 0; + + if (ok_contains) { + const int sb_id = d >> m_sb_size_lg2; + + // State array for the superblock. 
+ volatile uint32_t *const sb_state_array = + m_sb_state_array + (sb_id * m_sb_state_size); + + const uint32_t block_state = (*sb_state_array) & state_header_mask; + const uint32_t block_size_lg2 = + m_sb_size_lg2 - (block_state >> state_shift); + + ok_block_aligned = 0 == (d & ((1UL << block_size_lg2) - 1)); + + if (ok_block_aligned) { + // Map address to block's bit + // mask into superblock and then shift down for block index + + const uint32_t bit = + (d & (ptrdiff_t(1LU << m_sb_size_lg2) - 1)) >> block_size_lg2; + + const int result = CB::release(sb_state_array, bit, block_state); + + ok_dealloc_once = 0 <= result; + +#if 0 + printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n" + , (uintptr_t)m_sb_state_array + , (uintptr_t)p + , sb_id + , (1u << block_size_lg2) + , (1u << (m_sb_size_lg2 - block_size_lg2)) + , bit + , result ); +#endif + } + } + + if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) { +#if 0 + printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n" + , (uintptr_t)m_sb_state_array + , (uintptr_t)p + , int(ok_contains) + , int(ok_block_aligned) + , int(ok_dealloc_once) ); +#endif + Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer"); + } + } + // end deallocate + //-------------------------------------------------------------------------- + + KOKKOS_INLINE_FUNCTION + int number_of_superblocks() const noexcept { return m_sb_count; } + + KOKKOS_INLINE_FUNCTION + void superblock_state(int sb_id, int &block_size, int &block_count_capacity, + int &block_count_used) const noexcept { + block_size = 0; + block_count_capacity = 0; + block_count_used = 0; + + if (Kokkos::Impl::MemorySpaceAccess< + Kokkos::Impl::ActiveExecutionMemorySpace, + base_memory_space>::accessible) { + // Can access the state array + + const uint32_t state = + ((uint32_t volatile *)m_sb_state_array)[sb_id * 
m_sb_state_size]; + + const uint32_t block_count_lg2 = state >> state_shift; + const uint32_t block_used = state & state_used_mask; + + block_size = 1LU << (m_sb_size_lg2 - block_count_lg2); + block_count_capacity = 1LU << block_count_lg2; + block_count_used = block_used; + } + } +}; + +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_MEMORYPOOL_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f23442b793f5eeca8e0c1b22df6468271df96b73 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp @@ -0,0 +1,125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MEMORYTRAITS_HPP +#define KOKKOS_MEMORYTRAITS_HPP + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Tags.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Memory access traits for views, an extension point. + * + * These traits should be orthogonal. If there are dependencies then + * the MemoryTraits template must detect and enforce dependencies. + * + * A zero value is the default for a View, indicating that none of + * these traits are present. + */ +enum MemoryTraitsFlags { + Unmanaged = 0x01, + RandomAccess = 0x02, + Atomic = 0x04, + Restrict = 0x08, + Aligned = 0x10 +}; + +template <unsigned T> +struct MemoryTraits { + //! 
Tag this class as a kokkos memory traits: + using memory_traits = MemoryTraits<T>; + enum : bool { + is_unmanaged = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))) + }; + enum : bool { + is_random_access = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))) + }; + enum : bool { is_atomic = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) }; + enum : bool { + is_restrict = (unsigned(0) != (T & unsigned(Kokkos::Restrict))) + }; + enum : bool { is_aligned = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) }; +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +using MemoryManaged = Kokkos::MemoryTraits<0>; +using MemoryUnmanaged = Kokkos::MemoryTraits<Kokkos::Unmanaged>; +using MemoryRandomAccess = + Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +static_assert((0 < int(KOKKOS_MEMORY_ALIGNMENT)) && + (0 == (int(KOKKOS_MEMORY_ALIGNMENT) & + (int(KOKKOS_MEMORY_ALIGNMENT) - 1))), + "KOKKOS_MEMORY_ALIGNMENT must be a power of two"); + +/** \brief Memory alignment settings + * + * Sets global value for memory alignment. Must be a power of two! + * Enable compatibility of views from different devices with static stride. + * Use compiler flag to enable overwrites. 
+ */ +enum : unsigned { + MEMORY_ALIGNMENT = KOKKOS_MEMORY_ALIGNMENT, + MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b9380cbe02b42a04c5b21b6cb8408016049d15f8 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -0,0 +1,578 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_NUMERIC_TRAITS_HPP +#define KOKKOS_NUMERIC_TRAITS_HPP + +#include <Kokkos_Macros.hpp> +#include <cfloat> +#include <climits> +#include <cmath> +#include <cstdint> +#include <type_traits> + +namespace Kokkos { +namespace Experimental { +namespace Impl { +// clang-format off +template <class> struct infinity_helper; +template <> struct infinity_helper<float> { static constexpr float value = HUGE_VALF; }; +template <> struct infinity_helper<double> { static constexpr double value = HUGE_VAL; }; +template <> struct infinity_helper<long double> { static constexpr long double value = HUGE_VALL; }; +template <class> struct finite_min_helper; +template <> struct finite_min_helper<bool> { static constexpr bool value = false; }; +template <> struct finite_min_helper<char> { static constexpr char value = CHAR_MIN; }; +template <> struct finite_min_helper<signed char> { static constexpr signed char value = SCHAR_MIN; }; +template <> struct finite_min_helper<unsigned char> { static constexpr unsigned char value = 0; }; +template <> struct finite_min_helper<short> { static constexpr short value = SHRT_MIN; }; +template <> struct finite_min_helper<unsigned short> { static constexpr unsigned short value = 0; }; +template <> struct finite_min_helper<int> { static constexpr int value 
= INT_MIN; }; +template <> struct finite_min_helper<unsigned int> { static constexpr unsigned int value = 0; }; +template <> struct finite_min_helper<long int> { static constexpr long int value = LONG_MIN; }; +template <> struct finite_min_helper<unsigned long int> { static constexpr unsigned long int value = 0; }; +template <> struct finite_min_helper<long long int> { static constexpr long long int value = LLONG_MIN; }; +template <> struct finite_min_helper<unsigned long long int> { static constexpr unsigned long long int value = 0; }; +template <> struct finite_min_helper<float> { static constexpr float value = -FLT_MAX; }; +template <> struct finite_min_helper<double> { static constexpr double value = -DBL_MAX; }; +template <> struct finite_min_helper<long double> { static constexpr long double value = -LDBL_MAX; }; +template <class> struct finite_max_helper; +template <> struct finite_max_helper<bool> { static constexpr bool value = true; }; +template <> struct finite_max_helper<char> { static constexpr char value = CHAR_MAX; }; +template <> struct finite_max_helper<signed char> { static constexpr signed char value = SCHAR_MAX; }; +template <> struct finite_max_helper<unsigned char> { static constexpr unsigned char value = UCHAR_MAX; }; +template <> struct finite_max_helper<short> { static constexpr short value = SHRT_MAX; }; +template <> struct finite_max_helper<unsigned short> { static constexpr unsigned short value = USHRT_MAX; }; +template <> struct finite_max_helper<int> { static constexpr int value = INT_MAX; }; +template <> struct finite_max_helper<unsigned int> { static constexpr unsigned int value = UINT_MAX; }; +template <> struct finite_max_helper<long int> { static constexpr long int value = LONG_MAX; }; +template <> struct finite_max_helper<unsigned long int> { static constexpr unsigned long int value = ULONG_MAX; }; +template <> struct finite_max_helper<long long int> { static constexpr long long int value = LLONG_MAX; }; +template <> struct 
finite_max_helper<unsigned long long int> { static constexpr unsigned long long int value = ULLONG_MAX; }; +template <> struct finite_max_helper<float> { static constexpr float value = FLT_MAX; }; +template <> struct finite_max_helper<double> { static constexpr double value = DBL_MAX; }; +template <> struct finite_max_helper<long double> { static constexpr long double value = LDBL_MAX; }; +template <class> struct epsilon_helper; +namespace{ + // FIXME workaround for LDBL_EPSILON with XL + template<typename T> + constexpr T machineeps() { + T epsilon = 1, prev = 1, expression = 1; + do { + prev = epsilon; + epsilon /= 2; + expression = 1 + epsilon; + } while (expression > 1); + return prev; + } +} +template <> struct epsilon_helper<float> { static constexpr float value = FLT_EPSILON; }; +template <> struct epsilon_helper<double> { static constexpr double value = DBL_EPSILON; }; +template <> struct epsilon_helper<long double> { +#ifdef KOKKOS_COMPILER_IBM + static constexpr long double value = machineeps<long double>(); +#else + static constexpr long double value = LDBL_EPSILON; +#endif +}; +template <class> struct round_error_helper; +template <> struct round_error_helper<float> { static constexpr float value = 0.5F; }; +template <> struct round_error_helper<double> { static constexpr double value = 0.5; }; +template <> struct round_error_helper<long double> { static constexpr long double value = 0.5L; }; +template <class> struct norm_min_helper; +template <> struct norm_min_helper<float> { static constexpr float value = FLT_MIN; }; +template <> struct norm_min_helper<double> { static constexpr double value = DBL_MIN; }; +template <> struct norm_min_helper<long double> { static constexpr long double value = LDBL_MIN; }; +template <class> struct digits_helper; +template <> struct digits_helper<bool> { static constexpr int value = 1; }; +template <> struct digits_helper<char> { static constexpr int value = CHAR_BIT - std::is_signed<char>::value; }; +template <> struct 
digits_helper<signed char> { static constexpr int value = CHAR_BIT - 1; }; +template <> struct digits_helper<unsigned char> { static constexpr int value = CHAR_BIT; }; +template <> struct digits_helper<short> { static constexpr int value = CHAR_BIT*sizeof(short)-1; }; +template <> struct digits_helper<unsigned short> { static constexpr int value = CHAR_BIT*sizeof(short); }; +template <> struct digits_helper<int> { static constexpr int value = CHAR_BIT*sizeof(int)-1; }; +template <> struct digits_helper<unsigned int> { static constexpr int value = CHAR_BIT*sizeof(int); }; +template <> struct digits_helper<long int> { static constexpr int value = CHAR_BIT*sizeof(long int)-1; }; +template <> struct digits_helper<unsigned long int> { static constexpr int value = CHAR_BIT*sizeof(long int); }; +template <> struct digits_helper<long long int> { static constexpr int value = CHAR_BIT*sizeof(long long int)-1; }; +template <> struct digits_helper<unsigned long long int> { static constexpr int value = CHAR_BIT*sizeof(long long int); }; +template <> struct digits_helper<float> { static constexpr int value = FLT_MANT_DIG; }; +template <> struct digits_helper<double> { static constexpr int value = DBL_MANT_DIG; }; +template <> struct digits_helper<long double> { static constexpr int value = LDBL_MANT_DIG; }; +template <class> struct digits10_helper; +template <> struct digits10_helper<bool> { static constexpr int value = 0; }; +constexpr double log10_2 = 2.41; +#define DIGITS10_HELPER_INTEGRAL(TYPE) \ +template <> struct digits10_helper<TYPE> { static constexpr int value = digits_helper<TYPE>::value * log10_2; }; +DIGITS10_HELPER_INTEGRAL(char) +DIGITS10_HELPER_INTEGRAL(signed char) +DIGITS10_HELPER_INTEGRAL(unsigned char) +DIGITS10_HELPER_INTEGRAL(short) +DIGITS10_HELPER_INTEGRAL(unsigned short) +DIGITS10_HELPER_INTEGRAL(int) +DIGITS10_HELPER_INTEGRAL(unsigned int) +DIGITS10_HELPER_INTEGRAL(long int) +DIGITS10_HELPER_INTEGRAL(unsigned long int) +DIGITS10_HELPER_INTEGRAL(long 
long int) +DIGITS10_HELPER_INTEGRAL(unsigned long long int) +#undef DIGITS10_HELPER_INTEGRAL +template <> struct digits10_helper<float> { static constexpr int value = FLT_DIG; }; +template <> struct digits10_helper<double> { static constexpr int value = DBL_DIG; }; +template <> struct digits10_helper<long double> { static constexpr int value = LDBL_DIG; }; +template <class> struct max_digits10_helper; +// FIXME not sure why these were not defined in my <cfloat> +//template <> struct max_digits10_helper<float> { static constexpr int value = FLT_DECIMAL_DIG; }; +//template <> struct max_digits10_helper<double> { static constexpr int value = DBL_DECIMAL_DIG; }; +//template <> struct max_digits10_helper<long double> { static constexpr int value = LDBL_DECIMAL_DIG; }; +template <> struct max_digits10_helper<float> { static constexpr int value = 9; }; +template <> struct max_digits10_helper<double> { static constexpr int value = 17; }; +template <> struct max_digits10_helper<long double> { static constexpr int value = 21; }; +template <class> struct radix_helper; +template <> struct radix_helper<bool> { static constexpr int value = 2; }; +template <> struct radix_helper<char> { static constexpr int value = 2; }; +template <> struct radix_helper<signed char> { static constexpr int value = 2; }; +template <> struct radix_helper<unsigned char> { static constexpr int value = 2; }; +template <> struct radix_helper<short> { static constexpr int value = 2; }; +template <> struct radix_helper<unsigned short> { static constexpr int value = 2; }; +template <> struct radix_helper<int> { static constexpr int value = 2; }; +template <> struct radix_helper<unsigned int> { static constexpr int value = 2; }; +template <> struct radix_helper<long int> { static constexpr int value = 2; }; +template <> struct radix_helper<unsigned long int> { static constexpr int value = 2; }; +template <> struct radix_helper<long long int> { static constexpr int value = 2; }; +template <> struct 
radix_helper<unsigned long long int> { static constexpr int value = 2; }; +template <> struct radix_helper<float> { static constexpr int value = FLT_RADIX; }; +template <> struct radix_helper<double> { static constexpr int value = FLT_RADIX; }; +template <> struct radix_helper<long double> { static constexpr int value = FLT_RADIX; }; +template <class> struct min_exponent_helper; +template <> struct min_exponent_helper<float> { static constexpr int value = FLT_MIN_EXP; }; +template <> struct min_exponent_helper<double> { static constexpr int value = DBL_MIN_EXP; }; +template <> struct min_exponent_helper<long double> { static constexpr int value = LDBL_MIN_EXP; }; +template <class> struct min_exponent10_helper; +template <> struct min_exponent10_helper<float> { static constexpr int value = FLT_MIN_10_EXP; }; +template <> struct min_exponent10_helper<double> { static constexpr int value = DBL_MIN_10_EXP; }; +template <> struct min_exponent10_helper<long double> { static constexpr int value = LDBL_MIN_10_EXP; }; +template <class> struct max_exponent_helper; +template <> struct max_exponent_helper<float> { static constexpr int value = FLT_MAX_EXP; }; +template <> struct max_exponent_helper<double> { static constexpr int value = DBL_MAX_EXP; }; +template <> struct max_exponent_helper<long double> { static constexpr int value = LDBL_MAX_EXP; }; +template <class> struct max_exponent10_helper; +template <> struct max_exponent10_helper<float> { static constexpr int value = FLT_MAX_10_EXP; }; +template <> struct max_exponent10_helper<double> { static constexpr int value = DBL_MAX_10_EXP; }; +template <> struct max_exponent10_helper<long double> { static constexpr int value = LDBL_MAX_10_EXP; }; +// clang-format on +} // namespace Impl + +#if defined(KOKKOS_ENABLE_CXX17) +#define KOKKOS_IMPL_DEFINE_TRAIT(TRAIT) \ + template <class T> \ + struct TRAIT : Impl::TRAIT##_helper<T> {}; \ + template <class T> \ + inline constexpr auto TRAIT##_v = TRAIT<T>::value; +#else +#define 
KOKKOS_IMPL_DEFINE_TRAIT(TRAIT) \ + template <class T> \ + struct TRAIT : Impl::TRAIT##_helper<T> {}; +#endif + +// Numeric distinguished value traits +KOKKOS_IMPL_DEFINE_TRAIT(infinity) +KOKKOS_IMPL_DEFINE_TRAIT(finite_min) +KOKKOS_IMPL_DEFINE_TRAIT(finite_max) +KOKKOS_IMPL_DEFINE_TRAIT(epsilon) +KOKKOS_IMPL_DEFINE_TRAIT(round_error) +KOKKOS_IMPL_DEFINE_TRAIT(norm_min) + +// Numeric characteristics traits +KOKKOS_IMPL_DEFINE_TRAIT(digits) +KOKKOS_IMPL_DEFINE_TRAIT(digits10) +KOKKOS_IMPL_DEFINE_TRAIT(max_digits10) +KOKKOS_IMPL_DEFINE_TRAIT(radix) +KOKKOS_IMPL_DEFINE_TRAIT(min_exponent) +KOKKOS_IMPL_DEFINE_TRAIT(min_exponent10) +KOKKOS_IMPL_DEFINE_TRAIT(max_exponent) +KOKKOS_IMPL_DEFINE_TRAIT(max_exponent10) + +#undef KOKKOS_IMPL_DEFINE_TRAIT + +} // namespace Experimental + +template <class T> +struct reduction_identity; /*{ + KOKKOS_FORCEINLINE_FUNCTION constexpr static T sum() { return T(); } // 0 + KOKKOS_FORCEINLINE_FUNCTION constexpr static T prod() // 1 + { static_assert( false, "Missing specialization of +Kokkos::reduction_identity for custom prod reduction type"); return T(); } + KOKKOS_FORCEINLINE_FUNCTION constexpr static T max() // minimum value + { static_assert( false, "Missing specialization of +Kokkos::reduction_identity for custom max reduction type"); return T(); } + KOKKOS_FORCEINLINE_FUNCTION constexpr static T min() // maximum value + { static_assert( false, "Missing specialization of +Kokkos::reduction_identity for custom min reduction type"); return T(); } + KOKKOS_FORCEINLINE_FUNCTION constexpr static T bor() // 0, only for integer +type { static_assert( false, "Missing specialization of +Kokkos::reduction_identity for custom bor reduction type"); return T(); } + KOKKOS_FORCEINLINE_FUNCTION constexpr static T band() // !0, only for integer +type { static_assert( false, "Missing specialization of +Kokkos::reduction_identity for custom band reduction type"); return T(); } + KOKKOS_FORCEINLINE_FUNCTION constexpr static T lor() // 0, only for 
integer +type { static_assert( false, "Missing specialization of +Kokkos::reduction_identity for custom lor reduction type"); return T(); } + KOKKOS_FORCEINLINE_FUNCTION constexpr static T land() // !0, only for integer +type { static_assert( false, "Missing specialization of +Kokkos::reduction_identity for custom land reduction type"); return T(); } +};*/ + +template <> +struct reduction_identity<signed char> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char sum() { + return static_cast<signed char>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char prod() { + return static_cast<signed char>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char max() { + return SCHAR_MIN; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char min() { + return SCHAR_MAX; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char bor() { + return static_cast<signed char>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char band() { + return ~static_cast<signed char>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char lor() { + return static_cast<signed char>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char land() { + return static_cast<signed char>(1); + } +}; + +template <> +struct reduction_identity<short> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static short sum() { + return static_cast<short>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static short prod() { + return static_cast<short>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static short max() { return SHRT_MIN; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static short min() { return SHRT_MAX; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static short bor() { + return static_cast<short>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static short band() { + return ~static_cast<short>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static short lor() { + return static_cast<short>(0); + } + KOKKOS_FORCEINLINE_FUNCTION 
constexpr static short land() { + return static_cast<short>(1); + } +}; + +template <> +struct reduction_identity<int> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static int sum() { + return static_cast<int>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static int prod() { + return static_cast<int>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static int max() { return INT_MIN; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static int min() { return INT_MAX; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static int bor() { + return static_cast<int>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static int band() { + return ~static_cast<int>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static int lor() { + return static_cast<int>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static int land() { + return static_cast<int>(1); + } +}; + +template <> +struct reduction_identity<long> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static long sum() { + return static_cast<long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long prod() { + return static_cast<long>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long max() { return LONG_MIN; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long min() { return LONG_MAX; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long bor() { + return static_cast<long>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long band() { + return ~static_cast<long>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long lor() { + return static_cast<long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long land() { + return static_cast<long>(1); + } +}; + +template <> +struct reduction_identity<long long> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static long long sum() { + return static_cast<long long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long long prod() { + return static_cast<long long>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long long max() { + return LLONG_MIN; + } + 
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long min() { + return LLONG_MAX; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long long bor() { + return static_cast<long long>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long long band() { + return ~static_cast<long long>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long long lor() { + return static_cast<long long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long long land() { + return static_cast<long long>(1); + } +}; + +template <> +struct reduction_identity<unsigned char> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char sum() { + return static_cast<unsigned char>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char prod() { + return static_cast<unsigned char>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char max() { + return static_cast<unsigned char>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char min() { + return UCHAR_MAX; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char bor() { + return static_cast<unsigned char>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char band() { + return ~static_cast<unsigned char>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char lor() { + return static_cast<unsigned char>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char land() { + return static_cast<unsigned char>(1); + } +}; + +template <> +struct reduction_identity<unsigned short> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short sum() { + return static_cast<unsigned short>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short prod() { + return static_cast<unsigned short>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short max() { + return static_cast<unsigned short>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short min() { + return USHRT_MAX; + } + 
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short bor() { + return static_cast<unsigned short>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short band() { + return ~static_cast<unsigned short>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short lor() { + return static_cast<unsigned short>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short land() { + return static_cast<unsigned short>(1); + } +}; + +template <> +struct reduction_identity<unsigned int> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int sum() { + return static_cast<unsigned int>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int prod() { + return static_cast<unsigned int>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int max() { + return static_cast<unsigned int>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int min() { + return UINT_MAX; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int bor() { + return static_cast<unsigned int>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int band() { + return ~static_cast<unsigned int>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int lor() { + return static_cast<unsigned int>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int land() { + return static_cast<unsigned int>(1); + } +}; + +template <> +struct reduction_identity<unsigned long> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long sum() { + return static_cast<unsigned long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long prod() { + return static_cast<unsigned long>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long max() { + return static_cast<unsigned long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long min() { + return ULONG_MAX; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long bor() { + return static_cast<unsigned 
long>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long band() { + return ~static_cast<unsigned long>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long lor() { + return static_cast<unsigned long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long land() { + return static_cast<unsigned long>(1); + } +}; + +template <> +struct reduction_identity<unsigned long long> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long sum() { + return static_cast<unsigned long long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long prod() { + return static_cast<unsigned long long>(1); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long max() { + return static_cast<unsigned long long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long min() { + return ULLONG_MAX; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long bor() { + return static_cast<unsigned long long>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long band() { + return ~static_cast<unsigned long long>(0x0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long lor() { + return static_cast<unsigned long long>(0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long land() { + return static_cast<unsigned long long>(1); + } +}; + +template <> +struct reduction_identity<float> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() { + return static_cast<float>(0.0f); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() { + return static_cast<float>(1.0f); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() { return -FLT_MAX; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() { return FLT_MAX; } +}; + +template <> +struct reduction_identity<double> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() { + return static_cast<double>(0.0); + } + KOKKOS_FORCEINLINE_FUNCTION 
constexpr static double prod() { + return static_cast<double>(1.0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() { return -DBL_MAX; } + KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() { return DBL_MAX; } +}; + +#if !defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) && \ + !defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU) +template <> +struct reduction_identity<long double> { + KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() { + return static_cast<long double>(0.0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() { + return static_cast<long double>(1.0); + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() { + return -LDBL_MAX; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() { + return LDBL_MAX; + } +}; +#endif + +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/Kokkos_OpenMP.hpp new file mode 100644 index 0000000000000000000000000000000000000000..eedba38a8456117ac03d8c21e657729673017984 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_OpenMP.hpp @@ -0,0 +1,226 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_HPP +#define KOKKOS_OPENMP_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_OPENMP) + +#include <Kokkos_Core_fwd.hpp> + +#include <cstddef> +#include <iosfwd> +#include <Kokkos_HostSpace.hpp> + +#ifdef KOKKOS_ENABLE_HBWSPACE +#include <Kokkos_HBWSpace.hpp> +#endif + +#include <Kokkos_ScratchSpace.hpp> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_TaskScheduler.hpp> +#include <Kokkos_Layout.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_Profiling_Interface.hpp> +#include <impl/Kokkos_ExecSpaceInitializer.hpp> + +#include <vector> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +namespace Impl { +class OpenMPExec; +} + +/// \class OpenMP +/// \brief Kokkos device for multicore processors in the host memory space. +class OpenMP { + public: + //! Tag this class as a kokkos execution space + using execution_space = OpenMP; + + using memory_space = +#ifdef KOKKOS_ENABLE_HBWSPACE + Experimental::HBWSpace; +#else + HostSpace; +#endif + + //! This execution space preferred device_type + using device_type = Kokkos::Device<execution_space, memory_space>; + using array_layout = LayoutRight; + using size_type = memory_space::size_type; + using scratch_memory_space = ScratchMemorySpace<OpenMP>; + + /// \brief Print configuration information to the given output stream. 
+ static void print_configuration(std::ostream&, const bool verbose = false); + + /// \brief is the instance running a parallel algorithm + inline static bool in_parallel(OpenMP const& = OpenMP()) noexcept; + + /// \brief Wait until all dispatched functors complete on the given instance + /// + /// This is a no-op on OpenMP + static void impl_static_fence(OpenMP const& = OpenMP()) noexcept; + + void fence() const; + + /// \brief Does the given instance return immediately after launching + /// a parallel algorithm + /// + /// This always returns false on OpenMP + inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; + + /// \brief Partition the default instance into new instances without creating + /// new masters + /// + /// This is a no-op on OpenMP since the default instance cannot be partitioned + /// without promoting other threads to 'master' + static std::vector<OpenMP> partition(...); + + /// Non-default instances should be ref-counted so that when the last + /// is destroyed the instance resources are released + /// + /// This is a no-op on OpenMP since a non default instance cannot be created + static OpenMP create_instance(...); + + /// \brief Partition the default instance and call 'f' on each new 'master' + /// thread + /// + /// Func is a functor with the following signature + /// void( int partition_id, int num_partitions ) + template <typename F> + static void partition_master(F const& f, int requested_num_partitions = 0, + int requested_partition_size = 0); + + // use UniqueToken + static int concurrency(); + + static void impl_initialize(int thread_count = -1); + + /// \brief is the default execution space initialized for current 'master' + /// thread + static bool impl_is_initialized() noexcept; + + /// \brief Free any resources being consumed by the default execution space + static void impl_finalize(); + + inline static int impl_thread_pool_size() noexcept; + + /** \brief The rank of the executing thread in this thread pool */ + 
KOKKOS_INLINE_FUNCTION + static int impl_thread_pool_rank() noexcept; + + inline static int impl_thread_pool_size(int depth); + + // use UniqueToken + inline static int impl_max_hardware_threads() noexcept; + + // use UniqueToken + KOKKOS_INLINE_FUNCTION + static int impl_hardware_thread_id() noexcept; + + static int impl_get_current_max_threads() noexcept; + + static constexpr const char* name() noexcept { return "OpenMP"; } + uint32_t impl_instance_id() const noexcept { return 0; } +}; + +namespace Tools { +namespace Experimental { +template <> +struct DeviceTypeTraits<OpenMP> { + static constexpr DeviceType id = DeviceType::OpenMP; +}; +} // namespace Experimental +} // namespace Tools + +namespace Impl { + +class OpenMPSpaceInitializer : public ExecSpaceInitializerBase { + public: + OpenMPSpaceInitializer() = default; + ~OpenMPSpaceInitializer() = default; + void initialize(const InitArguments& args) final; + void finalize(const bool) final; + void fence() final; + void print_configuration(std::ostream& msg, const bool detail) final; +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template <> +struct MemorySpaceAccess<Kokkos::OpenMP::memory_space, + Kokkos::OpenMP::scratch_memory_space> { + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = false }; +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +#include <OpenMP/Kokkos_OpenMP_Exec.hpp> +#include <OpenMP/Kokkos_OpenMP_Team.hpp> +#include <OpenMP/Kokkos_OpenMP_Parallel.hpp> +#include <OpenMP/Kokkos_OpenMP_Task.hpp> + +#include <KokkosExp_MDRangePolicy.hpp> 
+/*--------------------------------------------------------------------------*/ + +#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */ +#endif /* #ifndef KOKKOS_OPENMP_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2a57a43e63b77b7f60e4cc40bb20272e0332944a --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp @@ -0,0 +1,161 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGET_HPP +#define KOKKOS_OPENMPTARGET_HPP + +#include <Kokkos_Core_fwd.hpp> + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) + +#include <omp.h> + +#include <cstddef> +#include <iosfwd> +#include <Kokkos_OpenMPTargetSpace.hpp> +#include <Kokkos_ScratchSpace.hpp> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_TaskPolicy.hpp> +#include <Kokkos_Layout.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_Profiling_Interface.hpp> +#include <KokkosExp_MDRangePolicy.hpp> +#include <impl/Kokkos_ExecSpaceInitializer.hpp> +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Experimental { +namespace Impl { +class OpenMPTargetInternal; +} + +/// \class OpenMPTarget +/// \brief Kokkos device for multicore processors in the host memory space. +class OpenMPTarget { + public: + //------------------------------------ + //! \name Type declarations that all Kokkos devices must provide. + //@{ + + //! Tag this class as a kokkos execution space + using execution_space = OpenMPTarget; + using memory_space = OpenMPTargetSpace; + //! 
This execution space preferred device_type + using device_type = Kokkos::Device<execution_space, memory_space>; + + using array_layout = LayoutLeft; + using size_type = memory_space::size_type; + + using scratch_memory_space = ScratchMemorySpace<OpenMPTarget>; + + inline static bool in_parallel() { return omp_in_parallel(); } + + static void fence(); + + /** \brief Return the maximum amount of concurrency. */ + static int concurrency(); + + //! Print configuration information to the given output stream. + void print_configuration(std::ostream&, const bool detail = false); + + static const char* name(); + + //! Free any resources being consumed by the device. + void impl_finalize(); + + //! Has been initialized + static int impl_is_initialized(); + + //! Initialize, telling the CUDA run-time library which device to use. + void impl_initialize(); + + inline Impl::OpenMPTargetInternal* impl_internal_space_instance() const { + return m_space_instance; + } + + OpenMPTarget(); + uint32_t impl_instance_id() const noexcept { return 0; } + + private: + Impl::OpenMPTargetInternal* m_space_instance; +}; +} // namespace Experimental + +namespace Tools { +namespace Experimental { +template <> +struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { + static constexpr DeviceType id = + ::Kokkos::Profiling::Experimental::DeviceType::OpenMPTarget; +}; +} // namespace Experimental +} // namespace Tools + +namespace Impl { + +class OpenMPTargetSpaceInitializer : public ExecSpaceInitializerBase { + public: + OpenMPTargetSpaceInitializer() = default; + ~OpenMPTargetSpaceInitializer() = default; + void initialize(const InitArguments& args) final; + void finalize(const bool) final; + void fence() final; + void print_configuration(std::ostream& msg, const bool detail) final; +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ 
+/*--------------------------------------------------------------------------*/ + +#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp> + +/*--------------------------------------------------------------------------*/ + +#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP ) */ +#endif /* #ifndef KOKKOS_OPENMPTARGET_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..dc5e0194ab0a8bb85a29727c664a33b6c23e2c6c --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp @@ -0,0 +1,270 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGETSPACE_HPP +#define KOKKOS_OPENMPTARGETSPACE_HPP + +#include <cstring> +#include <string> +#include <iosfwd> +#include <typeinfo> + +#include <Kokkos_Core_fwd.hpp> + +#ifdef KOKKOS_ENABLE_OPENMPTARGET + +#include <OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp> +#include <Kokkos_HostSpace.hpp> +#include <omp.h> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +/// \brief Initialize lock array for arbitrary size atomics. +/// +/// Arbitrary atomics are implemented using a hash table of locks +/// where the hash value is derived from the address of the +/// object for which an atomic operation is performed. +/// This function initializes the locks to zero (unset). +// void init_lock_array_host_space(); + +/// \brief Acquire a lock for the address +/// +/// This function tries to acquire the lock for the hash value derived +/// from the provided ptr. If the lock is successfully acquired the +/// function returns true. Otherwise it returns false. 
+// bool lock_address_host_space(void* ptr); + +/// \brief Release lock for the address +/// +/// This function releases the lock for the hash value derived +/// from the provided ptr. This function should only be called +/// after previously successfully acquiring a lock with +/// lock_address. +// void unlock_address_host_space(void* ptr); + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Experimental { + +/// \class OpenMPTargetSpace +/// \brief Memory management for host memory. +/// +/// OpenMPTargetSpace is a memory space that governs host memory. "Host" +/// memory means the usual CPU-accessible memory. +class OpenMPTargetSpace { + public: + //! Tag this class as a kokkos memory space + using memory_space = OpenMPTargetSpace; + using size_type = size_t; + + /// \typedef execution_space + /// \brief Default execution space for this memory space. + /// + /// Every memory space has a default execution space. This is + /// useful for things like initializing a View (which happens in + /// parallel using the View's default execution space). + using execution_space = Kokkos::Experimental::OpenMPTarget; + + //! 
This memory space preferred device_type + using device_type = Kokkos::Device<execution_space, memory_space>; + + /*--------------------------------*/ + + /**\brief Default memory space instance */ + OpenMPTargetSpace(); + OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; + OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; + OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; + OpenMPTargetSpace& operator=(const OpenMPTargetSpace&) = default; + ~OpenMPTargetSpace() = default; + + /**\brief Allocate untracked memory in the space */ + void* allocate(const size_t arg_alloc_size) const; + + /**\brief Deallocate untracked memory in the space */ + void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; + + static constexpr const char* name() { return "OpenMPTargetSpace"; } + + private: + friend class Kokkos::Impl::SharedAllocationRecord< + Kokkos::Experimental::OpenMPTargetSpace, void>; +}; +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <> +class SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void> + : public HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::OpenMPTargetSpace> { + private: + friend class HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::OpenMPTargetSpace>; + friend class SharedAllocationRecordCommon< + Kokkos::Experimental::OpenMPTargetSpace>; + friend Kokkos::Experimental::OpenMPTargetSpace; + + using base_t = HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::OpenMPTargetSpace>; + using RecordBase = SharedAllocationRecord<void, void>; + + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + + /**\brief Root record for tracked 
allocations from this OpenMPTargetSpace + * instance */ + static RecordBase s_root_record; + + const Kokkos::Experimental::OpenMPTargetSpace m_space; + + protected: + ~SharedAllocationRecord(); + SharedAllocationRecord() = default; + + SharedAllocationRecord( + const Kokkos::Experimental::OpenMPTargetSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size, + const RecordBase::function_type arg_dealloc = &deallocate); + + public: + std::string get_label() const; + + KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( + const Kokkos::Experimental::OpenMPTargetSpace& arg_space, + const std::string& arg_label, const size_t arg_alloc_size) { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); +#else + return nullptr; +#endif + } + + /**\brief Reallocate tracked memory in the space */ + static void* reallocate_tracked(void* const arg_alloc_ptr, + const size_t arg_alloc_size); +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// TODO: implement all possible deep_copies +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, + Kokkos::Experimental::OpenMPTargetSpace, ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + // In the Release and RelWithDebInfo builds, the size of the memcpy should + // be greater than zero to avoid error. omp_target_memcpy returns zero on + // success. 
+ if (n > 0) + OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0, + omp_get_default_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence(); + if (n > 0) + OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0, + omp_get_default_device(), + omp_get_default_device())); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, HostSpace, + ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0, + omp_get_default_device(), + omp_get_initial_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence(); + if (n > 0) + OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0, + omp_get_default_device(), + omp_get_initial_device())); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace, + ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0, + omp_get_initial_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence(); + if (n > 0) + OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0, + omp_get_initial_device(), + omp_get_default_device())); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif +#endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Pair.hpp b/packages/kokkos/core/src/Kokkos_Pair.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d7512eb08616c243128e24cc7f38e5418bb54049 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Pair.hpp @@ -0,0 +1,507 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER

/// \file Kokkos_Pair.hpp
/// \brief Declaration and definition of Kokkos::pair.
///
/// This header file declares and defines Kokkos::pair and its related
/// nonmember functions.

#ifndef KOKKOS_PAIR_HPP
#define KOKKOS_PAIR_HPP

#include <Kokkos_Macros.hpp>
#include <utility>

namespace Kokkos {
/// \struct pair
/// \brief Replacement for std::pair that works on CUDA devices.
///
/// The instance methods of std::pair, including its constructors, are
/// not marked as <tt>__device__</tt> functions. Thus, they cannot be
/// called on a CUDA device, such as an NVIDIA GPU.  This struct
/// implements the same interface as std::pair, but can be used on a
/// CUDA device as well as on the host.
template <class T1, class T2>
struct pair {
  //! The first template parameter of this class.
  using first_type = T1;
  //! The second template parameter of this class.
  using second_type = T2;

  //! The first element of the pair.
  first_type first;
  //! The second element of the pair.
  second_type second;

  /// \brief Default constructor.
  ///
  /// This calls the default constructors of T1 and T2.  It won't
  /// compile if those default constructors are not defined and
  /// public.
  KOKKOS_DEFAULTED_FUNCTION constexpr pair() = default;

  /// \brief Constructor that takes both elements of the pair.
  ///
  /// This calls the copy constructors of T1 and T2.  It won't compile
  /// if those copy constructors are not defined and public.
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(first_type const& f,
                                             second_type const& s)
      : first(f), second(s) {}

  /// \brief Copy constructor.
  ///
  /// This calls the copy constructors of T1 and T2.  It won't compile
  /// if those copy constructors are not defined and public.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const pair<U, V>& p)
      : first(p.first), second(p.second) {}

  /// \brief Copy constructor (from a volatile pair).
  ///
  /// This calls the copy constructors of T1 and T2.  It won't compile
  /// if those copy constructors are not defined and public.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const volatile pair<U, V>& p)
      : first(p.first), second(p.second) {}

  /// \brief Assignment operator.
  ///
  /// This calls the assignment operators of T1 and T2.  It won't
  /// compile if the assignment operators are not defined and public.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION pair<T1, T2>& operator=(const pair<U, V>& p) {
    first = p.first;
    second = p.second;
    return *this;
  }

  /// \brief Assignment operator, for volatile <tt>*this</tt>.
  ///
  /// \param p [in] Input; right-hand side of the assignment.
  ///
  /// This calls the assignment operators of T1 and T2.  It will not
  /// compile if the assignment operators are not defined and public.
  ///
  /// This operator returns \c void instead of <tt>volatile pair<T1,
  /// T2>& </tt>.  See Kokkos Issue #177 for the explanation.  In
  /// practice, this means that you should not chain assignments with
  /// volatile lvalues.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION void operator=(
      const volatile pair<U, V>& p) volatile {
    first = p.first;
    second = p.second;
    // We deliberately do not return anything here.  See explanation
    // in public documentation above.
  }

  // from std::pair<U,V>; implicit and host-only (no KOKKOS_*_FUNCTION
  // annotation, so not callable on a device)
  template <class U, class V>
  pair(const std::pair<U, V>& p) : first(p.first), second(p.second) {}

  /// \brief Return the std::pair version of this object.
  ///
  /// This is <i>not</i> a device function; you may not call it on a
  /// CUDA device.  It is meant to be called on the host, if the user
  /// wants an std::pair instead of a Kokkos::pair.
  ///
  /// \note This is not a conversion operator, since defining a
  ///   conversion operator made the relational operators have
  ///   ambiguous definitions.
  std::pair<T1, T2> to_std_pair() const {
    return std::make_pair(first, second);
  }
};

// Partial specialization holding references to both elements; this is what
// Kokkos::tie() returns.
template <class T1, class T2>
struct pair<T1&, T2&> {
  //! The first template parameter of this class.
  using first_type = T1&;
  //! The second template parameter of this class.
  using second_type = T2&;

  //! The first element of the pair.
  first_type first;
  //! The second element of the pair.
  second_type second;

  /// \brief Constructor that takes both elements of the pair.
  ///
  /// This calls the copy constructors of T1 and T2.  It won't compile
  /// if those copy constructors are not defined and public.
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(first_type f, second_type s)
      : first(f), second(s) {}

  /// \brief Copy constructor.
  ///
  /// This calls the copy constructors of T1 and T2.  It won't compile
  /// if those copy constructors are not defined and public.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const pair<U, V>& p)
      : first(p.first), second(p.second) {}

  // from std::pair<U,V>; implicit and host-only
  template <class U, class V>
  pair(const std::pair<U, V>& p) : first(p.first), second(p.second) {}

  /// \brief Assignment operator.
  ///
  /// This calls the assignment operators of T1 and T2.  It won't
  /// compile if the assignment operators are not defined and public.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION pair<first_type, second_type>& operator=(
      const pair<U, V>& p) {
    first = p.first;
    second = p.second;
    return *this;
  }

  /// \brief Return the std::pair version of this object.
  ///
  /// This is <i>not</i> a device function; you may not call it on a
  /// CUDA device.  It is meant to be called on the host, if the user
  /// wants an std::pair instead of a Kokkos::pair.
  ///
  /// \note This is not a conversion operator, since defining a
  ///   conversion operator made the relational operators have
  ///   ambiguous definitions.
  std::pair<T1, T2> to_std_pair() const {
    return std::make_pair(first, second);
  }
};

// Partial specialization: value first element, reference second element.
template <class T1, class T2>
struct pair<T1, T2&> {
  //! The first template parameter of this class.
  using first_type = T1;
  //! The second template parameter of this class.
  using second_type = T2&;

  //! The first element of the pair.
  first_type first;
  //! The second element of the pair.
  second_type second;

  /// \brief Constructor that takes both elements of the pair.
  ///
  /// This calls the copy constructors of T1 and T2.  It won't compile
  /// if those copy constructors are not defined and public.
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(first_type const& f, second_type s)
      : first(f), second(s) {}

  /// \brief Copy constructor.
  ///
  /// This calls the copy constructors of T1 and T2.  It won't compile
  /// if those copy constructors are not defined and public.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const pair<U, V>& p)
      : first(p.first), second(p.second) {}

  // from std::pair<U,V>; implicit and host-only
  template <class U, class V>
  pair(const std::pair<U, V>& p) : first(p.first), second(p.second) {}

  /// \brief Assignment operator.
  ///
  /// This calls the assignment operators of T1 and T2.  It won't
  /// compile if the assignment operators are not defined and public.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION pair<first_type, second_type>& operator=(
      const pair<U, V>& p) {
    first = p.first;
    second = p.second;
    return *this;
  }

  /// \brief Return the std::pair version of this object.
  ///
  /// This is <i>not</i> a device function; you may not call it on a
  /// CUDA device.  It is meant to be called on the host, if the user
  /// wants an std::pair instead of a Kokkos::pair.
  ///
  /// \note This is not a conversion operator, since defining a
  ///   conversion operator made the relational operators have
  ///   ambiguous definitions.
  std::pair<T1, T2> to_std_pair() const {
    return std::make_pair(first, second);
  }
};

// Partial specialization: reference first element, value second element.
template <class T1, class T2>
struct pair<T1&, T2> {
  //! The first template parameter of this class.
  using first_type = T1&;
  //! The second template parameter of this class.
  using second_type = T2;

  //! The first element of the pair.
  first_type first;
  //! The second element of the pair.
  second_type second;

  /// \brief Constructor that takes both elements of the pair.
  ///
  /// This calls the copy constructors of T1 and T2.  It won't compile
  /// if those copy constructors are not defined and public.
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(first_type f, second_type const& s)
      : first(f), second(s) {}

  /// \brief Copy constructor.
  ///
  /// This calls the copy constructors of T1 and T2.  It won't compile
  /// if those copy constructors are not defined and public.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const pair<U, V>& p)
      : first(p.first), second(p.second) {}

  // from std::pair<U,V>; implicit and host-only
  template <class U, class V>
  pair(const std::pair<U, V>& p) : first(p.first), second(p.second) {}

  /// \brief Assignment operator.
  ///
  /// This calls the assignment operators of T1 and T2.  It won't
  /// compile if the assignment operators are not defined and public.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION pair<first_type, second_type>& operator=(
      const pair<U, V>& p) {
    first = p.first;
    second = p.second;
    return *this;
  }

  /// \brief Return the std::pair version of this object.
  ///
  /// This is <i>not</i> a device function; you may not call it on a
  /// CUDA device.  It is meant to be called on the host, if the user
  /// wants an std::pair instead of a Kokkos::pair.
+ /// + /// \note This is not a conversion operator, since defining a + /// conversion operator made the relational operators have + /// ambiguous definitions. + std::pair<T1, T2> to_std_pair() const { + return std::make_pair(first, second); + } +}; + +//! Equality operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION bool operator==(const pair<T1, T2>& lhs, + const pair<T1, T2>& rhs) { + return lhs.first == rhs.first && lhs.second == rhs.second; +} + +//! Inequality operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=(const pair<T1, T2>& lhs, + const pair<T1, T2>& rhs) { + return !(lhs == rhs); +} + +//! Less-than operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<(const pair<T1, T2>& lhs, + const pair<T1, T2>& rhs) { + return lhs.first < rhs.first || + (!(rhs.first < lhs.first) && lhs.second < rhs.second); +} + +//! Less-than-or-equal-to operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=(const pair<T1, T2>& lhs, + const pair<T1, T2>& rhs) { + return !(rhs < lhs); +} + +//! Greater-than operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>(const pair<T1, T2>& lhs, + const pair<T1, T2>& rhs) { + return rhs < lhs; +} + +//! Greater-than-or-equal-to operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=(const pair<T1, T2>& lhs, + const pair<T1, T2>& rhs) { + return !(lhs < rhs); +} + +/// \brief Return a new pair. +/// +/// This is a "nonmember constructor" for Kokkos::pair. It works just +/// like std::make_pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION constexpr pair<T1, T2> make_pair(T1 x, T2 y) { + return (pair<T1, T2>(x, y)); +} + +/// \brief Return a pair of references to the input arguments. 
///
/// This compares to std::tie (new in C++11).  You can use it to
/// assign to two variables at once, from the result of a function
/// that returns a pair.  For example (<tt>__device__</tt> and
/// <tt>__host__</tt> attributes omitted for brevity):
/// \code
/// // Declaration of the function to call.
/// // First return value: operation count.
/// // Second return value: whether all operations succeeded.
/// Kokkos::pair<int, bool> someFunction ();
///
/// // Code that uses Kokkos::tie.
/// int myFunction () {
///   int count = 0;
///   bool success = false;
///
///   // This assigns to both count and success.
///   Kokkos::tie (count, success) = someFunction ();
///
///   if (! success) {
///     // ... Some operation failed;
///     //     take corrective action ...
///   }
///   return count;
/// }
/// \endcode
///
/// The line that uses tie() could have been written like this:
/// \code
/// Kokkos::pair<int, bool> result = someFunction ();
/// count = result.first;
/// success = result.second;
/// \endcode
///
/// Using tie() saves two lines of code and avoids a copy of each
/// element of the pair.  The latter could be significant if one or
/// both elements of the pair are more substantial objects than \c int
/// or \c bool.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION pair<T1&, T2&> tie(T1& x, T2& y) {
  return (pair<T1&, T2&>(x, y));
}

//
// Specialization of Kokkos::pair for a \c void second argument.  This
// is not actually a "pair"; it only contains one element, the first.
//
template <class T1>
struct pair<T1, void> {
  using first_type = T1;
  using second_type = void;

  first_type first;
  // There is no second element; this enum keeps the name 'second' valid
  // (always 0) for generic code.
  enum { second = 0 };

  KOKKOS_DEFAULTED_FUNCTION constexpr pair() = default;

  // Implicit conversion from a bare first element.
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const first_type& f) : first(f) {}

  // Two-argument form; the dummy int mirrors the two-element interface.
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const first_type& f, int)
      : first(f) {}

  template <class U>
  KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const pair<U, void>& p)
      : first(p.first) {}

  template <class U>
  KOKKOS_FORCEINLINE_FUNCTION pair<T1, void>& operator=(
      const pair<U, void>& p) {
    first = p.first;
    return *this;
  }
};

//
// Specialization of relational operators for Kokkos::pair<T1,void>.
// Only the first element participates in each comparison.
//

template <class T1>
KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==(
    const pair<T1, void>& lhs, const pair<T1, void>& rhs) {
  return lhs.first == rhs.first;
}

template <class T1>
KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=(
    const pair<T1, void>& lhs, const pair<T1, void>& rhs) {
  return !(lhs == rhs);
}

template <class T1>
KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<(
    const pair<T1, void>& lhs, const pair<T1, void>& rhs) {
  return lhs.first < rhs.first;
}

template <class T1>
KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=(
    const pair<T1, void>& lhs, const pair<T1, void>& rhs) {
  return !(rhs < lhs);
}

template <class T1>
KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>(
    const pair<T1, void>& lhs, const pair<T1, void>& rhs) {
  return rhs < lhs;
}

template <class T1>
KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=(
    const pair<T1, void>& lhs, const pair<T1, void>& rhs) {
  return !(lhs < rhs);
}

namespace Impl {

// Trait: true iff T is a Kokkos::pair or a std::pair.
template <class T>
struct is_pair_like : std::false_type {};
template <class T, class U>
struct is_pair_like<Kokkos::pair<T, U>> : std::true_type {};
template <class T, class U>
struct is_pair_like<std::pair<T, U>> : std::true_type {};

}  // end
namespace Impl + +} // namespace Kokkos + +#endif // KOKKOS_PAIR_HPP diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..85d1dad454ba64aa1311cf19437206768018571b --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp @@ -0,0 +1,562 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Parallel.hpp +/// \brief Declaration of parallel operators + +#ifndef KOKKOS_PARALLEL_HPP +#define KOKKOS_PARALLEL_HPP + +#include <cstddef> +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_View.hpp> +#include <Kokkos_ExecPolicy.hpp> + +#include <impl/Kokkos_Tools.hpp> +#include <type_traits> +#include <typeinfo> + +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_FunctorAnalysis.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +#ifdef KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES +#include <iostream> +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class T, class = void> +struct is_detected_execution_space : std::false_type { + using type = not_a_type; +}; + +template <class T> +struct is_detected_execution_space<T, void_t<typename T::execution_space>> + : std::true_type { + using type = typename T::execution_space; +}; + +template <class T> +using detected_execution_space_t = + typename is_detected_execution_space<T>::type; + +template <class T, class = void> +struct is_detected_device_type : std::false_type { + using type = not_a_type; +}; + +template 
<class T> +struct is_detected_device_type<T, void_t<typename T::device_type>> + : std::true_type { + using type = typename T::device_type; +}; + +template <class T> +using detected_device_type_t = typename is_detected_device_type<T>::type; + +//---------------------------------------------------------------------------- +/** \brief Given a Functor and Execution Policy query an execution space. + * + * if the Policy has an execution space use that + * else if the Functor has an execution_space use that + * else if the Functor has a device_type use that for backward compatibility + * else use the default + */ + +template <class Functor, class Policy> +struct FunctorPolicyExecutionSpace { + using execution_space = std::conditional_t< + is_detected_execution_space<Policy>::value, + detected_execution_space_t<Policy>, + std::conditional_t< + is_detected_execution_space<Functor>::value, + detected_execution_space_t<Functor>, + std::conditional_t< + is_detected_device_type<Functor>::value, + detected_execution_space_t<detected_device_type_t<Functor>>, + Kokkos::DefaultExecutionSpace>>>; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Execute \c functor in parallel according to the execution \c policy. + * + * A "functor" is a class containing the function to execute in parallel, + * data needed for that execution, and an optional \c execution_space + * alias. Here is an example functor for parallel_for: + * + * \code + * class FunctorType { + * public: + * using execution_space = ...; + * void operator() ( WorkType iwork ) const ; + * }; + * \endcode + * + * In the above example, \c WorkType is any integer type for which a + * valid conversion from \c size_t to \c IntType exists. 
Its + * <tt>operator()</tt> method defines the operation to parallelize, + * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>. + * This compares to a single iteration \c iwork of a \c for loop. + * If \c execution_space is not defined DefaultExecutionSpace will be used. + */ +template <class ExecPolicy, class FunctorType> +inline void parallel_for( + const ExecPolicy& policy, const FunctorType& functor, + const std::string& str = "", + typename std::enable_if< + Kokkos::Impl::is_execution_policy<ExecPolicy>::value>::type* = + nullptr) { + uint64_t kpID = 0; + + ExecPolicy inner_policy = policy; + Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID); + + Kokkos::Impl::shared_allocation_tracking_disable(); + Impl::ParallelFor<FunctorType, ExecPolicy> closure(functor, inner_policy); + Kokkos::Impl::shared_allocation_tracking_enable(); + + closure.execute(); + + Kokkos::Tools::Impl::end_parallel_for(inner_policy, functor, str, kpID); +} + +template <class FunctorType> +inline void parallel_for(const size_t work_count, const FunctorType& functor, + const std::string& str = "") { + using execution_space = + typename Impl::FunctorPolicyExecutionSpace<FunctorType, + void>::execution_space; + using policy = RangePolicy<execution_space>; + + uint64_t kpID = 0; + + policy execution_policy = policy(0, work_count); + + Kokkos::Tools::Impl::begin_parallel_for(execution_policy, functor, str, kpID); + + Kokkos::Impl::shared_allocation_tracking_disable(); + Impl::ParallelFor<FunctorType, policy> closure(functor, execution_policy); + Kokkos::Impl::shared_allocation_tracking_enable(); + + closure.execute(); + + Kokkos::Tools::Impl::end_parallel_for(execution_policy, functor, str, kpID); +} + +template <class ExecPolicy, class FunctorType> +inline void parallel_for(const std::string& str, const ExecPolicy& policy, + const FunctorType& functor) { +#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG Start 
parallel_for kernel: " << str << std::endl; +#endif + + ::Kokkos::parallel_for(policy, functor, str); + +#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG End parallel_for kernel: " << str << std::endl; +#endif + (void)str; +} + +} // namespace Kokkos + +#include <Kokkos_Parallel_Reduce.hpp> +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/// \fn parallel_scan +/// \tparam ExecutionPolicy The execution policy type. +/// \tparam FunctorType The scan functor type. +/// +/// \param policy [in] The execution policy. +/// \param functor [in] The scan functor. +/// +/// This function implements a parallel scan pattern. The scan can +/// be either inclusive or exclusive, depending on how you implement +/// the scan functor. +/// +/// A scan functor looks almost exactly like a reduce functor, except +/// that its operator() takes a third \c bool argument, \c final_pass, +/// which indicates whether this is the last pass of the scan +/// operation. We will show below how to use the \c final_pass +/// argument to control whether the scan is inclusive or exclusive. +/// +/// Here is the minimum required interface of a scan functor for a POD +/// (plain old data) value type \c PodType. That is, the result is a +/// View of zero or more PodType. It is also possible for the result +/// to be an array of (same-sized) arrays of PodType, but we do not +/// show the required interface for that here. +/// \code +/// template< class ExecPolicy , class FunctorType > +/// class ScanFunctor { +/// public: +/// // The Kokkos device type +/// using execution_space = ...; +/// // Type of an entry of the array containing the result; +/// // also the type of each of the entries combined using +/// // operator() or join(). 
+/// using value_type = PodType;
+///
+/// void operator () (const ExecPolicy::member_type & i, value_type& update,
+///                   const bool final_pass) const;
+/// void init (value_type& update) const;
+/// void join (volatile value_type& update,
+///            volatile const value_type& input) const;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an inclusive plus-scan
+/// of an array of \c int, in place. If given an array [1, 2, 3, 4], this
+/// scan will overwrite that array with [1, 3, 6, 10].
+///
+/// \code
+/// template<class SpaceType>
+/// class InclScanFunctor {
+/// public:
+/// using execution_space = SpaceType;
+/// using value_type = int;
+/// using size_type = typename SpaceType::size_type;
+///
+/// InclScanFunctor( Kokkos::View<value_type*, execution_space> x
+/// , Kokkos::View<value_type*, execution_space> y ) : m_x(x),
+/// m_y(y) {}
+///
+/// void operator () (const size_type i, value_type& update, const bool
+/// final_pass) const {
+/// update += m_x(i);
+/// if (final_pass) {
+/// m_y(i) = update;
+/// }
+/// }
+/// void init (value_type& update) const {
+/// update = 0;
+/// }
+/// void join (volatile value_type& update, volatile const value_type& input)
+/// const {
+/// update += input;
+/// }
+///
+/// private:
+/// Kokkos::View<value_type*, execution_space> m_x;
+/// Kokkos::View<value_type*, execution_space> m_y;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an <i>exclusive</i>
+/// scan of an array of \c int, in place. In operator(), note both
+/// that the final_pass test and the update have switched places, and
+/// the use of a temporary. If given an array [1, 2, 3, 4], this scan
+/// will overwrite that array with [0, 1, 3, 6].
+/// +/// \code +/// template<class SpaceType> +/// class ExclScanFunctor { +/// public: +/// using execution_space = SpaceType; +/// using value_type = int; +/// using size_type = typename SpaceType::size_type; +/// +/// ExclScanFunctor (Kokkos::View<value_type*, execution_space> x) : x_ (x) {} +/// +/// void operator () (const size_type i, value_type& update, const bool +/// final_pass) const { +/// const value_type x_i = x_(i); +/// if (final_pass) { +/// x_(i) = update; +/// } +/// update += x_i; +/// } +/// void init (value_type& update) const { +/// update = 0; +/// } +/// void join (volatile value_type& update, volatile const value_type& input) +/// const { +/// update += input; +/// } +/// +/// private: +/// Kokkos::View<value_type*, execution_space> x_; +/// }; +/// \endcode +/// +/// Here is an example of a functor which builds on the above +/// exclusive scan example, to compute an offsets array from a +/// population count array, in place. We assume that the pop count +/// array has an extra entry at the end to store the final count. If +/// given an array [1, 2, 3, 4, 0], this scan will overwrite that +/// array with [0, 1, 3, 6, 10]. +/// +/// \code +/// template<class SpaceType> +/// class OffsetScanFunctor { +/// public: +/// using execution_space = SpaceType; +/// using value_type = int; +/// using size_type = typename SpaceType::size_type; +/// +/// // lastIndex_ is the last valid index (zero-based) of x. +/// // If x has length zero, then lastIndex_ won't be used anyway. +/// OffsetScanFunctor( Kokkos::View<value_type*, execution_space> x +/// , Kokkos::View<value_type*, execution_space> y ) +/// : m_x(x), m_y(y), last_index_ (x.dimension_0 () == 0 ? 0 : +/// x.dimension_0 () - 1) +/// {} +/// +/// void operator () (const size_type i, int& update, const bool final_pass) +/// const { +/// if (final_pass) { +/// m_y(i) = update; +/// } +/// update += m_x(i); +/// // The last entry of m_y gets the final sum. 
+/// if (final_pass && i == last_index_) {
+/// m_y(i+1) = update;
+/// }
+/// }
+/// void init (value_type& update) const {
+/// update = 0;
+/// }
+/// void join (volatile value_type& update, volatile const value_type& input)
+/// const {
+/// update += input;
+/// }
+///
+/// private:
+/// Kokkos::View<value_type*, execution_space> m_x;
+/// Kokkos::View<value_type*, execution_space> m_y;
+/// const size_type last_index_;
+/// };
+/// \endcode
+///
+template <class ExecutionPolicy, class FunctorType>
+inline void parallel_scan(
+    const ExecutionPolicy& policy, const FunctorType& functor,
+    const std::string& str = "",
+    typename std::enable_if<
+        Kokkos::Impl::is_execution_policy<ExecutionPolicy>::value>::type* =
+        nullptr) {
+  uint64_t kpID = 0;
+  ExecutionPolicy inner_policy = policy;
+  Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
+
+  Kokkos::Impl::shared_allocation_tracking_disable();
+  Impl::ParallelScan<FunctorType, ExecutionPolicy> closure(functor,
+                                                           inner_policy);
+  Kokkos::Impl::shared_allocation_tracking_enable();
+
+  closure.execute();
+
+  Kokkos::Tools::Impl::end_parallel_scan(inner_policy, functor, str, kpID);
+}
+
+template <class FunctorType>
+inline void parallel_scan(const size_t work_count, const FunctorType& functor,
+                          const std::string& str = "") {
+  using execution_space =
+      typename Kokkos::Impl::FunctorPolicyExecutionSpace<FunctorType,
+                                                         void>::execution_space;
+
+  using policy = Kokkos::RangePolicy<execution_space>;
+
+  uint64_t kpID = 0;
+  policy execution_policy(0, work_count);
+  Kokkos::Tools::Impl::begin_parallel_scan(execution_policy, functor, str,
+                                           kpID);
+  Kokkos::Impl::shared_allocation_tracking_disable();
+  Impl::ParallelScan<FunctorType, policy> closure(functor, execution_policy);
+  Kokkos::Impl::shared_allocation_tracking_enable();
+
+  closure.execute();
+
+  Kokkos::Tools::Impl::end_parallel_scan(execution_policy, functor, str, kpID);
+}
+
+template <class ExecutionPolicy, class FunctorType>
+inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, + const FunctorType& functor) { +#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl; +#endif + + ::Kokkos::parallel_scan(policy, functor, str); + +#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl; +#endif + (void)str; +} + +template <class ExecutionPolicy, class FunctorType, class ReturnType> +inline void parallel_scan( + const ExecutionPolicy& policy, const FunctorType& functor, + ReturnType& return_value, const std::string& str = "", + typename std::enable_if< + Kokkos::Impl::is_execution_policy<ExecutionPolicy>::value>::type* = + nullptr) { + uint64_t kpID = 0; + ExecutionPolicy inner_policy = policy; + Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); + + Kokkos::Impl::shared_allocation_tracking_disable(); + Impl::ParallelScanWithTotal<FunctorType, ExecutionPolicy, ReturnType> closure( + functor, inner_policy, return_value); + Kokkos::Impl::shared_allocation_tracking_enable(); + + closure.execute(); + + Kokkos::Tools::Impl::end_parallel_scan(inner_policy, functor, str, kpID); + + policy.space().fence(); +} + +template <class FunctorType, class ReturnType> +inline void parallel_scan(const size_t work_count, const FunctorType& functor, + ReturnType& return_value, + const std::string& str = "") { + using execution_space = + typename Kokkos::Impl::FunctorPolicyExecutionSpace<FunctorType, + void>::execution_space; + + using policy = Kokkos::RangePolicy<execution_space>; + + policy execution_policy(0, work_count); + uint64_t kpID = 0; + Kokkos::Tools::Impl::begin_parallel_scan(execution_policy, functor, str, + kpID); + + Kokkos::Impl::shared_allocation_tracking_disable(); + Impl::ParallelScanWithTotal<FunctorType, policy, ReturnType> closure( + functor, execution_policy, 
return_value); + Kokkos::Impl::shared_allocation_tracking_enable(); + + closure.execute(); + + Kokkos::Tools::Impl::end_parallel_scan(execution_policy, functor, str, kpID); + + execution_space().fence(); +} + +template <class ExecutionPolicy, class FunctorType, class ReturnType> +inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, + const FunctorType& functor, + ReturnType& return_value) { +#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl; +#endif + + ::Kokkos::parallel_scan(policy, functor, return_value, str); + +#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES + Kokkos::fence(); + std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl; +#endif + (void)str; +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, + bool HasTeamShmemSize = + has_member_team_shmem_size<FunctorType>::value, + bool HasShmemSize = has_member_shmem_size<FunctorType>::value> +struct FunctorTeamShmemSize { + KOKKOS_INLINE_FUNCTION static size_t value(const FunctorType&, int) { + return 0; + } +}; + +template <class FunctorType> +struct FunctorTeamShmemSize<FunctorType, true, false> { + static inline size_t value(const FunctorType& f, int team_size) { + return f.team_shmem_size(team_size); + } +}; + +template <class FunctorType> +struct FunctorTeamShmemSize<FunctorType, false, true> { + static inline size_t value(const FunctorType& f, int team_size) { + return f.shmem_size(team_size); + } +}; +template <class FunctorType> +struct FunctorTeamShmemSize<FunctorType, true, true> { + static inline size_t value(const FunctorType& /*f*/, int /*team_size*/) { + Kokkos::abort( + "Functor with both team_shmem_size and shmem_size defined is " + "not allowed"); + 
return 0; + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* KOKKOS_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp new file mode 100644 index 0000000000000000000000000000000000000000..96242f99b0ca678e1ede6f148ae5d90a16127afe --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -0,0 +1,1210 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_PARALLEL_REDUCE_HPP +#define KOKKOS_PARALLEL_REDUCE_HPP + +#include <Kokkos_NumericTraits.hpp> +#include <Kokkos_View.hpp> +#include <impl/Kokkos_FunctorAnalysis.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> +#include <type_traits> + +namespace Kokkos { + +template <class T, class Enable = void> +struct is_reducer_type { + enum { value = 0 }; +}; + +template <class T> +struct is_reducer_type< + T, typename std::enable_if<std::is_same< + typename std::remove_cv<T>::type, + typename std::remove_cv<typename T::reducer>::type>::value>::type> { + enum { value = 1 }; +}; + +template <class Scalar, class Space> +struct Sum { + public: + // Required + using reducer = Sum<Scalar, Space>; + using value_type = typename std::remove_cv<Scalar>::type; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + Sum(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + Sum(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { dest += src; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, 
const volatile value_type& src) const { + dest += src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val = reduction_identity<value_type>::sum(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Space> +struct Prod { + public: + // Required + using reducer = Prod<Scalar, Space>; + using value_type = typename std::remove_cv<Scalar>::type; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + Prod(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + Prod(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { dest *= src; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest *= src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val = reduction_identity<value_type>::prod(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Space> +struct Min { + public: + // Required + using reducer = Min<Scalar, Space>; + using value_type = typename std::remove_cv<Scalar>::type; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + Min(value_type& value_) : value(&value_), references_scalar_v(true) {} + + 
KOKKOS_INLINE_FUNCTION + Min(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if (src < dest) dest = src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if (src < dest) dest = src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val = reduction_identity<value_type>::min(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Space> +struct Max { + public: + // Required + using reducer = Max<Scalar, Space>; + using value_type = typename std::remove_cv<Scalar>::type; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + Max(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + Max(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if (src > dest) dest = src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if (src > dest) dest = src; + } + + // Required + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val = reduction_identity<value_type>::max(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Space> +struct 
LAnd { + public: + // Required + using reducer = LAnd<Scalar, Space>; + using value_type = typename std::remove_cv<Scalar>::type; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + LAnd(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + LAnd(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest && src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest && src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val = reduction_identity<value_type>::land(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Space> +struct LOr { + public: + // Required + using reducer = LOr<Scalar, Space>; + using value_type = typename std::remove_cv<Scalar>::type; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + LOr(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + LOr(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest || src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest || src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val = 
reduction_identity<value_type>::lor(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Space> +struct BAnd { + public: + // Required + using reducer = BAnd<Scalar, Space>; + using value_type = typename std::remove_cv<Scalar>::type; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + BAnd(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + BAnd(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest & src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest & src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val = reduction_identity<value_type>::band(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Space> +struct BOr { + public: + // Required + using reducer = BOr<Scalar, Space>; + using value_type = typename std::remove_cv<Scalar>::type; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + BOr(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + BOr(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + 
// Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + dest = dest | src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest = dest | src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val = reduction_identity<value_type>::bor(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Index> +struct ValLocScalar { + Scalar val; + Index loc; + + KOKKOS_INLINE_FUNCTION + void operator=(const ValLocScalar& rhs) { + val = rhs.val; + loc = rhs.loc; + } + + KOKKOS_INLINE_FUNCTION + void operator=(const volatile ValLocScalar& rhs) volatile { + val = rhs.val; + loc = rhs.loc; + } +}; + +template <class Scalar, class Index, class Space> +struct MinLoc { + private: + using scalar_type = typename std::remove_cv<Scalar>::type; + using index_type = typename std::remove_cv<Index>::type; + + public: + // Required + using reducer = MinLoc<Scalar, Index, Space>; + using value_type = ValLocScalar<scalar_type, index_type>; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + MinLoc(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + MinLoc(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if (src.val < dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if (src.val < dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) 
const { + val.val = reduction_identity<scalar_type>::min(); + val.loc = reduction_identity<index_type>::min(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Index, class Space> +struct MaxLoc { + private: + using scalar_type = typename std::remove_cv<Scalar>::type; + using index_type = typename std::remove_cv<Index>::type; + + public: + // Required + using reducer = MaxLoc<Scalar, Index, Space>; + using value_type = ValLocScalar<scalar_type, index_type>; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + MaxLoc(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + MaxLoc(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if (src.val > dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if (src.val > dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val.val = reduction_identity<scalar_type>::max(); + val.loc = reduction_identity<index_type>::min(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar> +struct MinMaxScalar { + Scalar min_val, max_val; + + KOKKOS_INLINE_FUNCTION + void operator=(const MinMaxScalar& rhs) { + min_val = rhs.min_val; + max_val = rhs.max_val; + } + + 
KOKKOS_INLINE_FUNCTION + void operator=(const volatile MinMaxScalar& rhs) volatile { + min_val = rhs.min_val; + max_val = rhs.max_val; + } +}; + +template <class Scalar, class Space> +struct MinMax { + private: + using scalar_type = typename std::remove_cv<Scalar>::type; + + public: + // Required + using reducer = MinMax<Scalar, Space>; + using value_type = MinMaxScalar<scalar_type>; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + MinMax(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + MinMax(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + } + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val.max_val = reduction_identity<scalar_type>::max(); + val.min_val = reduction_identity<scalar_type>::min(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return references_scalar_v; } +}; + +template <class Scalar, class Index> +struct MinMaxLocScalar { + Scalar min_val, max_val; + Index min_loc, max_loc; + + KOKKOS_INLINE_FUNCTION + void operator=(const MinMaxLocScalar& rhs) { + min_val = rhs.min_val; + min_loc = rhs.min_loc; + max_val = rhs.max_val; + max_loc = rhs.max_loc; + } + + 
KOKKOS_INLINE_FUNCTION + void operator=(const volatile MinMaxLocScalar& rhs) volatile { + min_val = rhs.min_val; + min_loc = rhs.min_loc; + max_val = rhs.max_val; + max_loc = rhs.max_loc; + } +}; + +template <class Scalar, class Index, class Space> +struct MinMaxLoc { + private: + using scalar_type = typename std::remove_cv<Scalar>::type; + using index_type = typename std::remove_cv<Index>::type; + + public: + // Required + using reducer = MinMaxLoc<Scalar, Index, Space>; + using value_type = MinMaxLocScalar<scalar_type, index_type>; + + using result_view_type = Kokkos::View<value_type, Space>; + + private: + result_view_type value; + bool references_scalar_v; + + public: + KOKKOS_INLINE_FUNCTION + MinMaxLoc(value_type& value_) : value(&value_), references_scalar_v(true) {} + + KOKKOS_INLINE_FUNCTION + MinMaxLoc(const result_view_type& value_) + : value(value_), references_scalar_v(false) {} + + // Required + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + dest.max_loc = src.max_loc; + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + dest.max_loc = src.max_loc; + } + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { + val.max_val = reduction_identity<scalar_type>::max(); + val.min_val = reduction_identity<scalar_type>::min(); + val.max_loc = reduction_identity<index_type>::min(); + val.min_loc = reduction_identity<index_type>::min(); + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + + KOKKOS_INLINE_FUNCTION 
+ bool references_scalar() const { return references_scalar_v; } +}; +} // namespace Kokkos +namespace Kokkos { +namespace Impl { + +template <class T, class ReturnType, class ValueTraits> +struct ParallelReduceReturnValue; + +template <class ReturnType, class FunctorType> +struct ParallelReduceReturnValue< + typename std::enable_if<Kokkos::is_view<ReturnType>::value>::type, + ReturnType, FunctorType> { + using return_type = ReturnType; + using reducer_type = InvalidType; + + using value_type_scalar = typename return_type::value_type; + using value_type_array = typename return_type::value_type* const; + + using value_type = std::conditional_t<return_type::rank == 0, + value_type_scalar, value_type_array>; + + static return_type& return_value(ReturnType& return_val, const FunctorType&) { + return return_val; + } +}; + +template <class ReturnType, class FunctorType> +struct ParallelReduceReturnValue< + typename std::enable_if<!Kokkos::is_view<ReturnType>::value && + (!std::is_array<ReturnType>::value && + !std::is_pointer<ReturnType>::value) && + !Kokkos::is_reducer_type<ReturnType>::value>::type, + ReturnType, FunctorType> { + using return_type = + Kokkos::View<ReturnType, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + + using reducer_type = InvalidType; + + using value_type = typename return_type::value_type; + + static return_type return_value(ReturnType& return_val, const FunctorType&) { + return return_type(&return_val); + } +}; + +template <class ReturnType, class FunctorType> +struct ParallelReduceReturnValue< + typename std::enable_if<(std::is_array<ReturnType>::value || + std::is_pointer<ReturnType>::value)>::type, + ReturnType, FunctorType> { + using return_type = Kokkos::View<typename std::remove_const<ReturnType>::type, + Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + + using reducer_type = InvalidType; + + using value_type = typename return_type::value_type[]; + + static return_type return_value(ReturnType& return_val, + const FunctorType& functor) { + 
if (std::is_array<ReturnType>::value) + return return_type(return_val); + else + return return_type(return_val, functor.value_count); + } +}; + +template <class ReturnType, class FunctorType> +struct ParallelReduceReturnValue< + typename std::enable_if<Kokkos::is_reducer_type<ReturnType>::value>::type, + ReturnType, FunctorType> { + using return_type = ReturnType; + using reducer_type = ReturnType; + using value_type = typename return_type::value_type; + + static return_type return_value(ReturnType& return_val, const FunctorType&) { + return return_val; + } +}; + +template <class T, class ReturnType, class FunctorType> +struct ParallelReducePolicyType; + +template <class PolicyType, class FunctorType> +struct ParallelReducePolicyType< + typename std::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value>::type, + PolicyType, FunctorType> { + using policy_type = PolicyType; + static PolicyType policy(const PolicyType& policy_) { return policy_; } +}; + +template <class PolicyType, class FunctorType> +struct ParallelReducePolicyType< + typename std::enable_if<std::is_integral<PolicyType>::value>::type, + PolicyType, FunctorType> { + using execution_space = + typename Impl::FunctorPolicyExecutionSpace<FunctorType, + void>::execution_space; + + using policy_type = Kokkos::RangePolicy<execution_space>; + + static policy_type policy(const PolicyType& policy_) { + return policy_type(0, policy_); + } +}; + +template <class FunctorType, class ExecPolicy, class ValueType, + class ExecutionSpace> +struct ParallelReduceFunctorType { + using functor_type = FunctorType; + static const functor_type& functor(const functor_type& functor) { + return functor; + } +}; + +template <class PolicyType, class FunctorType, class ReturnType> +struct ParallelReduceAdaptor { + using return_value_adapter = + Impl::ParallelReduceReturnValue<void, ReturnType, FunctorType>; +#ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER + using functor_adaptor = + Impl::ParallelReduceFunctorType<FunctorType, 
PolicyType, + typename return_value_adapter::value_type, + typename PolicyType::execution_space>; +#endif + static inline void execute(const std::string& label, const PolicyType& policy, + const FunctorType& functor, + ReturnType& return_value) { + uint64_t kpID = 0; + + PolicyType inner_policy = policy; + Kokkos::Tools::Impl::begin_parallel_reduce< + typename return_value_adapter::reducer_type>(inner_policy, functor, + label, kpID); + + Kokkos::Impl::shared_allocation_tracking_disable(); +#ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER + Impl::ParallelReduce<typename functor_adaptor::functor_type, PolicyType, + typename return_value_adapter::reducer_type> + closure(functor_adaptor::functor(functor), inner_policy, + return_value_adapter::return_value(return_value, functor)); +#else + Impl::ParallelReduce<FunctorType, PolicyType, + typename return_value_adapter::reducer_type> + closure(functor, inner_policy, + return_value_adapter::return_value(return_value, functor)); +#endif + Kokkos::Impl::shared_allocation_tracking_enable(); + closure.execute(); + + Kokkos::Tools::Impl::end_parallel_reduce< + typename return_value_adapter::reducer_type>(inner_policy, functor, + label, kpID); + } +}; +} // namespace Impl + +//---------------------------------------------------------------------------- + +/*! \fn void parallel_reduce(label,policy,functor,return_argument) + \brief Perform a parallel reduction. + \param label An optional Label giving the call name. Must be able to + construct a std::string from the argument. + \param policy A Kokkos Execution Policy, such as an integer, a RangePolicy or a TeamPolicy. + \param functor A functor with a reduction operator, and optional init, join and final functions. + \param return_argument A return argument which can be a scalar, a + View, or a ReducerStruct. This argument can be left out if the functor has a + final function.
+*/ + +// Parallel Reduce Blocking behavior + +namespace Impl { +template <typename T> +struct ReducerHasTestReferenceFunction { + template <typename E> + static std::true_type test_func(decltype(&E::references_scalar)); + template <typename E> + static std::false_type test_func(...); + + enum { + value = std::is_same<std::true_type, decltype(test_func<T>(nullptr))>::value + }; +}; + +template <class ExecutionSpace, class Arg> +constexpr std::enable_if_t< + // constraints only necessary because SFINAE lacks subsumption + !ReducerHasTestReferenceFunction<Arg>::value && + !Kokkos::is_view<Arg>::value, + // return type: + bool> +parallel_reduce_needs_fence(ExecutionSpace const&, Arg const&) { + return true; +} + +template <class ExecutionSpace, class Reducer> +constexpr std::enable_if_t< + // equivalent to: + // (requires (Reducer const& r) { + // { reducer.references_scalar() } -> std::convertible_to<bool>; + // }) + ReducerHasTestReferenceFunction<Reducer>::value, + // return type: + bool> +parallel_reduce_needs_fence(ExecutionSpace const&, Reducer const& reducer) { + return reducer.references_scalar(); +} + +template <class ExecutionSpace, class ViewLike> +constexpr std::enable_if_t< + // requires Kokkos::ViewLike<ViewLike> + Kokkos::is_view<ViewLike>::value, + // return type: + bool> +parallel_reduce_needs_fence(ExecutionSpace const&, ViewLike const&) { + return false; +} + +template <class ExecutionSpace, class... Args> +struct ParallelReduceFence { + template <class... ArgsDeduced> + static void fence(const ExecutionSpace& ex, ArgsDeduced&&... args) { + if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) { + ex.fence(); + } + } +}; + +} // namespace Impl + +/** \brief Parallel reduction + * + * parallel_reduce performs parallel reductions with arbitrary functions - i.e. + * it is not solely data based. 
The call expects up to 4 arguments: + * + * + * Example of a parallel_reduce functor for a POD (plain old data) value type: + * \code + * class FunctorType { // For POD value type + * public: + * using execution_space = ...; + * using value_type = <podType>; + * void operator()( <intType> iwork , <podType> & update ) const ; + * void init( <podType> & update ) const ; + * void join( volatile <podType> & update , + * volatile const <podType> & input ) const ; + * + * using has_final = true_type; + * void final( <podType> & update ) const ; + * }; + * \endcode + * + * Example of a parallel_reduce functor for an array of POD (plain old data) + * values: + * \code + * class FunctorType { // For array of POD value + * public: + * using execution_space = ...; + * using value_type = <podType>[]; + * void operator()( <intType> , <podType> update[] ) const ; + * void init( <podType> update[] ) const ; + * void join( volatile <podType> update[] , + * volatile const <podType> input[] ) const ; + * + * using has_final = true_type; + * void final( <podType> update[] ) const ; + * }; + * \endcode + */ + +// ReturnValue is scalar or array: take by reference + +template <class PolicyType, class FunctorType, class ReturnType> +inline typename std::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value>::type +parallel_reduce(const std::string& label, const PolicyType& policy, + const FunctorType& functor, ReturnType& return_value) { + Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute( + label, policy, functor, return_value); + Impl::ParallelReduceFence<typename PolicyType::execution_space, + ReturnType>::fence(policy.space(), return_value); +} + +template <class PolicyType, class FunctorType, class ReturnType> +inline typename std::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value>::type +parallel_reduce(const PolicyType& policy, const FunctorType& functor, + ReturnType& return_value) { + Impl::ParallelReduceAdaptor<PolicyType, 
FunctorType, ReturnType>::execute( + "", policy, functor, return_value); + Impl::ParallelReduceFence<typename PolicyType::execution_space, + ReturnType>::fence(policy.space(), return_value); +} + +template <class FunctorType, class ReturnType> +inline void parallel_reduce(const size_t& policy, const FunctorType& functor, + ReturnType& return_value) { + using policy_type = + typename Impl::ParallelReducePolicyType<void, size_t, + FunctorType>::policy_type; + Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute( + "", policy_type(0, policy), functor, return_value); + Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>:: + fence(typename policy_type::execution_space(), return_value); +} + +template <class FunctorType, class ReturnType> +inline void parallel_reduce(const std::string& label, const size_t& policy, + const FunctorType& functor, + ReturnType& return_value) { + using policy_type = + typename Impl::ParallelReducePolicyType<void, size_t, + FunctorType>::policy_type; + Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute( + label, policy_type(0, policy), functor, return_value); + Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>:: + fence(typename policy_type::execution_space(), return_value); +} + +// ReturnValue as View or Reducer: take by copy to allow for inline construction + +template <class PolicyType, class FunctorType, class ReturnType> +inline typename std::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value>::type +parallel_reduce(const std::string& label, const PolicyType& policy, + const FunctorType& functor, const ReturnType& return_value) { + ReturnType return_value_impl = return_value; + Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute( + label, policy, functor, return_value_impl); + Impl::ParallelReduceFence<typename PolicyType::execution_space, + ReturnType>::fence(policy.space(), return_value); +} + +template 
<class PolicyType, class FunctorType, class ReturnType> +inline typename std::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value>::type +parallel_reduce(const PolicyType& policy, const FunctorType& functor, + const ReturnType& return_value) { + ReturnType return_value_impl = return_value; + Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute( + "", policy, functor, return_value_impl); + Impl::ParallelReduceFence<typename PolicyType::execution_space, + ReturnType>::fence(policy.space(), return_value); +} + +template <class FunctorType, class ReturnType> +inline void parallel_reduce(const size_t& policy, const FunctorType& functor, + const ReturnType& return_value) { + using policy_type = + typename Impl::ParallelReducePolicyType<void, size_t, + FunctorType>::policy_type; + ReturnType return_value_impl = return_value; + Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute( + "", policy_type(0, policy), functor, return_value_impl); + Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>:: + fence(typename policy_type::execution_space(), return_value); +} + +template <class FunctorType, class ReturnType> +inline void parallel_reduce(const std::string& label, const size_t& policy, + const FunctorType& functor, + const ReturnType& return_value) { + using policy_type = + typename Impl::ParallelReducePolicyType<void, size_t, + FunctorType>::policy_type; + ReturnType return_value_impl = return_value; + Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute( + label, policy_type(0, policy), functor, return_value_impl); + Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>:: + fence(typename policy_type::execution_space(), return_value); +} + +// No Return Argument + +template <class PolicyType, class FunctorType> +inline void parallel_reduce( + const std::string& label, const PolicyType& policy, + const FunctorType& functor, + typename 
std::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* = + nullptr) { + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>; + using value_type = std::conditional_t<(ValueTraits::StaticValueSize != 0), + typename ValueTraits::value_type, + typename ValueTraits::pointer_type>; + + static_assert( + Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType, + FunctorType>::has_final_member_function, + "Calling parallel_reduce without either return value or final function."); + + using result_view_type = + Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + result_view_type result_view; + + Impl::ParallelReduceAdaptor<PolicyType, FunctorType, + result_view_type>::execute(label, policy, functor, + result_view); +} + +template <class PolicyType, class FunctorType> +inline void parallel_reduce( + const PolicyType& policy, const FunctorType& functor, + typename std::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* = + nullptr) { + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>; + using value_type = std::conditional_t<(ValueTraits::StaticValueSize != 0), + typename ValueTraits::value_type, + typename ValueTraits::pointer_type>; + + static_assert( + Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, PolicyType, + FunctorType>::has_final_member_function, + "Calling parallel_reduce without either return value or final function."); + + using result_view_type = + Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + result_view_type result_view; + + Impl::ParallelReduceAdaptor<PolicyType, FunctorType, + result_view_type>::execute("", policy, functor, + result_view); +} + +template <class FunctorType> +inline void parallel_reduce(const size_t& policy, const FunctorType& functor) { + using policy_type = + typename Impl::ParallelReducePolicyType<void, size_t, + FunctorType>::policy_type; + using ValueTraits = 
Kokkos::Impl::FunctorValueTraits<FunctorType, void>; + using value_type = std::conditional_t<(ValueTraits::StaticValueSize != 0), + typename ValueTraits::value_type, + typename ValueTraits::pointer_type>; + + static_assert( + Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, + RangePolicy<>, + FunctorType>::has_final_member_function, + "Calling parallel_reduce without either return value or final function."); + + using result_view_type = + Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + result_view_type result_view; + + Impl::ParallelReduceAdaptor<policy_type, FunctorType, + result_view_type>::execute("", + policy_type(0, policy), + functor, result_view); +} + +template <class FunctorType> +inline void parallel_reduce(const std::string& label, const size_t& policy, + const FunctorType& functor) { + using policy_type = + typename Impl::ParallelReducePolicyType<void, size_t, + FunctorType>::policy_type; + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>; + using value_type = std::conditional_t<(ValueTraits::StaticValueSize != 0), + typename ValueTraits::value_type, + typename ValueTraits::pointer_type>; + + static_assert( + Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE, + RangePolicy<>, + FunctorType>::has_final_member_function, + "Calling parallel_reduce without either return value or final function."); + + using result_view_type = + Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + result_view_type result_view; + + Impl::ParallelReduceAdaptor<policy_type, FunctorType, + result_view_type>::execute(label, + policy_type(0, policy), + functor, result_view); +} + +} // namespace Kokkos + +#endif // KOKKOS_PARALLEL_REDUCE_HPP diff --git a/packages/kokkos/core/src/Kokkos_PointerOwnership.hpp b/packages/kokkos/core/src/Kokkos_PointerOwnership.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f1f168c38fea159835b34c1c25e0479f653cc76a --- /dev/null +++ 
b/packages/kokkos/core/src/Kokkos_PointerOwnership.hpp @@ -0,0 +1,72 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_POINTEROWNERSHIP_HPP +#define KOKKOS_IMPL_POINTEROWNERSHIP_HPP + +#include <Kokkos_Macros.hpp> + +#include <Kokkos_Core_fwd.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/// Trivial wrapper for raw pointers that express ownership. +template <class T> +using OwningRawPtr = T*; + +/// Trivial wrapper for raw pointers that do not express ownership. +template <class T> +using ObservingRawPtr = T*; + +} // end namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_IMPL_POINTEROWNERSHIP_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp new file mode 100644 index 0000000000000000000000000000000000000000..241a3a13a9c4d682785f274d4616c3f17cb4c9a5 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -0,0 +1,95 @@ +/* + //@HEADER + // ************************************************************************ + // + // Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). + // + // Under the terms of Contract DE-NA0003525 with NTESS, + // the U.S. Government retains certain rights in this software. + // + // Redistribution and use in source and binary forms, with or without + // modification, are permitted provided that the following conditions are + // met: + // + // 1. Redistributions of source code must retain the above copyright + // notice, this list of conditions and the following disclaimer. 
+ // + // 2. Redistributions in binary form must reproduce the above copyright + // notice, this list of conditions and the following disclaimer in the + // documentation and/or other materials provided with the distribution. + // + // 3. Neither the name of the Corporation nor the names of the + // contributors may be used to endorse or promote products derived from + // this software without specific prior written permission. + // + // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // + // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + // + // ************************************************************************ + //@HEADER + */ + +#ifndef KOKKOSP_PROFILE_SECTION_HPP +#define KOKKOSP_PROFILE_SECTION_HPP + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_Profiling_Interface.hpp> +#include <impl/Kokkos_Profiling.hpp> + +#include <string> + +namespace Kokkos { +namespace Profiling { + +class ProfilingSection { + public: + ProfilingSection(const std::string& sectionName) : secName(sectionName) { + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::createProfileSection(secName, &secID); + } + } + + void start() { + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::startSection(secID); + } + } + + void stop() { + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::stopSection(secID); + } + } + + ~ProfilingSection() { + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::destroyProfileSection(secID); + } + } + + std::string getName() { return secName; } + + uint32_t getSectionID() { return secID; } + + protected: + const std::string secName; + uint32_t secID; +}; + +} // namespace Profiling +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/Kokkos_SYCL.hpp b/packages/kokkos/core/src/Kokkos_SYCL.hpp new file mode 100644 index 0000000000000000000000000000000000000000..aa720371df73cb1ad7bba8191e5c6d83c6c317c5 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_SYCL.hpp @@ -0,0 +1,186 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_SYCL_HPP
#define KOKKOS_SYCL_HPP

#include <Kokkos_Macros.hpp>

#ifdef KOKKOS_ENABLE_SYCL
#include <CL/sycl.hpp>
#include <Kokkos_SYCL_Space.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <impl/Kokkos_ExecSpaceInitializer.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <impl/Kokkos_HostSharedPtr.hpp>

namespace Kokkos {
namespace Experimental {
namespace Impl {
class SYCLInternal;
}

/// \class SYCL
/// \brief Kokkos execution space backed by a sycl::queue; the default
/// memory space is device unified-shared-memory (SYCLDeviceUSMSpace).
class SYCL {
 public:
  //------------------------------------
  //! \name Type declarations that all Kokkos devices must provide.
  //@{

  //! Tag this class as a kokkos execution space
  using execution_space = SYCL;
  using memory_space    = SYCLDeviceUSMSpace;
  using device_type     = Kokkos::Device<execution_space, memory_space>;

  using array_layout = LayoutLeft;
  using size_type    = memory_space::size_type;

  using scratch_memory_space = ScratchMemorySpace<SYCL>;

  SYCL();
  explicit SYCL(const sycl::queue&);

  uint32_t impl_instance_id() const noexcept { return 0; }

  // Context of the queue owned by the shared internal instance state.
  sycl::context sycl_context() const noexcept {
    return m_space_instance->m_queue->get_context();
  };

  //@}
  //------------------------------------
  //! \name Functions that all Kokkos devices must implement.
  //@{

  // True only when compiled for the SYCL device pass (i.e. inside a kernel).
  KOKKOS_INLINE_FUNCTION static int in_parallel() {
#if defined(__SYCL_DEVICE_ONLY__)
    return true;
#else
    return false;
#endif
  }

  /** \brief Set the device in a "sleep" state. */
  static bool sleep();

  /** \brief Wake the device from the 'sleep' state. */
  static bool wake();

  /** \brief Wait until all dispatched functors complete. */
  static void impl_static_fence();
  void fence() const;

  /// \brief Print configuration information to the given output stream.
  static void print_configuration(std::ostream&, const bool detail = false);

  /// \brief Free any resources being consumed by the device.
  static void impl_finalize();

  /** \brief Initialize the device.
   *
   */

  // Wraps device selection: by default selector, explicit device, custom
  // selector, or numeric id. Streamable for diagnostics.
  struct SYCLDevice {
    SYCLDevice() : SYCLDevice(sycl::default_selector()) {}
    explicit SYCLDevice(sycl::device d);
    explicit SYCLDevice(const sycl::device_selector& selector);
    explicit SYCLDevice(size_t id);

    sycl::device get_device() const;

    friend std::ostream& operator<<(std::ostream& os, const SYCLDevice& that) {
      return that.info(os);
    }

   private:
    std::ostream& info(std::ostream& os) const;

    sycl::device m_device;
  };

  static void impl_initialize(SYCLDevice = SYCLDevice());

  int sycl_device() const;

  static bool impl_is_initialized();

  static int concurrency();
  static const char* name();

  // Non-owning view of the per-instance internal state.
  inline Impl::SYCLInternal* impl_internal_space_instance() const {
    return m_space_instance.get();
  }

 private:
  // Shared so copies of this execution space refer to the same queue/state.
  Kokkos::Impl::HostSharedPtr<Impl::SYCLInternal> m_space_instance;
};

namespace Impl {

// Hooks SYCL into Kokkos::initialize/finalize dispatch.
class SYCLSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase {
 public:
  void initialize(const InitArguments& args) final;
  void finalize(const bool) final;
  void fence() final;
  void print_configuration(std::ostream& msg, const bool detail) final;
};

}  // namespace Impl
}  // namespace Experimental

namespace Tools {
namespace Experimental {
template <>
struct DeviceTypeTraits<Kokkos::Experimental::SYCL> {
  /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling
  static constexpr DeviceType id = DeviceType::SYCL;
};
}  // namespace Experimental
}  // namespace Tools

}  // namespace Kokkos

#endif
#endif
diff --git a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
new file
mode 100644 index 0000000000000000000000000000000000000000..392ab0e59a7d01f42342318bb44aa172bcb4f705 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp @@ -0,0 +1,284 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_SYCLSPACE_HPP
#define KOKKOS_SYCLSPACE_HPP

#include <Kokkos_Core_fwd.hpp>

#ifdef KOKKOS_ENABLE_SYCL
#include <Kokkos_Concepts.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <SYCL/Kokkos_SYCL_Instance.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>
#include <impl/Kokkos_Tools.hpp>

namespace Kokkos {
namespace Experimental {

// Memory space for device-resident USM allocations; allocates/deallocates
// through the stored sycl::queue. Not directly accessible from host code
// (see the MemorySpaceAccess specializations below).
class SYCLDeviceUSMSpace {
 public:
  using execution_space = SYCL;
  using memory_space    = SYCLDeviceUSMSpace;
  using device_type     = Kokkos::Device<execution_space, memory_space>;
  using size_type       = Impl::SYCLInternal::size_type;

  SYCLDeviceUSMSpace();
  explicit SYCLDeviceUSMSpace(sycl::queue queue);

  void* allocate(const std::size_t arg_alloc_size) const;
  // Labeled overload; arg_logical_size is reported to profiling tools.
  void* allocate(const char* arg_label, const size_t arg_alloc_size,
                 const size_t arg_logical_size = 0) const;

  void deallocate(void* const arg_alloc_ptr,
                  const std::size_t arg_alloc_size) const;
  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
                  const size_t arg_alloc_size,
                  const size_t arg_logical_size = 0) const;

 private:
  template <class, class, class, class>
  friend class LogicalMemorySpace;

 public:
  static constexpr const char* name() { return "SYCLDeviceUSM"; };

 private:
  sycl::queue m_queue;
};

// Memory space for shared (host+device accessible) USM allocations.
// Mirrors SYCLDeviceUSMSpace's interface.
class SYCLSharedUSMSpace {
 public:
  using execution_space = SYCL;
  using memory_space    = SYCLSharedUSMSpace;
  using device_type     = Kokkos::Device<execution_space, memory_space>;
  using size_type       = Impl::SYCLInternal::size_type;

  SYCLSharedUSMSpace();
  explicit SYCLSharedUSMSpace(sycl::queue queue);

  void* allocate(const std::size_t arg_alloc_size) const;
  void* allocate(const char* arg_label, const size_t arg_alloc_size,
                 const size_t arg_logical_size = 0) const;

  void deallocate(void* const arg_alloc_ptr,
                  const std::size_t arg_alloc_size) const;
  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
                  const size_t arg_alloc_size,
                  const size_t arg_logical_size = 0) const;

 private:
  template <class, class, class, class>
  friend class LogicalMemorySpace;

 public:
  static constexpr const char* name() { return "SYCLSharedUSM"; };

 private:
  sycl::queue m_queue;
};
}  // namespace Experimental

namespace Impl {
// Sanity checks: every memory space must be assignable to itself.
static_assert(Kokkos::Impl::MemorySpaceAccess<
                  Kokkos::Experimental::SYCLDeviceUSMSpace,
                  Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
              "");

static_assert(Kokkos::Impl::MemorySpaceAccess<
                  Kokkos::Experimental::SYCLSharedUSMSpace,
                  Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
              "");

// Access-matrix specializations: <AccessingSpace, AccessedSpace>.

template <>
struct MemorySpaceAccess<Kokkos::HostSpace,
                         Kokkos::Experimental::SYCLDeviceUSMSpace> {
  enum : bool { assignable = false };
  enum : bool { accessible = false };
  enum : bool { deepcopy = true };
};

template <>
struct MemorySpaceAccess<Kokkos::HostSpace,
                         Kokkos::Experimental::SYCLSharedUSMSpace> {
  // HostSpace::execution_space != SYCLSharedUSMSpace::execution_space
  enum : bool { assignable = false };
  enum : bool { accessible = true };
  enum : bool { deepcopy = true };
};

template <>
struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
                         Kokkos::HostSpace> {
  enum : bool { assignable = false };
  enum : bool { accessible = false };
  enum : bool { deepcopy = true };
};

template <>
struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
                         Kokkos::Experimental::SYCLSharedUSMSpace> {
  // SYCLDeviceUSMSpace::execution_space == SYCLSharedUSMSpace::execution_space
  enum : bool { assignable = true };
  enum : bool { accessible = true };
  enum : bool { deepcopy = true };
};

//----------------------------------------
// SYCLSharedUSMSpace::execution_space == SYCL
// SYCLSharedUSMSpace accessible to both SYCL and Host

template <>
struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
                         Kokkos::HostSpace> {
  enum : bool { assignable = false };
  enum : bool { accessible = false };  // SYCL cannot access HostSpace
  enum : bool { deepcopy = true };
};

template <>
struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
                         Kokkos::Experimental::SYCLDeviceUSMSpace> {
  // SYCLSharedUSMSpace::execution_space == SYCLDeviceUSMSpace::execution_space
  // Can access SYCLSharedUSMSpace from Host but cannot access
  // SYCLDeviceUSMSpace from Host
  enum : bool { assignable = false };

  // SYCLSharedUSMSpace::execution_space can access SYCLDeviceUSMSpace
  enum : bool { accessible = true };
  enum : bool { deepcopy = true };
};

template <>
struct MemorySpaceAccess<
    Kokkos::Experimental::SYCLDeviceUSMSpace,
    Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
  enum : bool { assignable = false };
  enum : bool { accessible = true };
  enum : bool { deepcopy = false };
};

template <>
struct MemorySpaceAccess<
    Kokkos::Experimental::SYCLSharedUSMSpace,
    Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
  enum : bool { assignable = false };
  enum : bool { accessible = true };
  enum : bool { deepcopy = false };
};

}  // namespace Impl

namespace Impl {

// Allocation-tracking record for device USM memory; "HostInaccessible"
// because the record header cannot be read directly from host code.
template <>
class SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>
    : public HostInaccessibleSharedAllocationRecordCommon<
          Kokkos::Experimental::SYCLDeviceUSMSpace> {
 private:
  friend class SharedAllocationRecordCommon<
      Kokkos::Experimental::SYCLDeviceUSMSpace>;
  friend class HostInaccessibleSharedAllocationRecordCommon<
      Kokkos::Experimental::SYCLDeviceUSMSpace>;
  using base_t = HostInaccessibleSharedAllocationRecordCommon<
      Kokkos::Experimental::SYCLDeviceUSMSpace>;
  using RecordBase = SharedAllocationRecord<void, void>;

  // Records are identity objects; never copied or moved.
  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
  SharedAllocationRecord(SharedAllocationRecord&&)      = delete;
  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
  SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete;

#ifdef KOKKOS_ENABLE_DEBUG
  // Root of the debug-mode doubly-linked list of live records.
  static RecordBase s_root_record;
#endif

  const Kokkos::Experimental::SYCLDeviceUSMSpace m_space;

 protected:
  ~SharedAllocationRecord();

  SharedAllocationRecord(
      const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space,
      const std::string& arg_label, const size_t arg_alloc_size,
      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
};

// Allocation-tracking record for shared USM memory (host-accessible header).
template <>
class SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>
    : public SharedAllocationRecordCommon<
          Kokkos::Experimental::SYCLSharedUSMSpace> {
 private:
  friend class SharedAllocationRecordCommon<
      Kokkos::Experimental::SYCLSharedUSMSpace>;
  using base_t =
      SharedAllocationRecordCommon<Kokkos::Experimental::SYCLSharedUSMSpace>;
  using RecordBase = SharedAllocationRecord<void, void>;

  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
  SharedAllocationRecord(SharedAllocationRecord&&)      = delete;
  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
  SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete;

  static RecordBase s_root_record;

  const Kokkos::Experimental::SYCLSharedUSMSpace m_space;

 protected:
  ~SharedAllocationRecord();

  SharedAllocationRecord() = default;

  SharedAllocationRecord(
      const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space,
      const std::string& arg_label, const size_t arg_alloc_size,
      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
};

}  // namespace Impl

}  // namespace Kokkos

#endif
#endif
diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2eebf5365e71d2c5cf42c356951ccec9d041fe14
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -0,0 +1,172 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_SCRATCHSPACE_HPP
#define KOKKOS_SCRATCHSPACE_HPP

#include <cstdio>
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Concepts.hpp>

/*--------------------------------------------------------------------------*/

namespace Kokkos {

/** \brief Scratch memory space associated with an execution space.
 *
 * Bump-pointer allocator over two pre-sized scratch arenas (level 0 and
 * level 1). Cursors are mutable so allocation works from const team members.
 */
template <class ExecSpace>
class ScratchMemorySpace {
  static_assert(
      is_execution_space<ExecSpace>::value,
      "Instantiating ScratchMemorySpace on non-execution-space type.");

 public:
  // Alignment of memory chunks returned by 'get'
  // must be a power of two
  enum { ALIGN = 8 };

 private:
  // Per-level allocation cursor and arena end pointer.
  mutable char* m_iter_L0 = nullptr;
  mutable char* m_iter_L1 = nullptr;
  char* m_end_L0          = nullptr;
  char* m_end_L1          = nullptr;

  // Team/thread partitioning state set by set_team_thread_mode():
  // each request advances the cursor by multiplier*size and the caller's
  // slot within that span is offset*size.
  mutable int m_multiplier    = 0;
  mutable int m_offset        = 0;
  mutable int m_default_level = 0;

  enum { MASK = ALIGN - 1 };  // Alignment used by View::shmem_size

 public:
  //! Tag this class as a memory space
  using memory_space    = ScratchMemorySpace<ExecSpace>;
  using execution_space = ExecSpace;
  //! This execution space preferred device_type
  using device_type = Kokkos::Device<execution_space, memory_space>;

  using array_layout = typename ExecSpace::array_layout;
  using size_type    = typename ExecSpace::size_type;

  static constexpr const char* name() { return "ScratchMemorySpace"; }

  // Round size up to the next multiple of ALIGN.
  template <typename IntType>
  KOKKOS_INLINE_FUNCTION static IntType align(const IntType& size) {
    return (size + MASK) & ~MASK;
  }

  // Allocate 'size' bytes with default ALIGN alignment.
  template <typename IntType>
  KOKKOS_INLINE_FUNCTION void* get_shmem(const IntType& size,
                                         int level = -1) const {
    return get_shmem_common</*aligned*/ false>(size, 1, level);
  }

  // Allocate 'size' bytes aligned to the caller-supplied alignment.
  template <typename IntType>
  KOKKOS_INLINE_FUNCTION void* get_shmem_aligned(const IntType& size,
                                                 const ptrdiff_t alignment,
                                                 int level = -1) const {
    return get_shmem_common</*aligned*/ true>(size, alignment, level);
  }

 private:
  // Carve the next chunk out of the level-0 or level-1 arena. Advances the
  // cursor by multiplier*(padded size) and returns the slot at offset*(padded
  // size) within that span; on overflow the cursor is restored and nullptr is
  // returned.
  template <bool aligned, typename IntType>
  KOKKOS_INLINE_FUNCTION void* get_shmem_common(const IntType& size,
                                                const ptrdiff_t alignment,
                                                int level = -1) const {
    if (level == -1) level = m_default_level;
    auto& m_iter   = (level == 0) ? m_iter_L0 : m_iter_L1;
    auto& m_end    = (level == 0) ? m_end_L0 : m_end_L1;
    char* previous = m_iter;
    const ptrdiff_t missalign = size_t(m_iter) % alignment;
    if (missalign) m_iter += alignment - missalign;

    void* tmp = m_iter + m_offset * (aligned ? size : align(size));
    if (m_end < (m_iter += (aligned ? size : align(size)) * m_multiplier)) {
      m_iter = previous;  // put it back like it was
#ifdef KOKKOS_ENABLE_DEBUG
      // mfh 23 Jun 2015: printf call consumes 25 registers
      // in a CUDA build, so only print in debug mode. The
      // function still returns nullptr if not enough memory.
      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
          "ScratchMemorySpace<...>::get_shmem: Failed to allocate "
          "%ld byte(s); remaining capacity is %ld byte(s)\n",
          long(size), long(m_end - m_iter));
#endif  // KOKKOS_ENABLE_DEBUG
      tmp = nullptr;
    }
    return tmp;
  }

 public:
  KOKKOS_DEFAULTED_FUNCTION
  ScratchMemorySpace() = default;

  // Wrap pre-allocated arenas; ptr_L1/size_L1 default to an empty level-1.
  template <typename IntType>
  KOKKOS_INLINE_FUNCTION ScratchMemorySpace(void* ptr_L0,
                                            const IntType& size_L0,
                                            void* ptr_L1 = nullptr,
                                            const IntType& size_L1 = 0)
      : m_iter_L0((char*)ptr_L0),
        m_iter_L1((char*)ptr_L1),
        m_end_L0((char*)ptr_L0 + size_L0),
        m_end_L1((char*)ptr_L1 + size_L1),
        m_multiplier(1),
        m_offset(0),
        m_default_level(0) {}

  // Configure how subsequent requests partition the arena among team members.
  KOKKOS_INLINE_FUNCTION
  const ScratchMemorySpace& set_team_thread_mode(const int& level,
                                                 const int& multiplier,
                                                 const int& offset) const {
    m_default_level = level;
    m_multiplier    = multiplier;
    m_offset        = offset;
    return *this;
  }
};

}  // namespace Kokkos

#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Serial.hpp b/packages/kokkos/core/src/Kokkos_Serial.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d5bb2410bfaabf6f752acf55795c9d7ef82016d
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Serial.hpp
@@ -0,0 +1,1099 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2.
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Serial.hpp +/// \brief Declaration and definition of Kokkos::Serial device. 

#ifndef KOKKOS_SERIAL_HPP
#define KOKKOS_SERIAL_HPP

#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_SERIAL)

#include <cstddef>
#include <iosfwd>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskScheduler.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Tools.hpp>
#include <impl/Kokkos_ExecSpaceInitializer.hpp>

#include <KokkosExp_MDRangePolicy.hpp>

#include <Kokkos_UniqueToken.hpp>

namespace Kokkos {

/// \class Serial
/// \brief Kokkos device for non-parallel execution
///
/// A "device" represents a parallel execution model.  It tells Kokkos
/// how to parallelize the execution of kernels in a parallel_for or
/// parallel_reduce.  For example, the Threads device uses Pthreads or
/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
/// extensions, and the Cuda device uses NVIDIA's CUDA programming
/// model.  The Serial device executes "parallel" kernels
/// sequentially.  This is useful if you really do not want to use
/// threads, or if you want to explore different combinations of MPI
/// and shared-memory parallel programming models.
class Serial {
 public:
  //! \name Type declarations that all Kokkos devices must provide.
  //@{

  //! Tag this class as an execution space:
  using execution_space = Serial;
  //! This device's preferred memory space.
  using memory_space = Kokkos::HostSpace;
  //! The size_type alias best suited for this device.
  using size_type = memory_space::size_type;
  //! This execution space preferred device_type
  using device_type = Kokkos::Device<execution_space, memory_space>;

  //! This device's preferred array layout.
  using array_layout = LayoutRight;

  /// \brief Scratch memory space
  using scratch_memory_space = ScratchMemorySpace<Kokkos::Serial>;

  //@}

  /// \brief True if and only if this method is being called in a
  /// thread-parallel function.
  ///
  /// For the Serial device, this method <i>always</i> returns false,
  /// because parallel_for or parallel_reduce with the Serial device
  /// always execute sequentially.
  inline static int in_parallel() { return false; }

  /// \brief Wait until all dispatched functors complete.
  ///
  /// The parallel_for or parallel_reduce dispatch of a functor may
  /// return asynchronously, before the functor completes.  This
  /// method does not return until all dispatched functors on this
  /// device have completed.  Trivially a no-op for Serial: dispatch
  /// is synchronous.
  static void impl_static_fence() {}

  void fence() const {}

  /** \brief  Return the maximum amount of concurrency.  */
  static int concurrency() { return 1; }

  //! Print configuration information to the given output stream.
  static void print_configuration(std::ostream&,
                                  const bool /* detail */ = false) {}

  static void impl_initialize();

  static bool impl_is_initialized();

  //! Free any resources being consumed by the device.
  static void impl_finalize();

  //--------------------------------------------------------------------------

  // Single-threaded space: pool size is 1, rank is always 0.
  inline static int impl_thread_pool_size(int = 0) { return 1; }
  KOKKOS_INLINE_FUNCTION static int impl_thread_pool_rank() { return 0; }

  //--------------------------------------------------------------------------

  KOKKOS_INLINE_FUNCTION static unsigned impl_hardware_thread_id() {
    return impl_thread_pool_rank();
  }
  inline static unsigned impl_max_hardware_threads() {
    return impl_thread_pool_size(0);
  }

  uint32_t impl_instance_id() const noexcept { return 0; }

  static const char* name();
  //--------------------------------------------------------------------------
};

namespace Tools {
namespace Experimental {
template <>
struct DeviceTypeTraits<Serial> {
  static constexpr DeviceType id = DeviceType::Serial;
};
}  // namespace Experimental
}  // namespace Tools

namespace Impl {

// Hooks Serial into Kokkos::initialize/finalize dispatch.
class SerialSpaceInitializer : public ExecSpaceInitializerBase {
 public:
  SerialSpaceInitializer()  = default;
  ~SerialSpaceInitializer() = default;
  void initialize(const InitArguments& args) final;
  void finalize(const bool) final;
  void fence() final;
  void print_configuration(std::ostream& msg, const bool detail) final;
};

}  // namespace Impl
}  // namespace Kokkos

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Impl {

template <>
struct MemorySpaceAccess<Kokkos::Serial::memory_space,
                         Kokkos::Serial::scratch_memory_space> {
  enum : bool { assignable = false };
  enum : bool { accessible = true };
  enum : bool { deepcopy = false };
};

}  // namespace Impl
}  // namespace Kokkos

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Impl
{

// Resize thread team data scratch memory
void serial_resize_thread_team_data(size_t pool_reduce_bytes,
                                    size_t team_reduce_bytes,
                                    size_t team_shared_bytes,
                                    size_t thread_local_bytes);

HostThreadTeamData* serial_get_thread_team_data();

} /* namespace Impl */
} /* namespace Kokkos */

namespace Kokkos {
namespace Impl {

/*
 * < Kokkos::Serial , WorkArgTag >
 * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial ,
 * Kokkos::DefaultExecutionSpace >::value >::type >
 *
 */
// Team policy for Serial: exactly one team member (team_size == 1); only the
// league size, chunk size, and scratch sizes are meaningful.
template <class... Properties>
class TeamPolicyInternal<Kokkos::Serial, Properties...>
    : public PolicyTraits<Properties...> {
 private:
  // Scratch requests per hierarchy level: [0] and [1].
  size_t m_team_scratch_size[2];
  size_t m_thread_scratch_size[2];
  int m_league_size;
  int m_chunk_size;

 public:
  //! Tag this class as a kokkos execution policy
  using execution_policy = TeamPolicyInternal;

  using traits = PolicyTraits<Properties...>;

  //! Execution space of this execution policy:
  using execution_space = Kokkos::Serial;

  const typename traits::execution_space& space() const {
    // Serial is stateless, so a shared static instance suffices.
    static typename traits::execution_space m_space;
    return m_space;
  }

  template <class ExecSpace, class... OtherProperties>
  friend class TeamPolicyInternal;

  // Converting "copy" from a policy with different properties.
  template <class... OtherProperties>
  TeamPolicyInternal(
      const TeamPolicyInternal<Kokkos::Serial, OtherProperties...>& p) {
    m_league_size            = p.m_league_size;
    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
    m_chunk_size             = p.m_chunk_size;
  }

  //----------------------------------------
  // All team-size queries answer 1: Serial runs one member per team.

  template <class FunctorType>
  int team_size_max(const FunctorType&, const ParallelForTag&) const {
    return 1;
  }
  template <class FunctorType>
  int team_size_max(const FunctorType&, const ParallelReduceTag&) const {
    return 1;
  }
  template <class FunctorType, class ReducerType>
  int team_size_max(const FunctorType&, const ReducerType&,
                    const ParallelReduceTag&) const {
    return 1;
  }
  template <class FunctorType>
  int team_size_recommended(const FunctorType&, const ParallelForTag&) const {
    return 1;
  }
  template <class FunctorType>
  int team_size_recommended(const FunctorType&,
                            const ParallelReduceTag&) const {
    return 1;
  }
  template <class FunctorType, class ReducerType>
  int team_size_recommended(const FunctorType&, const ReducerType&,
                            const ParallelReduceTag&) const {
    return 1;
  }

  //----------------------------------------

  inline int team_size() const { return 1; }
  inline bool impl_auto_team_size() const { return false; }
  inline bool impl_auto_vector_length() const { return false; }
  inline void impl_set_team_size(size_t) {}
  inline void impl_set_vector_length(size_t) {}
  inline int league_size() const { return m_league_size; }
  inline size_t scratch_size(const int& level, int = 0) const {
    return m_team_scratch_size[level] + m_thread_scratch_size[level];
  }

  inline int impl_vector_length() const { return 1; }
  inline static int vector_length_max() {
    return 1024;
  }  // Use arbitrary large number, is meant as a vectorizable length

  inline static int scratch_size_max(int level) {
    return (level == 0 ? 1024 * 32 : 20 * 1024 * 1024);
  }
  /** \brief  Specify league size, request team size */
  TeamPolicyInternal(const execution_space&, int league_size_request,
                     int team_size_request, int /* vector_length_request */ = 1)
      : m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_league_size(league_size_request),
        m_chunk_size(32) {
    // Serial supports only team_size == 1 (AUTO arrives here as -1).
    if (team_size_request > 1)
      Kokkos::abort("Kokkos::abort: Requested Team Size is too large!");
  }

  // Remaining constructors normalize AUTO / omitted arguments and delegate
  // to the primary constructor above.
  TeamPolicyInternal(const execution_space& space, int league_size_request,
                     const Kokkos::AUTO_t& /**team_size_request*/,
                     int vector_length_request = 1)
      : TeamPolicyInternal(space, league_size_request, -1,
                           vector_length_request) {}

  TeamPolicyInternal(const execution_space& space, int league_size_request,
                     const Kokkos::AUTO_t& /* team_size_request */
                     ,
                     const Kokkos::AUTO_t& /* vector_length_request */
                     )
      : TeamPolicyInternal(space, league_size_request, -1, -1) {}

  TeamPolicyInternal(const execution_space& space, int league_size_request,
                     int team_size_request,
                     const Kokkos::AUTO_t& /* vector_length_request */
                     )
      : TeamPolicyInternal(space, league_size_request, team_size_request, -1) {}

  TeamPolicyInternal(int league_size_request,
                     const Kokkos::AUTO_t& team_size_request,
                     int vector_length_request = 1)
      : TeamPolicyInternal(typename traits::execution_space(),
                           league_size_request, team_size_request,
                           vector_length_request) {}

  TeamPolicyInternal(int league_size_request,
                     const Kokkos::AUTO_t& team_size_request,
                     const Kokkos::AUTO_t& vector_length_request)
      : TeamPolicyInternal(typename traits::execution_space(),
                           league_size_request, team_size_request,
                           vector_length_request) {}
  TeamPolicyInternal(int league_size_request, int team_size_request,
                     const Kokkos::AUTO_t& vector_length_request)
      : TeamPolicyInternal(typename traits::execution_space(),
                           league_size_request, team_size_request,
                           vector_length_request) {}

  TeamPolicyInternal(int league_size_request, int team_size_request,
                     int vector_length_request = 1)
      : TeamPolicyInternal(typename traits::execution_space(),
                           league_size_request, team_size_request,
                           vector_length_request) {}

  inline int chunk_size() const { return m_chunk_size; }

  /** \brief set chunk_size to a discrete value*/
  inline TeamPolicyInternal& set_chunk_size(
      typename traits::index_type chunk_size_) {
    m_chunk_size = chunk_size_;
    return *this;
  }

  /** \brief set per team scratch size for a specific level of the scratch
   * hierarchy */
  inline TeamPolicyInternal& set_scratch_size(const int& level,
                                              const PerTeamValue& per_team) {
    m_team_scratch_size[level] = per_team.value;
    return *this;
  }

  /** \brief set per thread scratch size for a specific level of the scratch
   * hierarchy */
  inline TeamPolicyInternal& set_scratch_size(
      const int& level, const PerThreadValue& per_thread) {
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }

  /** \brief set per thread and per team scratch size for a specific level of
   * the scratch hierarchy */
  inline TeamPolicyInternal& set_scratch_size(
      const int& level, const PerTeamValue& per_team,
      const PerThreadValue& per_thread) {
    m_team_scratch_size[level]   = per_team.value;
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }

  using member_type = Impl::HostThreadTeamMember<Kokkos::Serial>;
};
} /* namespace Impl */
} /* namespace Kokkos */

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Parallel patterns for Kokkos::Serial with RangePolicy */

namespace Kokkos {
namespace Impl {

// parallel_for over a RangePolicy on Serial: a plain sequential loop.
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Serial> {
 private:
  using Policy = Kokkos::RangePolicy<Traits...>;

  const FunctorType m_functor;
  const Policy m_policy;

  // Untagged functor: invoke as m_functor(i).
  template <class TagType>
  typename std::enable_if<std::is_same<TagType, void>::value>::type exec()
      const {
    const typename Policy::member_type e = m_policy.end();
    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
      m_functor(i);
    }
  }

  // Tagged functor: invoke as m_functor(tag, i).
  template <class TagType>
  typename std::enable_if<!std::is_same<TagType, void>::value>::type exec()
      const {
    const TagType t{};
    const typename Policy::member_type e = m_policy.end();
    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
      m_functor(t, i);
    }
  }

 public:
  inline void execute() const {
    this->template exec<typename Policy::work_tag>();
  }

  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_functor(arg_functor), m_policy(arg_policy) {}
};

/*--------------------------------------------------------------------------*/

template <class FunctorType, class ReducerType, class...
    Traits>
// Sequential parallel_reduce over a 1-D RangePolicy. When no explicit
// reducer is supplied (ReducerType == InvalidType) the functor itself
// provides init/join/final.
class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                     Kokkos::Serial> {
 private:
  using Policy  = Kokkos::RangePolicy<Traits...>;
  using WorkTag = typename Policy::work_tag;

  // Select functor-as-reducer vs. explicit reducer.
  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;

  using ReducerTypeFwd = typename ReducerConditional::type;
  // The work tag is only forwarded when the functor is the reducer; an
  // explicit reducer's hooks are invoked untagged.
  using WorkTagFwd =
      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                         void>;

  using Analysis =
      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;

  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;

  using pointer_type   = typename Analysis::pointer_type;
  using reference_type = typename Analysis::reference_type;

  const FunctorType m_functor;
  const Policy m_policy;
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;

  // Untagged sequential reduction loop.
  template <class TagType>
  inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec(
      reference_type update) const {
    const typename Policy::member_type e = m_policy.end();
    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
      m_functor(i, update);
    }
  }

  // Tagged sequential reduction loop.
  template <class TagType>
  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
  exec(reference_type update) const {
    const TagType t{};

    const typename Policy::member_type e = m_policy.end();
    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
      m_functor(t, i, update);
    }
  }

 public:
  inline void execute() const {
    // Grow (never shrink) the shared thread-team data so the pool-local
    // scratch can hold the reduction value.
    const size_t pool_reduce_size =
        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
    const size_t team_reduce_size  = 0;  // Never shrinks
    const size_t team_shared_size  = 0;  // Never shrinks
    const size_t thread_local_size = 0;  // Never shrinks

    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
                                   team_shared_size, thread_local_size);

    HostThreadTeamData& data = *serial_get_thread_team_data();

    // Reduce directly into the caller's result if one was supplied,
    // otherwise into the pool-local scratch value.
    pointer_type ptr =
        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());

    reference_type update =
        ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);

    this->template exec<WorkTag>(update);

    // Apply the final() hook (if any) to the accumulated value in place.
    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
        ReducerConditional::select(m_functor, m_reducer), ptr);
  }

  // Constructor for the result-in-a-View form (no explicit reducer).
  template <class HostViewType>
  ParallelReduce(
      const FunctorType& arg_functor, const Policy& arg_policy,
      const HostViewType& arg_result_view,
      typename std::enable_if<Kokkos::is_view<HostViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void*>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result_view.data()) {
    static_assert(Kokkos::is_view<HostViewType>::value,
                  "Kokkos::Serial reduce result must be a View");

    static_assert(
        Kokkos::Impl::MemorySpaceAccess<typename HostViewType::memory_space,
                                        Kokkos::HostSpace>::accessible,
        "Kokkos::Serial reduce result must be a View in HostSpace");
  }

  // Constructor for the explicit-reducer form.
  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                        const ReducerType& reducer)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()) {
    // NOTE(review): the disabled check below says "OpenMP" but this is the
    // Serial backend — presumably copied from the OpenMP implementation.
    /*static_assert( std::is_same< typename ViewType::memory_space
                    , Kokkos::HostSpace >::value
      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
      );*/
  }
};

/*--------------------------------------------------------------------------*/

template <class FunctorType, class...
Traits> +class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Serial> { + private: + using Policy = Kokkos::RangePolicy<Traits...>; + using WorkTag = typename Policy::work_tag; + + using Analysis = + FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>; + + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + + template <class TagType> + inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec( + reference_type update) const { + const typename Policy::member_type e = m_policy.end(); + for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { + m_functor(i, update, true); + } + } + + template <class TagType> + inline typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec(reference_type update) const { + const TagType t{}; + const typename Policy::member_type e = m_policy.end(); + for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { + m_functor(t, i, update, true); + } + } + + public: + inline void execute() const { + const size_t pool_reduce_size = Analysis::value_size(m_functor); + const size_t team_reduce_size = 0; // Never shrinks + const size_t team_shared_size = 0; // Never shrinks + const size_t thread_local_size = 0; // Never shrinks + + serial_resize_thread_team_data(pool_reduce_size, team_reduce_size, + team_shared_size, thread_local_size); + + HostThreadTeamData& data = *serial_get_thread_team_data(); + + reference_type update = + ValueInit::init(m_functor, pointer_type(data.pool_reduce_local())); + + this->template exec<WorkTag>(update); + } + + inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +/*--------------------------------------------------------------------------*/ 
// Sequential scan that additionally returns the total (the scan value after
// the last element) through the caller-provided arg_returnvalue reference.
template <class FunctorType, class ReturnType, class... Traits>
class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                            ReturnType, Kokkos::Serial> {
 private:
  using Policy  = Kokkos::RangePolicy<Traits...>;
  using WorkTag = typename Policy::work_tag;

  using Analysis =
      FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;

  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;

  using pointer_type   = typename Analysis::pointer_type;
  using reference_type = typename Analysis::reference_type;

  const FunctorType m_functor;
  const Policy m_policy;
  // Destination for the scan total, written at the end of execute().
  ReturnType& m_returnvalue;

  // Untagged scan loop (single final pass, as in ParallelScan).
  template <class TagType>
  inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec(
      reference_type update) const {
    const typename Policy::member_type e = m_policy.end();
    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
      m_functor(i, update, true);
    }
  }

  // Tagged scan loop.
  template <class TagType>
  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
  exec(reference_type update) const {
    const TagType t{};
    const typename Policy::member_type e = m_policy.end();
    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
      m_functor(t, i, update, true);
    }
  }

 public:
  // Non-const: writes the total into m_returnvalue.
  inline void execute() {
    const size_t pool_reduce_size  = Analysis::value_size(m_functor);
    const size_t team_reduce_size  = 0;  // Never shrinks
    const size_t team_shared_size  = 0;  // Never shrinks
    const size_t thread_local_size = 0;  // Never shrinks

    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
                                   team_shared_size, thread_local_size);

    HostThreadTeamData& data = *serial_get_thread_team_data();

    reference_type update =
        ValueInit::init(m_functor, pointer_type(data.pool_reduce_local()));

    this->template exec<WorkTag>(update);

    // After the single pass, 'update' holds the inclusive total.
    m_returnvalue = update;
  }

  inline ParallelScanWithTotal(const FunctorType& arg_functor,
                               const Policy& arg_policy,
                               ReturnType& arg_returnvalue)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_returnvalue(arg_returnvalue) {}
};

}  // namespace Impl
}  // namespace Kokkos

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Parallel patterns for Kokkos::Serial with MDRangePolicy */

namespace Kokkos {
namespace Impl {

// parallel_for over an MDRangePolicy: iterate serially over tile indices and
// let HostIterateTile expand each tile into the multidimensional loop nest.
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                  Kokkos::Serial> {
 private:
  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
  // 1-D policy over tile indices, derived from the MDRange policy.
  using Policy = typename MDRangePolicy::impl_range_policy;

  using iterate_type = typename Kokkos::Impl::HostIterateTile<
      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;

  const FunctorType m_functor;
  const MDRangePolicy m_mdr_policy;
  const Policy m_policy;

  // Sequential loop over tiles; iterate_type handles the intra-tile nest.
  void exec() const {
    const typename Policy::member_type e = m_policy.end();
    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
      iterate_type(m_mdr_policy, m_functor)(i);
    }
  }

 public:
  inline void execute() const { this->exec(); }
  template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy&, const Functor&) {
    /**
     * 1024 here is just our guess for a reasonable max tile size,
     * it isn't a hardware constraint. If people see a use for larger
     * tile size products, we're happy to change this.
     */
    return 1024;
  }
  inline ParallelFor(const FunctorType& arg_functor,
                     const MDRangePolicy& arg_policy)
      : m_functor(arg_functor),
        m_mdr_policy(arg_policy),
        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
};

template <class FunctorType, class ReducerType, class...
    Traits>
// parallel_reduce over an MDRangePolicy: serial tile loop, reducing through
// a HostIterateTile that threads the reduction reference into each tile.
class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                     Kokkos::Serial> {
 private:
  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
  // 1-D policy over tile indices.
  using Policy = typename MDRangePolicy::impl_range_policy;

  using WorkTag = typename MDRangePolicy::work_tag;

  // Functor doubles as the reducer when no explicit reducer was given.
  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  // Tag is forwarded only in the functor-as-reducer case.
  using WorkTagFwd =
      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                         void>;

  using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
                                   MDRangePolicy, FunctorType>;

  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;

  using pointer_type   = typename Analysis::pointer_type;
  using value_type     = typename Analysis::value_type;
  using reference_type = typename Analysis::reference_type;

  using iterate_type =
      typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
                                             WorkTag, reference_type>;

  const FunctorType m_functor;
  const MDRangePolicy m_mdr_policy;
  const Policy m_policy;
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;

  // Sequential reduction loop over tiles.
  inline void exec(reference_type update) const {
    const typename Policy::member_type e = m_policy.end();
    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
      iterate_type(m_mdr_policy, m_functor, update)(i);
    }
  }

 public:
  template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy&, const Functor&) {
    /**
     * 1024 here is just our guess for a reasonable max tile size,
     * it isn't a hardware constraint. If people see a use for larger
     * tile size products, we're happy to change this.
     */
    return 1024;
  }
  inline void execute() const {
    // Grow (never shrink) shared pool scratch for the reduction value.
    const size_t pool_reduce_size =
        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
    const size_t team_reduce_size  = 0;  // Never shrinks
    const size_t team_shared_size  = 0;  // Never shrinks
    const size_t thread_local_size = 0;  // Never shrinks

    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
                                   team_shared_size, thread_local_size);

    HostThreadTeamData& data = *serial_get_thread_team_data();

    // Reduce into the caller's result if supplied, else pool-local scratch.
    pointer_type ptr =
        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());

    reference_type update =
        ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);

    this->exec(update);

    // Apply final() (if any) to the accumulated value in place.
    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
        ReducerConditional::select(m_functor, m_reducer), ptr);
  }

  // Constructor for the result-in-a-View form (no explicit reducer).
  template <class HostViewType>
  ParallelReduce(
      const FunctorType& arg_functor, const MDRangePolicy& arg_policy,
      const HostViewType& arg_result_view,
      typename std::enable_if<Kokkos::is_view<HostViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void*>::type = nullptr)
      : m_functor(arg_functor),
        m_mdr_policy(arg_policy),
        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result_view.data()) {
    static_assert(Kokkos::is_view<HostViewType>::value,
                  "Kokkos::Serial reduce result must be a View");

    static_assert(
        Kokkos::Impl::MemorySpaceAccess<typename HostViewType::memory_space,
                                        Kokkos::HostSpace>::accessible,
        "Kokkos::Serial reduce result must be a View in HostSpace");
  }

  // Constructor for the explicit-reducer form.
  inline ParallelReduce(const FunctorType& arg_functor,
                        MDRangePolicy arg_policy, const ReducerType& reducer)
      : m_functor(arg_functor),
        m_mdr_policy(arg_policy),
        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()) {
    /*static_assert( std::is_same< typename ViewType::memory_space
                    , Kokkos::HostSpace >::value
      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
      );*/
  }
};

}  // namespace Impl
}  // namespace Kokkos

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Parallel patterns for Kokkos::Serial with TeamPolicy */

namespace Kokkos {
namespace Impl {

// parallel_for over a TeamPolicy: each league member (team of size 1) runs
// sequentially on the calling thread.
template <class FunctorType, class... Properties>
class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                  Kokkos::Serial> {
 private:
  // Fixed scratch reserved for intra-team reductions.
  enum { TEAM_REDUCE_SIZE = 512 };

  using Policy = TeamPolicyInternal<Kokkos::Serial, Properties...>;
  using Member = typename Policy::member_type;

  const FunctorType m_functor;
  const int m_league;
  const int m_shared;

  // Untagged: invoke the functor once per league rank.
  template <class TagType>
  inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec(
      HostThreadTeamData& data) const {
    for (int ileague = 0; ileague < m_league; ++ileague) {
      m_functor(Member(data, ileague, m_league));
    }
  }

  // Tagged variant.
  template <class TagType>
  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
  exec(HostThreadTeamData& data) const {
    const TagType t{};
    for (int ileague = 0; ileague < m_league; ++ileague) {
      m_functor(t, Member(data, ileague, m_league));
    }
  }

 public:
  inline void execute() const {
    // Grow (never shrink) shared thread-team data to hold team-reduce and
    // team-shared scratch for this launch.
    const size_t pool_reduce_size  = 0;  // Never shrinks
    const size_t team_reduce_size  = TEAM_REDUCE_SIZE;
    const size_t team_shared_size  = m_shared;
    const size_t thread_local_size = 0;  // Never shrinks

    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
                                   team_shared_size, thread_local_size);

    HostThreadTeamData& data = *serial_get_thread_team_data();

    this->template exec<typename Policy::work_tag>(data);
  }

  // m_shared: policy-requested scratch (both levels) plus the functor's own
  // shmem request for team size 1.
  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_functor(arg_functor),
        m_league(arg_policy.league_size()),
        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                 FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {}
};

/*--------------------------------------------------------------------------*/

// parallel_reduce over a TeamPolicy: reduction variant of the above.
template <class FunctorType, class ReducerType, class... Properties>
class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                     ReducerType, Kokkos::Serial> {
 private:
  enum { TEAM_REDUCE_SIZE = 512 };

  using Policy = TeamPolicyInternal<Kokkos::Serial, Properties...>;

  using Analysis =
      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;

  using Member  = typename Policy::member_type;
  using WorkTag = typename Policy::work_tag;

  // Functor doubles as reducer when no explicit reducer is given.
  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  using WorkTagFwd =
      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                         void>;

  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;

  using pointer_type   = typename Analysis::pointer_type;
  using reference_type = typename Analysis::reference_type;

  const FunctorType m_functor;
  const int m_league;
  const ReducerType m_reducer;
  pointer_type m_result_ptr;
  const int m_shared;

  // Untagged: one functor call per league rank, accumulating into 'update'.
  template <class TagType>
  inline typename std::enable_if<std::is_same<TagType, void>::value>::type exec(
      HostThreadTeamData& data, reference_type update) const {
    for (int ileague = 0; ileague < m_league; ++ileague) {
      m_functor(Member(data, ileague, m_league), update);
    }
  }

  // Tagged variant.
  template <class TagType>
  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
  exec(HostThreadTeamData& data, reference_type update) const {
    const TagType t{};

    for (int ileague = 0; ileague < m_league; ++ileague) {
      m_functor(t, Member(data, ileague, m_league), update);
    }
  }

 public:
  inline void execute() const {
    const size_t
        pool_reduce_size =
        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));

    const size_t team_reduce_size  = TEAM_REDUCE_SIZE;
    const size_t team_shared_size  = m_shared;
    const size_t thread_local_size = 0;  // Never shrinks

    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
                                   team_shared_size, thread_local_size);

    HostThreadTeamData& data = *serial_get_thread_team_data();

    // Reduce into the user's view if one was given, else pool-local scratch.
    pointer_type ptr =
        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());

    reference_type update =
        ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);

    this->template exec<WorkTag>(data, update);

    // Apply final() (if any) to the accumulated value in place.
    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
        ReducerConditional::select(m_functor, m_reducer), ptr);
  }

  // Constructor for the result-in-a-View form (no explicit reducer).
  template <class ViewType>
  ParallelReduce(
      const FunctorType& arg_functor, const Policy& arg_policy,
      const ViewType& arg_result,
      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void*>::type = nullptr)
      : m_functor(arg_functor),
        m_league(arg_policy.league_size()),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result.data()),
        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                 FunctorTeamShmemSize<FunctorType>::value(m_functor, 1)) {
    static_assert(Kokkos::is_view<ViewType>::value,
                  "Reduction result on Kokkos::Serial must be a Kokkos::View");

    static_assert(
        Kokkos::Impl::MemorySpaceAccess<typename ViewType::memory_space,
                                        Kokkos::HostSpace>::accessible,
        "Reduction result on Kokkos::Serial must be a Kokkos::View in "
        "HostSpace");
  }

  // Constructor for the explicit-reducer form.
  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                        const ReducerType& reducer)
      : m_functor(arg_functor),
        m_league(arg_policy.league_size()),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()),
        m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                 FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {
    // NOTE(review): disabled check mentions OpenMP in the Serial backend —
    // presumably copied from the OpenMP implementation.
    /*static_assert( std::is_same< typename ViewType::memory_space
                    , Kokkos::HostSpace >::value
      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
      );*/
  }
};

}  // namespace Impl
}  // namespace Kokkos

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Experimental {

// Serial-instance UniqueToken: with a single thread of execution there is
// exactly one token (value 0), so acquire/release are trivial.
template <>
class UniqueToken<Serial, UniqueTokenScope::Instance> {
 public:
  using execution_space = Serial;
  using size_type       = int;

  /// \brief create object size for concurrency on the given instance
  ///
  /// This object should not be shared between instances
  UniqueToken(execution_space const& = execution_space()) noexcept {}

  /// \brief create object size for requested size on given instance
  ///
  /// It is the users responsibility to only acquire size tokens concurrently
  UniqueToken(size_type, execution_space const& = execution_space()) {}

  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
  KOKKOS_INLINE_FUNCTION
  int size() const noexcept { return 1; }

  /// \brief acquire value such that 0 <= value < size()
  KOKKOS_INLINE_FUNCTION
  int acquire() const noexcept { return 0; }

  /// \brief release a value acquired by generate
  KOKKOS_INLINE_FUNCTION
  void release(int) const noexcept {}
};

// Global-scope UniqueToken: identical trivial behavior for Serial.
template <>
class UniqueToken<Serial, UniqueTokenScope::Global> {
 public:
  using execution_space = Serial;
  using size_type       = int;

  /// \brief create object size for concurrency on the given instance
  ///
  /// This object should not be shared between instances
  UniqueToken(execution_space const& = execution_space()) noexcept {}

  /// \brief upper bound for acquired values, i.e.
0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept { return 1; } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept { return 0; } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release(int) const noexcept {} +}; + +} // namespace Experimental +} // namespace Kokkos + +#include <impl/Kokkos_Serial_Task.hpp> + +#endif // defined( KOKKOS_ENABLE_SERIAL ) +#endif /* #define KOKKOS_SERIAL_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp b/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..91e079a0e78e314cdb4b22a42876564f25143a4c --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp @@ -0,0 +1,47 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// For backward compatibility: + +#include <Kokkos_TaskScheduler.hpp> diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp new file mode 100644 index 0000000000000000000000000000000000000000..743273670c9b5fa77f6d590596eb27fc7204396a --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp @@ -0,0 +1,710 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TASKSCHEDULER_HPP +#define KOKKOS_TASKSCHEDULER_HPP + +//---------------------------------------------------------------------------- + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_TaskScheduler_fwd.hpp> +//---------------------------------------------------------------------------- + +#include <Kokkos_MemoryPool.hpp> +#include <impl/Kokkos_Tags.hpp> + +#include <Kokkos_Future.hpp> +#include <impl/Kokkos_TaskQueue.hpp> +#include <impl/Kokkos_SingleTaskQueue.hpp> +#include <impl/Kokkos_TaskQueueMultiple.hpp> +#include <impl/Kokkos_TaskPolicyData.hpp> +#include <impl/Kokkos_TaskTeamMember.hpp> +#include <impl/Kokkos_SimpleTaskScheduler.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +template <class, class> +class TaskExec; + +} // end namespace Impl + +template <class ExecSpace, class QueueType> +class BasicTaskScheduler : public Impl::TaskSchedulerBase { + public: + using scheduler_type = BasicTaskScheduler; + using execution_space = ExecSpace; + using queue_type = QueueType; + using memory_space = typename queue_type::memory_space; + using memory_pool = typename queue_type::memory_pool; + using specialization = Impl::TaskQueueSpecialization<BasicTaskScheduler>; + using member_type = typename specialization::member_type; + using team_scheduler_type = BasicTaskScheduler; + template <class Functor> + using runnable_task_type = + Impl::Task<scheduler_type, typename Functor::value_type, Functor>; + template <class ValueType> + using future_type = Kokkos::BasicFuture<ValueType, BasicTaskScheduler>; + template <class FunctorType> + using future_type_for_functor = future_type<typename 
      FunctorType::value_type>;

 private:
  using track_type = Kokkos::Impl::SharedAllocationTracker;
  using task_base  = Impl::TaskBase;

  // Reference-counted allocation tracker for the queue storage, plus the raw
  // queue pointer it guards.
  track_type m_track;
  queue_type* m_queue;

  //----------------------------------------

  template <typename, typename>
  friend class Impl::TaskQueue;
  template <typename>
  friend struct Impl::TaskQueueSpecialization;
  template <typename, typename>
  friend class Impl::TaskQueueSpecializationConstrained;
  template <typename, typename>
  friend class Impl::TaskTeamMemberAdapter;
  template <typename, typename>
  friend class Impl::TaskExec;

  //----------------------------------------

  // Private: adopt an existing tracker/queue pair (used by
  // get_team_scheduler below).
  KOKKOS_INLINE_FUNCTION
  BasicTaskScheduler(track_type arg_track, queue_type* arg_queue)
      : m_track(std::move(arg_track)), m_queue(std::move(arg_queue)) {}

  // Scheduler view onto the per-team queue for the given team rank; shares
  // the same allocation tracker.
  KOKKOS_INLINE_FUNCTION
  team_scheduler_type get_team_scheduler(int team_rank) const {
    return {m_track, &m_queue->get_team_queue(team_rank)};
  }

  //----------------------------------------

  // Extract the raw task pointer from a predecessor future (or nullptr).
  KOKKOS_INLINE_FUNCTION
  static constexpr task_base* _get_task_ptr(std::nullptr_t) { return nullptr; }

  template <class ValueType>
  KOKKOS_INLINE_FUNCTION static constexpr task_base* _get_task_ptr(
      future_type<ValueType>&& f) {
    return f.m_task;
  }

  // Core spawn path: allocate a task from the memory pool, construct it from
  // the functor, wire up its metadata, and schedule it. Returns a future
  // that is null when pool allocation failed.
  template <int TaskEnum, typename DepTaskType, typename FunctorType>
  KOKKOS_FUNCTION
      Kokkos::BasicFuture<typename FunctorType::value_type, scheduler_type>
      _spawn_impl(DepTaskType* arg_predecessor_task, TaskPriority arg_priority,
                  typename task_base::function_type arg_function,
                  typename task_base::destroy_type /*arg_destroy*/,
                  FunctorType&& arg_functor) {
    using functor_future_type =
        future_type_for_functor<typename std::decay<FunctorType>::type>;
    using task_type =
        Impl::Task<BasicTaskScheduler,
                   typename functor_future_type::value_type, FunctorType>;

    //----------------------------------------
    // Give single-thread back-ends an opportunity to clear
    // queue of ready tasks before allocating a new task

    // TODO @tasking @optimization DSH re-enable this, maybe?
    // specialization::iff_single_thread_recursive_execute(scheduler);

    //----------------------------------------

    functor_future_type f;

    // Allocate task from memory pool

    const size_t alloc_size =
        m_queue->template spawn_allocation_size<FunctorType>();

    void* task_storage = m_queue->allocate(alloc_size);

    if (task_storage) {
      // Placement new construction
      // Reference count starts at two:
      // +1 for the matching decrement when task is complete
      // +1 for the future
      f.m_task =
          new (task_storage) task_type(std::forward<FunctorType>(arg_functor));

      f.m_task->m_apply = arg_function;
      // f.m_task->m_destroy = arg_destroy;
      f.m_task->m_queue      = m_queue;
      f.m_task->m_next       = arg_predecessor_task;
      f.m_task->m_ref_count  = 2;
      f.m_task->m_alloc_size = alloc_size;
      f.m_task->m_task_type  = TaskEnum;
      f.m_task->m_priority   = (int16_t)arg_priority;

      // Publish the fully-initialized task before it becomes reachable to
      // other observers of the queue.
      Kokkos::memory_fence();

      // The dependence (if any) is processed immediately
      // within the schedule function, as such the dependence's
      // reference count does not need to be incremented for
      // the assignment.

      m_queue->schedule_runnable(f.m_task);
      // This task may be updated or executed at any moment,
      // even during the call to 'schedule'.
+ } + + return f; + } + + public: + KOKKOS_INLINE_FUNCTION + BasicTaskScheduler() : m_track(), m_queue(nullptr) {} + + KOKKOS_INLINE_FUNCTION + BasicTaskScheduler(BasicTaskScheduler&& rhs) noexcept + : m_track(rhs.m_track), // probably should be a move, but this is + // deprecated code anyway + m_queue(std::move(rhs.m_queue)) {} + + KOKKOS_INLINE_FUNCTION + BasicTaskScheduler(BasicTaskScheduler const& rhs) + : m_track(rhs.m_track), m_queue(rhs.m_queue) {} + + KOKKOS_INLINE_FUNCTION + BasicTaskScheduler& operator=(BasicTaskScheduler&& rhs) noexcept { + m_track = rhs.m_track; // probably should be a move, but this is deprecated + // code anyway + m_queue = std::move(rhs.m_queue); + return *this; + } + + KOKKOS_INLINE_FUNCTION + BasicTaskScheduler& operator=(BasicTaskScheduler const& rhs) { + m_track = rhs.m_track; + m_queue = rhs.m_queue; + return *this; + } + + explicit BasicTaskScheduler(memory_pool const& arg_memory_pool) noexcept + : m_track(), m_queue(nullptr) { + using record_type = + Kokkos::Impl::SharedAllocationRecord<memory_space, + typename queue_type::Destroy>; + + record_type* record = record_type::allocate( + memory_space(), "Kokkos::TaskQueue", sizeof(queue_type)); + + m_queue = new (record->data()) queue_type(arg_memory_pool); + + record->m_destroy.m_queue = m_queue; + + m_track.assign_allocated_record_to_uninitialized(record); + } + + BasicTaskScheduler(memory_space const& arg_memory_space, + size_t const mempool_capacity, + unsigned const mempool_min_block_size // = 1u << 6 + , + unsigned const mempool_max_block_size // = 1u << 10 + , + unsigned const mempool_superblock_size // = 1u << 12 + ) + : BasicTaskScheduler(memory_pool( + arg_memory_space, mempool_capacity, mempool_min_block_size, + mempool_max_block_size, mempool_superblock_size)) {} + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + queue_type& queue() const noexcept { + KOKKOS_EXPECTS(m_queue != nullptr); + return *m_queue; + } + + KOKKOS_INLINE_FUNCTION + 
memory_pool* memory() const noexcept { + return m_queue ? &(m_queue->m_memory) : (memory_pool*)0; + } + + //---------------------------------------- + /**\brief Allocation size for a spawned task */ + template <typename FunctorType> + KOKKOS_FUNCTION size_t spawn_allocation_size() const { + return m_queue->template spawn_allocation_size<FunctorType>(); + } + + /**\brief Allocation size for a when_all aggregate */ + KOKKOS_FUNCTION + size_t when_all_allocation_size(int narg) const { + return m_queue->when_all_allocation_size(narg); + } + + //---------------------------------------- + + template <int TaskEnum, typename DepFutureType, typename FunctorType> + KOKKOS_FUNCTION static Kokkos::BasicFuture<typename FunctorType::value_type, + scheduler_type> + spawn(Impl::TaskPolicyWithScheduler<TaskEnum, scheduler_type, DepFutureType>&& + arg_policy, + typename task_base::function_type arg_function, + typename task_base::destroy_type arg_destroy, + FunctorType&& arg_functor) { + return std::move(arg_policy.scheduler()) + .template _spawn_impl<TaskEnum>( + _get_task_ptr(std::move(arg_policy.predecessor())), + arg_policy.priority(), arg_function, arg_destroy, + std::forward<FunctorType>(arg_functor)); + } + + template <int TaskEnum, typename DepFutureType, typename FunctorType> + KOKKOS_FUNCTION + future_type_for_functor<typename std::decay<FunctorType>::type> + spawn( + Impl::TaskPolicyWithPredecessor<TaskEnum, DepFutureType>&& arg_policy, + FunctorType&& arg_functor) { + using task_type = runnable_task_type<FunctorType>; + typename task_type::function_type const ptr = task_type::apply; + typename task_type::destroy_type const dtor = task_type::destroy; + + return _spawn_impl<TaskEnum>( + _get_task_ptr(std::move(arg_policy).predecessor()), + arg_policy.priority(), ptr, dtor, + std::forward<FunctorType>(arg_functor)); + } + + template <typename FunctorType, typename ValueType, typename Scheduler> + KOKKOS_FUNCTION static void respawn( + FunctorType* arg_self, + 
BasicFuture<ValueType, Scheduler> const& arg_dependence, + TaskPriority const& arg_priority) { + // Precondition: task is in Executing state + + using value_type = typename FunctorType::value_type; + using task_type = Impl::Task<BasicTaskScheduler, value_type, FunctorType>; + + task_type* const task = static_cast<task_type*>(arg_self); + + task->m_priority = static_cast<int>(arg_priority); + + task->add_dependence(arg_dependence.m_task); + + // Postcondition: task is in Executing-Respawn state + } + + template <typename FunctorType> + KOKKOS_FUNCTION static void respawn(FunctorType* arg_self, + BasicTaskScheduler const&, + TaskPriority const& arg_priority) { + // Precondition: task is in Executing state + + using value_type = typename FunctorType::value_type; + using task_type = Impl::Task<BasicTaskScheduler, value_type, FunctorType>; + + task_type* const task = static_cast<task_type*>(arg_self); + + task->m_priority = static_cast<int>(arg_priority); + + task->add_dependence(nullptr); + + // Postcondition: task is in Executing-Respawn state + } + + //---------------------------------------- + /**\brief Return a future that is complete + * when all input futures are complete. + */ + template <typename ValueType> + KOKKOS_FUNCTION BasicFuture<void, scheduler_type> when_all( + BasicFuture<ValueType, BasicTaskScheduler> const arg[], int narg) { + future_type<void> f; + + if (narg) { + queue_type* q = m_queue; + + // BasicTaskScheduler const* scheduler_ptr = nullptr; + + for (int i = 0; i < narg; ++i) { + task_base* const t = arg[i].m_task; + if (nullptr != t) { + // Increment reference count to track subsequent assignment. 
+ Kokkos::atomic_increment(&(t->m_ref_count)); + if (q != static_cast<queue_type const*>(t->m_queue)) { + Kokkos::abort( + "Kokkos when_all Futures must be in the same scheduler"); + } + } + } + + if (q != nullptr) { // this should probably handle the queue == 0 case, + // but this is deprecated code anyway + + size_t const alloc_size = q->when_all_allocation_size(narg); + + f.m_task = reinterpret_cast<task_base*>(q->allocate(alloc_size)); + // f.m_scheduler = *scheduler_ptr; + + if (f.m_task) { + // Reference count starts at two: + // +1 to match decrement when task completes + // +1 for the future + + new (f.m_task) task_base(); + + f.m_task->m_queue = q; + f.m_task->m_ref_count = 2; + f.m_task->m_alloc_size = static_cast<int32_t>(alloc_size); + f.m_task->m_dep_count = narg; + f.m_task->m_task_type = task_base::Aggregate; + + // Assign dependences, reference counts were already incremented + + task_base* volatile* const dep = f.m_task->aggregate_dependences(); + + for (int i = 0; i < narg; ++i) { + dep[i] = arg[i].m_task; + } + + Kokkos::memory_fence(); + + q->schedule_aggregate(f.m_task); + // this when_all may be processed at any moment + } + } + } + + return f; + } + + template <class F> + KOKKOS_FUNCTION BasicFuture<void, scheduler_type> when_all(int narg, + F const func) { + using input_type = decltype(func(0)); + + static_assert(is_future<input_type>::value, + "Functor must return a Kokkos::Future"); + + future_type<void> f; + + if (0 == narg) return f; + + size_t const alloc_size = m_queue->when_all_allocation_size(narg); + + f.m_task = reinterpret_cast<task_base*>(m_queue->allocate(alloc_size)); + + if (f.m_task) { + // Reference count starts at two: + // +1 to match decrement when task completes + // +1 for the future + + new (f.m_task) task_base(); + // f.m_scheduler = *this; + + // f.m_task->m_scheduler = &f.m_scheduler; + f.m_task->m_queue = m_queue; + f.m_task->m_ref_count = 2; + f.m_task->m_alloc_size = static_cast<int32_t>(alloc_size); + 
f.m_task->m_dep_count = narg; + f.m_task->m_task_type = task_base::Aggregate; + // f.m_task->m_apply = nullptr; + // f.m_task->m_destroy = nullptr; + + // Assign dependences, reference counts were already incremented + + task_base* volatile* const dep = f.m_task->aggregate_dependences(); + + for (int i = 0; i < narg; ++i) { + const input_type arg_f = func(i); + if (nullptr != arg_f.m_task) { + // Not scheduled, so task scheduler is not yet set + // if ( m_queue != static_cast< BasicTaskScheduler const * >( + // arg_f.m_task->m_scheduler )->m_queue ) { + // Kokkos::abort("Kokkos when_all Futures must be in the same + // scheduler" ); + //} + // Increment reference count to track subsequent assignment. + Kokkos::atomic_increment(&(arg_f.m_task->m_ref_count)); + dep[i] = arg_f.m_task; + } + } + + Kokkos::memory_fence(); + + m_queue->schedule_aggregate(f.m_task); + // this when_all may be processed at any moment + } + return f; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + int allocation_capacity() const noexcept { + return m_queue->m_memory.capacity(); + } + + KOKKOS_INLINE_FUNCTION + int allocated_task_count() const noexcept { return m_queue->m_count_alloc; } + + KOKKOS_INLINE_FUNCTION + int allocated_task_count_max() const noexcept { return m_queue->m_max_alloc; } + + KOKKOS_INLINE_FUNCTION + long allocated_task_count_accum() const noexcept { + return m_queue->m_accum_alloc; + } + + //---------------------------------------- + + template <class S, class Q> + friend void wait(Kokkos::BasicTaskScheduler<S, Q> const&); +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +//---------------------------------------------------------------------------- +// Construct a TaskTeam execution policy + +template <class T, class Scheduler> 
+Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskTeam, + Kokkos::BasicFuture<T, Scheduler> > + KOKKOS_INLINE_FUNCTION + TaskTeam(Kokkos::BasicFuture<T, Scheduler> arg_future, + TaskPriority arg_priority = TaskPriority::Regular) { + return {std::move(arg_future), arg_priority}; +} + +template <class Scheduler> +Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskTeam, Scheduler> + KOKKOS_INLINE_FUNCTION + TaskTeam(Scheduler arg_scheduler, + typename std::enable_if<Kokkos::is_scheduler<Scheduler>::value, + TaskPriority>::type arg_priority = + TaskPriority::Regular) { + return {std::move(arg_scheduler), arg_priority}; +} + +template <class Scheduler, class PredecessorFuture> +Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskTeam, Scheduler, + PredecessorFuture> + KOKKOS_INLINE_FUNCTION TaskTeam( + Scheduler arg_scheduler, PredecessorFuture arg_future, + typename std::enable_if<Kokkos::is_scheduler<Scheduler>::value && + Kokkos::is_future<PredecessorFuture>::value, + TaskPriority>::type arg_priority = + TaskPriority::Regular) { + static_assert(std::is_same<typename PredecessorFuture::scheduler_type, + Scheduler>::value, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); + + return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; +} + +// Construct a TaskSingle execution policy + +template <class T, class Scheduler> +Impl::TaskPolicyWithPredecessor<Impl::TaskType::TaskSingle, + Kokkos::BasicFuture<T, Scheduler> > + KOKKOS_INLINE_FUNCTION + TaskSingle(Kokkos::BasicFuture<T, Scheduler> arg_future, + TaskPriority arg_priority = TaskPriority::Regular) { + return {std::move(arg_future), arg_priority}; +} + +template <class Scheduler> +Impl::TaskPolicyWithScheduler<Impl::TaskType::TaskSingle, Scheduler> + KOKKOS_INLINE_FUNCTION + TaskSingle(Scheduler arg_scheduler, + typename std::enable_if<Kokkos::is_scheduler<Scheduler>::value, + TaskPriority>::type arg_priority = + TaskPriority::Regular) { + return 
{std::move(arg_scheduler), arg_priority}; +} + +template <class Scheduler, class PredecessorFuture> +Impl::TaskPolicyWithScheduler<Kokkos::Impl::TaskType::TaskSingle, Scheduler, + PredecessorFuture> + KOKKOS_INLINE_FUNCTION TaskSingle( + Scheduler arg_scheduler, PredecessorFuture arg_future, + typename std::enable_if<Kokkos::is_scheduler<Scheduler>::value && + Kokkos::is_future<PredecessorFuture>::value, + TaskPriority>::type arg_priority = + TaskPriority::Regular) { + static_assert(std::is_same<typename PredecessorFuture::scheduler_type, + Scheduler>::value, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); + + return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; +} + +//---------------------------------------------------------------------------- + +/**\brief A host control thread spawns a task with options + * + * 1) Team or Serial + * 2) With scheduler or dependence + * 3) High, Normal, or Low priority + */ +template <int TaskEnum, typename Scheduler, typename DepFutureType, + typename FunctorType> +typename Scheduler::template future_type_for_functor< + typename std::decay<FunctorType>::type> +host_spawn(Impl::TaskPolicyWithScheduler<TaskEnum, Scheduler, DepFutureType> + arg_policy, + FunctorType&& arg_functor) { + using scheduler_type = Scheduler; + using task_type = + typename scheduler_type::template runnable_task_type<FunctorType>; + + static_assert(TaskEnum == Impl::TaskType::TaskTeam || + TaskEnum == Impl::TaskType::TaskSingle, + "Kokkos host_spawn requires TaskTeam or TaskSingle"); + + // May be spawning a Cuda task, must use the specialization + // to query on-device function pointer. 
+ typename task_type::function_type ptr; + typename task_type::destroy_type dtor; + Kokkos::Impl::TaskQueueSpecialization< + scheduler_type>::template get_function_pointer<task_type>(ptr, dtor); + + return scheduler_type::spawn(std::move(arg_policy), ptr, dtor, + std::forward<FunctorType>(arg_functor)); +} + +/**\brief A task spawns a task with options + * + * 1) Team or Serial + * 2) With scheduler or dependence + * 3) High, Normal, or Low priority + */ +template <int TaskEnum, typename Scheduler, typename DepFutureType, + typename FunctorType> +typename Scheduler::template future_type_for_functor< + typename std::decay<FunctorType>::type> + KOKKOS_INLINE_FUNCTION + task_spawn(Impl::TaskPolicyWithScheduler<TaskEnum, Scheduler, DepFutureType> + arg_policy, + FunctorType&& arg_functor) { + using scheduler_type = Scheduler; + + using task_type = + typename scheduler_type::template runnable_task_type<FunctorType>; + +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) && \ + defined(KOKKOS_ENABLE_CUDA) + + // This doesn't work with clang cuda + // static_assert( + // !std::is_same<Kokkos::Cuda, typename Scheduler::execution_space>::value, + // "Error calling Kokkos::task_spawn for Cuda space within Host code"); + +#endif + + static_assert(TaskEnum == Impl::TaskType::TaskTeam || + TaskEnum == Impl::TaskType::TaskSingle, + "Kokkos task_spawn requires TaskTeam or TaskSingle"); + + typename task_type::function_type const ptr = task_type::apply; + typename task_type::destroy_type const dtor = task_type::destroy; + + return scheduler_type::spawn(std::move(arg_policy), ptr, dtor, + std::forward<FunctorType>(arg_functor)); +} + +/**\brief A task respawns itself with options + * + * 1) With scheduler or dependence + * 2) High, Normal, or Low priority + */ +template <typename FunctorType, typename T> +void KOKKOS_INLINE_FUNCTION +respawn(FunctorType* arg_self, T const& arg, + TaskPriority const& arg_priority = TaskPriority::Regular) { + 
static_assert(Kokkos::is_future<T>::value || Kokkos::is_scheduler<T>::value, + "Kokkos respawn argument must be Future or TaskScheduler"); + + T::scheduler_type::respawn(arg_self, arg, arg_priority); +} + +//---------------------------------------------------------------------------- + +// template<typename ValueType, typename Scheduler> +// KOKKOS_INLINE_FUNCTION +// BasicFuture<void, Scheduler> +// when_all(BasicFuture<ValueType, Scheduler> const arg[], int narg) +//{ +// return BasicFuture<void, Scheduler>::scheduler_type::when_all(arg, narg); +//} + +//---------------------------------------------------------------------------- +// Wait for all runnable tasks to complete + +template <class ExecSpace, class QueueType> +inline void wait(BasicTaskScheduler<ExecSpace, QueueType> const& scheduler) { + using scheduler_type = BasicTaskScheduler<ExecSpace, QueueType>; + scheduler_type::specialization::execute(scheduler); + // scheduler.m_queue->execute(); +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +//////////////////////////////////////////////////////////////////////////////// +// END OLD CODE +//////////////////////////////////////////////////////////////////////////////// + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_TASKSCHEDULER_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..28af6345d1da49ee92b5da8cd7739f0a8cb80967 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp @@ -0,0 +1,233 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TASKSCHEDULER_FWD_HPP +#define KOKKOS_TASKSCHEDULER_FWD_HPP + +//---------------------------------------------------------------------------- + +#include <cstddef> +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core_fwd.hpp> +//---------------------------------------------------------------------------- + +namespace Kokkos { + +// Forward declarations used in Impl::TaskQueue + +template <typename ValueType, typename Scheduler> +class BasicFuture; + +template <class Space, class Queue> +class SimpleTaskScheduler; + +template <class Space, class Queue> +class BasicTaskScheduler; + +template <typename Space> +struct is_scheduler : public std::false_type {}; + +template <class Space, class Queue> +struct is_scheduler<BasicTaskScheduler<Space, Queue>> : public std::true_type { +}; + +template <class Space, class Queue> +struct is_scheduler<SimpleTaskScheduler<Space, Queue>> : public std::true_type { +}; + +enum class TaskPriority : int { High = 0, Regular = 1, Low = 2 }; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <class Device> +class MemoryPool; + +namespace Impl { + +template <class TaskQueueTraits> +class TaskNode; + +class TaskBase; + +/*\brief Implementation data for task data management, access, and execution. + * (Deprecated) + * CRTP Inheritance structure to allow static_cast from the + * task root type and a task's FunctorType. + * + * TaskBase< Space , ResultType , FunctorType > + * : TaskBase< Space , ResultType , void > + * , FunctorType + * { ... }; + * + * TaskBase< Space , ResultType , void > + * : TaskBase< Space , void , void > + * { ... 
}; + */ +template <typename Space, typename ResultType, typename FunctorType> +class Task; + +class TaskQueueBase; + +template <typename Space, typename MemorySpace> +class TaskQueue; + +template <typename ExecSpace, typename MemorySpace> +class TaskQueueMultiple; + +template <typename ExecSpace, typename MemSpace, typename TaskQueueTraits, + class MemoryPool = + Kokkos::MemoryPool<Kokkos::Device<ExecSpace, MemSpace>>> +class SingleTaskQueue; + +template <typename ExecSpace, typename MemSpace, typename TaskQueueTraits, + class MemoryPool> +class MultipleTaskQueue; + +struct TaskQueueTraitsLockBased; + +template <size_t CircularBufferSize = 64> +struct TaskQueueTraitsChaseLev; + +template <typename ResultType> +struct TaskResult; + +struct TaskSchedulerBase; + +template <class ExecSpace> +struct default_tasking_memory_space_for_execution_space { + using type = typename ExecSpace::memory_space; +}; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct default_tasking_memory_space_for_execution_space<Kokkos::Cuda> { + using type = Kokkos::CudaUVMSpace; +}; +#endif + +template <class ExecSpace> +using default_tasking_memory_space_for_execution_space_t = + typename default_tasking_memory_space_for_execution_space<ExecSpace>::type; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <typename Space> +using DeprecatedTaskScheduler = BasicTaskScheduler< + Space, + Impl::TaskQueue< + Space, + Impl::default_tasking_memory_space_for_execution_space_t<Space>>>; + +template <typename Space> +using DeprecatedTaskSchedulerMultiple = BasicTaskScheduler< + Space, + Impl::TaskQueueMultiple< + Space, + Impl::default_tasking_memory_space_for_execution_space_t<Space>>>; + +template <typename Space> +using TaskScheduler = SimpleTaskScheduler< + Space, + Impl::SingleTaskQueue< + Space, Impl::default_tasking_memory_space_for_execution_space_t<Space>, + 
Impl::TaskQueueTraitsLockBased>>; + +template <typename Space> +using TaskSchedulerMultiple = SimpleTaskScheduler< + Space, + Impl::MultipleTaskQueue< + Space, Impl::default_tasking_memory_space_for_execution_space_t<Space>, + Impl::TaskQueueTraitsLockBased, + Kokkos::MemoryPool<Kokkos::Device< + Space, + Impl::default_tasking_memory_space_for_execution_space_t<Space>>>>>; + +template <typename Space> +using ChaseLevTaskScheduler = SimpleTaskScheduler< + Space, + Impl::MultipleTaskQueue< + Space, Impl::default_tasking_memory_space_for_execution_space_t<Space>, + Impl::TaskQueueTraitsChaseLev<>, + Kokkos::MemoryPool<Kokkos::Device< + Space, + Impl::default_tasking_memory_space_for_execution_space_t<Space>>>>>; + +template <class Space, class QueueType> +void wait(BasicTaskScheduler<Space, QueueType> const&); + +namespace Impl { + +struct TaskSchedulerBase {}; + +class TaskQueueBase {}; + +template <typename Scheduler, typename EnableIfConstraint = void> +class TaskQueueSpecializationConstrained {}; + +template <typename Scheduler> +struct TaskQueueSpecialization : TaskQueueSpecializationConstrained<Scheduler> { +}; + +template <int, typename> +struct TaskPolicyData; + +} // end namespace Impl + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_TASKSCHEDULER_FWD_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Threads.hpp b/packages/kokkos/core/src/Kokkos_Threads.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e827c2a2a1abd46999360c1eef57eb85428436aa --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Threads.hpp @@ -0,0 +1,231 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_HPP +#define KOKKOS_THREADS_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_THREADS) + +#include <Kokkos_Core_fwd.hpp> + +#include <cstddef> +#include <iosfwd> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_ScratchSpace.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <impl/Kokkos_Profiling_Interface.hpp> +#include <impl/Kokkos_Tags.hpp> +#include <impl/Kokkos_ExecSpaceInitializer.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +class ThreadsExec; +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Execution space for a pool of Pthreads or C11 threads on a CPU. */ +class Threads { + public: + //! \name Type declarations that all Kokkos devices must provide. + //@{ + //! Tag this class as a kokkos execution space + using execution_space = Threads; + using memory_space = Kokkos::HostSpace; + + //! This execution space preferred device_type + using device_type = Kokkos::Device<execution_space, memory_space>; + + using array_layout = Kokkos::LayoutRight; + using size_type = memory_space::size_type; + + using scratch_memory_space = ScratchMemorySpace<Threads>; + + //@} + /*------------------------------------------------------------------------*/ + //! \name Static functions that all Kokkos devices must implement. + //@{ + + /// \brief True if and only if this method is being called in a + /// thread-parallel function. + static int in_parallel(); + + /// \brief Print configuration information to the given output stream. + static void print_configuration(std::ostream&, const bool detail = false); + + /// \brief Wait until all dispatched functors complete. 
+ /// + /// The parallel_for or parallel_reduce dispatch of a functor may + /// return asynchronously, before the functor completes. This + /// method does not return until all dispatched functors on this + /// device have completed. + static void impl_static_fence(); + + void fence() const; + + /** \brief Return the maximum amount of concurrency. */ + static int concurrency(); + + /// \brief Free any resources being consumed by the device. + /// + /// For the Threads device, this terminates spawned worker threads. + static void impl_finalize(); + + //@} + /*------------------------------------------------------------------------*/ + /*------------------------------------------------------------------------*/ + //! \name Space-specific functions + //@{ + + /** \brief Initialize the device in the "ready to work" state. + * + * The device is initialized in a "ready to work" or "awake" state. + * This state reduces latency and thus improves performance when + * dispatching work. However, the "awake" state consumes resources + * even when no work is being done. You may call sleep() to put + * the device in a "sleeping" state that does not consume as many + * resources, but it will take time (latency) to awaken the device + * again (via the wake()) method so that it is ready for work. + * + * Teams of threads are distributed as evenly as possible across + * the requested number of numa regions and cores per numa region. + * A team will not be split across a numa region. + * + * If the 'use_' arguments are not supplied the hwloc is queried + * to use all available cores. 
+ */ + static void impl_initialize(unsigned threads_count = 0, + unsigned use_numa_count = 0, + unsigned use_cores_per_numa = 0, + bool allow_asynchronous_threadpool = false); + + static int impl_is_initialized(); + + static Threads& impl_instance(int = 0); + + //---------------------------------------- + + static int impl_thread_pool_size(int depth = 0); +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + static int impl_thread_pool_rank(); +#else + KOKKOS_INLINE_FUNCTION static int impl_thread_pool_rank() { return 0; } +#endif + + inline static unsigned impl_max_hardware_threads() { + return impl_thread_pool_size(0); + } + KOKKOS_INLINE_FUNCTION static unsigned impl_hardware_thread_id() { + return impl_thread_pool_rank(); + } + + uint32_t impl_instance_id() const noexcept { return 0; } + + static const char* name(); + //@} + //---------------------------------------- +}; + +namespace Tools { +namespace Experimental { +template <> +struct DeviceTypeTraits<Threads> { + static constexpr DeviceType id = DeviceType::Threads; +}; +} // namespace Experimental +} // namespace Tools + +namespace Impl { + +class ThreadsSpaceInitializer : public ExecSpaceInitializerBase { + public: + ThreadsSpaceInitializer() = default; + ~ThreadsSpaceInitializer() = default; + void initialize(const InitArguments& args) final; + void finalize(const bool) final; + void fence() final; + void print_configuration(std::ostream& msg, const bool detail) final; +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template <> +struct MemorySpaceAccess<Kokkos::Threads::memory_space, + Kokkos::Threads::scratch_memory_space> { + enum : bool { assignable = false }; + enum : bool { accessible = true }; + enum : bool { deepcopy = false }; +}; + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +#include 
<Kokkos_ExecPolicy.hpp> +#include <Kokkos_Parallel.hpp> +#include <Threads/Kokkos_ThreadsExec.hpp> +#include <Threads/Kokkos_ThreadsTeam.hpp> +#include <Threads/Kokkos_Threads_Parallel.hpp> + +#include <KokkosExp_MDRangePolicy.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */ +#endif /* #define KOKKOS_THREADS_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Timer.hpp b/packages/kokkos/core/src/Kokkos_Timer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4fda4ec4d443972b280413d41c321059874d8e54 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Timer.hpp @@ -0,0 +1,79 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TIMER_HPP +#define KOKKOS_TIMER_HPP + +#include <Kokkos_Macros.hpp> +#include <chrono> + +namespace Kokkos { + +/** \brief Time since construction */ + +class Timer { + private: + std::chrono::high_resolution_clock::time_point m_old; + Timer(const Timer&); + Timer& operator=(const Timer&); + + public: + inline void reset() { m_old = std::chrono::high_resolution_clock::now(); } + + inline ~Timer() = default; + + inline Timer() { reset(); } + + inline double seconds() const { + std::chrono::high_resolution_clock::time_point m_new = + std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast<std::chrono::duration<double>>(m_new - + m_old) + .count(); + } +}; + +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_TIMER_HPP */ diff --git a/packages/kokkos/core/src/Kokkos_Tuners.hpp b/packages/kokkos/core/src/Kokkos_Tuners.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f7cc34cc114d29cbe5612bf4350fe01a498282c3 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Tuners.hpp @@ -0,0 +1,557 @@ +/* +//@HEADER 
+// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_TUNERS_HPP +#define KOKKOS_KOKKOS_TUNERS_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_ExecPolicy.hpp> +#include <KokkosExp_MDRangePolicy.hpp> +#include <impl/Kokkos_Profiling_Interface.hpp> + +#include <array> +#include <utility> +#include <tuple> +#include <string> +#include <vector> +#include <map> +#include <cassert> + +namespace Kokkos { +namespace Tools { + +namespace Experimental { + +// forward declarations +SetOrRange make_candidate_set(size_t size, int64_t* data); +bool have_tuning_tool(); +size_t declare_output_type(const std::string&, + Kokkos::Tools::Experimental::VariableInfo); +void request_output_values(size_t, size_t, + Kokkos::Tools::Experimental::VariableValue*); +VariableValue make_variable_value(size_t, int64_t); +VariableValue make_variable_value(size_t, double); +SetOrRange make_candidate_range(double lower, double upper, double step, + bool openLower, bool openUpper); +size_t get_new_context_id(); +void begin_context(size_t context_id); +void end_context(size_t context_id); +namespace Impl { + +/** We're going to take in search space descriptions + * as nested maps, which aren't efficient to + * iterate across by index. 
These are very similar + * to nested maps, but better for index-based lookup + */ +template <typename ValueType, typename ContainedType> +struct ValueHierarchyNode; + +template <typename ValueType, typename ContainedType> +struct ValueHierarchyNode { + std::vector<ValueType> root_values; + std::vector<ContainedType> sub_values; + void add_root_value(const ValueType& in) noexcept { + root_values.push_back(in); + } + void add_sub_container(const ContainedType& in) { sub_values.push_back(in); } + const ValueType& get_root_value(const size_t index) const { + return root_values[index]; + } + const ContainedType& get_sub_value(const size_t index) const { + return sub_values[index]; + } +}; + +template <typename ValueType> +struct ValueHierarchyNode<ValueType, void> { + std::vector<ValueType> root_values; + explicit ValueHierarchyNode(std::vector<ValueType> rv) + : root_values(std::move(rv)) {} + void add_root_value(const ValueType& in) noexcept { + root_values.push_back(in); + } + const ValueType& get_root_value(const size_t index) const { + return root_values[index]; + } +}; + +/** For a given nested map type, we need a way to + * declare the equivalent ValueHierarchyNode + * structure + */ + +template <class NestedMap> +struct MapTypeConverter; + +// Vectors are our lowest-level, no nested values +template <class T> +struct MapTypeConverter<std::vector<T>> { + using type = ValueHierarchyNode<T, void>; +}; + +// Maps contain both the "root" types and sub-vectors +template <class K, class V> +struct MapTypeConverter<std::map<K, V>> { + using type = ValueHierarchyNode<K, typename MapTypeConverter<V>::type>; +}; + +/** + * We also need to be able to construct a ValueHierarchyNode set from a + * map + */ + +template <class NestedMap> +struct ValueHierarchyConstructor; + +// Vectors are our lowest-level, no nested values. 
Just fill in the fundamental +// values +template <class T> +struct ValueHierarchyConstructor<std::vector<T>> { + using return_type = typename MapTypeConverter<std::vector<T>>::type; + static return_type build(const std::vector<T>& in) { return return_type{in}; } +}; + +// For maps, we need to fill in the fundamental values, and construct child +// nodes +template <class K, class V> +struct ValueHierarchyConstructor<std::map<K, V>> { + using return_type = typename MapTypeConverter<std::map<K, V>>::type; + static return_type build(const std::map<K, V>& in) { + return_type node_to_build; + for (auto& entry : in) { + node_to_build.add_root_value(entry.first); + node_to_build.add_sub_container( + ValueHierarchyConstructor<V>::build(entry.second)); + } + return node_to_build; + } +}; + +/** + * We're going to be declaring a sparse multidimensional + * tuning space as a set of nested maps. The innermost level + * will be a vector. The dimensionality of such a space is the number of + * maps + 1. 
+ * + * The following templates implement such logic recursively + */ +template <class InspectForDepth> +struct get_space_dimensionality; + +// The dimensionality of a vector is 1 +template <class T> +struct get_space_dimensionality<std::vector<T>> { + static constexpr int value = 1; +}; + +// The dimensionality of a map is 1 (the map) plus the dimensionality +// of the map's value type +template <class K, class V> +struct get_space_dimensionality<std::map<K, V>> { + static constexpr int value = 1 + get_space_dimensionality<V>::value; +}; + +template <class T, int N> +struct n_dimensional_sparse_structure; + +template <class T> +struct n_dimensional_sparse_structure<T, 1> { + using type = std::vector<T>; +}; + +template <class T, int N> +struct n_dimensional_sparse_structure { + using type = + std::map<T, typename n_dimensional_sparse_structure<T, N - 1>::type>; +}; + +/** + * This is the ugly part of this implementation: mapping a set of doubles in + * [0.0,1.0) into a point in this multidimensional space. We're going to + * implement this concept recursively, building up a tuple at each level. + */ + +// First, a helper to get the value in one dimension +template <class Container> +struct DimensionValueExtractor; + +// At any given level, just return your value at that level +template <class RootType, class Subtype> +struct DimensionValueExtractor<ValueHierarchyNode<RootType, Subtype>> { + static RootType get(const ValueHierarchyNode<RootType, Subtype>& dimension, + double fraction_to_traverse) { + size_t index = dimension.root_values.size() * fraction_to_traverse; + return dimension.get_root_value(index); + } +}; + +/** Now we're going to do the full "get a point in the space". + * At a root level, we'll take in a ValueHierarchyNode and a set of doubles + * representing the value in [0.0,1.0) we want to pick + */ + +// At the bottom level, we have one double and a base-level ValueHierarchyNode + +template <class HierarchyNode, class... 
InterpolationIndices> +struct GetMultidimensionalPoint; + +template <class ValueType> +struct GetMultidimensionalPoint<ValueHierarchyNode<ValueType, void>, double> { + using node_type = ValueHierarchyNode<ValueType, void>; + using return_type = std::tuple<ValueType>; + static return_type build(const node_type& in, double index) { + return std::make_tuple(DimensionValueExtractor<node_type>::get(in, index)); + } +}; + +// At levels above the bottom, we tuple_cat the result of our child on the end +// of our own tuple +template <class ValueType, class Subtype, class... Indices> +struct GetMultidimensionalPoint<ValueHierarchyNode<ValueType, Subtype>, double, + Indices...> { + using node_type = ValueHierarchyNode<ValueType, Subtype>; + using sub_tuple = + typename GetMultidimensionalPoint<Subtype, Indices...>::return_type; + using return_type = decltype(std::tuple_cat( + std::declval<std::tuple<ValueType>>(), std::declval<sub_tuple>())); + static return_type build(const node_type& in, double fraction_to_traverse, + Indices... indices) { + size_t index = in.sub_values.size() * fraction_to_traverse; + auto dimension_value = std::make_tuple( + DimensionValueExtractor<node_type>::get(in, fraction_to_traverse)); + return std::tuple_cat(dimension_value, + GetMultidimensionalPoint<Subtype, Indices...>::build( + in.get_sub_value(index), indices...)); + } +}; + +template <typename PointType, class ArrayType, size_t... 
Is> +auto get_point_helper(const PointType& in, const ArrayType& indices, + std::index_sequence<Is...>) { + using helper = GetMultidimensionalPoint< + PointType, + decltype(std::get<Is>(std::declval<ArrayType>()).value.double_value)...>; + return helper::build(in, std::get<Is>(indices).value.double_value...); +} + +template <typename PointType, typename ArrayType> +struct GetPoint; + +template <typename PointType, size_t X> +struct GetPoint<PointType, + std::array<Kokkos::Tools::Experimental::VariableValue, X>> { + using index_set_type = + std::array<Kokkos::Tools::Experimental::VariableValue, X>; + static auto build(const PointType& in, const index_set_type& indices) { + return get_point_helper(in, indices, std::make_index_sequence<X>{}); + } +}; + +template <typename PointType, typename ArrayType> +auto get_point(const PointType& point, const ArrayType& indices) { + return GetPoint<PointType, ArrayType>::build(point, indices); +} + +} // namespace Impl + +template <template <class...> class Container, size_t MaxDimensionSize = 100, + class... 
TemplateArguments> +class MultidimensionalSparseTuningProblem { + public: + using ProblemSpaceInput = Container<TemplateArguments...>; + static constexpr int space_dimensionality = + Impl::get_space_dimensionality<ProblemSpaceInput>::value; + static constexpr size_t max_space_dimension_size = MaxDimensionSize; + static constexpr double tuning_min = 0.0; + static constexpr double tuning_max = 0.999; + static constexpr double tuning_step = tuning_max / max_space_dimension_size; + + using StoredProblemSpace = + typename Impl::MapTypeConverter<ProblemSpaceInput>::type; + using HierarchyConstructor = + typename Impl::ValueHierarchyConstructor<Container<TemplateArguments...>>; + + using ValueArray = std::array<Kokkos::Tools::Experimental::VariableValue, + space_dimensionality>; + + private: + StoredProblemSpace m_space; + std::array<size_t, space_dimensionality> variable_ids; + size_t context; + + public: + MultidimensionalSparseTuningProblem() = default; + MultidimensionalSparseTuningProblem(ProblemSpaceInput space, + const std::vector<std::string>& names) + : m_space(HierarchyConstructor::build(space)) { + assert(names.size() == space_dimensionality); + for (unsigned long x = 0; x < names.size(); ++x) { + VariableInfo info; + info.type = Kokkos::Tools::Experimental::ValueType::kokkos_value_double; + info.category = Kokkos::Tools::Experimental::StatisticalCategory:: + kokkos_value_interval; + info.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range; + info.candidates = Kokkos::Tools::Experimental::make_candidate_range( + tuning_min, tuning_max, tuning_step, true, true); + variable_ids[x] = declare_output_type(names[x], info); + } + } + + auto begin() { + context = Kokkos::Tools::Experimental::get_new_context_id(); + ValueArray values; + for (int x = 0; x < space_dimensionality; ++x) { + values[x] = Kokkos::Tools::Experimental::make_variable_value( + variable_ids[x], 0.0); + } + begin_context(context); + request_output_values(context, 
space_dimensionality, values.data()); + return get_point(m_space, values); + } + + auto end() { end_context(context); } +}; + +template <size_t MaxDimensionSize = 100, template <class...> class Container, + class... TemplateArguments> +auto make_multidimensional_sparse_tuning_problem( + const Container<TemplateArguments...>& in, std::vector<std::string> names) { + return MultidimensionalSparseTuningProblem<Container, MaxDimensionSize, + TemplateArguments...>(in, names); +} +class TeamSizeTuner { + private: + using SpaceDescription = std::map<int64_t, std::vector<int64_t>>; + using TunerType = decltype(make_multidimensional_sparse_tuning_problem<20>( + std::declval<SpaceDescription>(), + std::declval<std::vector<std::string>>())); + TunerType tuner; + + public: + TeamSizeTuner() = default; + TeamSizeTuner& operator=(const TeamSizeTuner& other) = default; + TeamSizeTuner(const TeamSizeTuner& other) = default; + TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; + TeamSizeTuner(TeamSizeTuner&& other) = default; + template <typename ViableConfigurationCalculator, typename Functor, + typename TagType, typename... Properties> + TeamSizeTuner(const std::string& name, + Kokkos::TeamPolicy<Properties...>& policy, + const Functor& functor, const TagType& tag, + ViableConfigurationCalculator calc) { + using PolicyType = Kokkos::TeamPolicy<Properties...>; + auto initial_vector_length = policy.impl_vector_length(); + if (initial_vector_length < 1) { + policy.impl_set_vector_length(1); + } + /** + * Here we attempt to enumerate all of the possible configurations + * to expose to an autotuner. 
There are three possibilities + * + * 1) We're tuning both vector length and team size + * 2) We're tuning vector length but not team size + * 3) We're tuning team size but not vector length + * + * (In the fourth case where nothing is tuned + * this function won't be called) + * + * The set of valid team sizes is dependent on + * a vector length, so this leads to three + * algorithms + * + * 1) Loop over vector lengths to get the set + * of team sizes for each vector length, + * add it all to the set + * 2) Loop over vector lengths to see if the + * provided team size is valid for that + * vector length. If so, add it + * 3) A special case of (1) in which we only + * have one vector length + * + */ + SpaceDescription space_description; + + auto max_vector_length = PolicyType::vector_length_max(); + std::vector<int64_t> allowed_vector_lengths; + + if (policy.impl_auto_vector_length()) { // case 1 or 2 + for (int vector_length = max_vector_length; vector_length >= 1; + vector_length /= 2) { + policy.impl_set_vector_length(vector_length); + /** + * Figuring out whether a vector length is valid depends + * on whether we're in case 1 (tune everything) or 2 (just tune vector + * length) + * + * If we're tuning everything, all legal vector lengths are valid. + * If we're just tuning vector length, we need to check that if we + * set this vector length, the team size provided will be valid. + * + * These are the left and right hand sides of the "or" in this + * conditional, respectively. 
+ */ + auto max_team_size = calc.get_max_team_size(policy, functor, tag); + if ((policy.impl_auto_team_size()) || + (policy.team_size() <= max_team_size)) { + allowed_vector_lengths.push_back(vector_length); + } + } + } else { // case 3, there's only one vector length to care about + allowed_vector_lengths.push_back(policy.impl_vector_length()); + } + + for (const auto vector_length : allowed_vector_lengths) { + std::vector<int64_t> allowed_team_sizes; + policy.impl_set_vector_length(vector_length); + auto max_team_size = calc.get_max_team_size(policy, functor, tag); + if (policy.impl_auto_team_size()) { // case 1 or 3, try all legal team + // sizes + for (int team_size = max_team_size; team_size >= 1; team_size /= 2) { + allowed_team_sizes.push_back(team_size); + } + } else { // case 2, just try the provided team size + allowed_team_sizes.push_back(policy.team_size()); + } + space_description[vector_length] = allowed_team_sizes; + } + tuner = make_multidimensional_sparse_tuning_problem<20>( + space_description, {std::string(name + "_vector_length"), + std::string(name + "_team_size")}); + policy.impl_set_vector_length(initial_vector_length); + } + + template <typename... 
Properties> + void tune(Kokkos::TeamPolicy<Properties...>& policy) { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + auto configuration = tuner.begin(); + auto team_size = std::get<1>(configuration); + auto vector_length = std::get<0>(configuration); + if (vector_length > 0) { + policy.impl_set_team_size(team_size); + policy.impl_set_vector_length(vector_length); + } + } + } + void end() { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + tuner.end(); + } + } + + private: +}; + +namespace Impl { + +template <typename T> +void fill_tile(std::vector<T>& cont, int tile_size) { + for (int x = 1; x < tile_size; x *= 2) { + cont.push_back(x); + } +} +template <typename T, typename Mapped> +void fill_tile(std::map<T, Mapped>& cont, int tile_size) { + for (int x = 1; x < tile_size; x *= 2) { + fill_tile(cont[x], tile_size / x); + } +} +} // namespace Impl + +template <int MDRangeRank> +struct MDRangeTuner { + private: + static constexpr int rank = MDRangeRank; + static constexpr int max_slices = 15; + using SpaceDescription = + typename Impl::n_dimensional_sparse_structure<int, rank>::type; + using TunerType = + decltype(make_multidimensional_sparse_tuning_problem<max_slices>( + std::declval<SpaceDescription>(), + std::declval<std::vector<std::string>>())); + TunerType tuner; + + public: + MDRangeTuner() = default; + template <typename Functor, typename TagType, typename Calculator, + typename... 
Properties> + MDRangeTuner(const std::string& name, + const Kokkos::MDRangePolicy<Properties...>& policy, + const Functor& functor, const TagType& tag, Calculator calc) { + SpaceDescription desc; + int max_tile_size = + calc.get_mdrange_max_tile_size_product(policy, functor, tag); + Impl::fill_tile(desc, max_tile_size); + std::vector<std::string> feature_names; + for (int x = 0; x < rank; ++x) { + feature_names.push_back(name + "_tile_size_" + std::to_string(x)); + } + tuner = make_multidimensional_sparse_tuning_problem<max_slices>( + desc, feature_names); + } + template <typename Policy, typename Tuple, size_t... Indices> + void set_policy_tile(Policy& policy, const Tuple& tuple, + const std::index_sequence<Indices...>&) { + policy.impl_change_tile_size({std::get<Indices>(tuple)...}); + } + template <typename... Properties> + void tune(Kokkos::MDRangePolicy<Properties...>& policy) { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + auto configuration = tuner.begin(); + set_policy_tile(policy, configuration, std::make_index_sequence<rank>{}); + } + } + void end() { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + tuner.end(); + } + } +}; + +} // namespace Experimental +} // namespace Tools +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/Kokkos_UniqueToken.hpp b/packages/kokkos/core/src/Kokkos_UniqueToken.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bce7e703f0bffaba7b9dcfa40c9566f35d9c31fa --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_UniqueToken.hpp @@ -0,0 +1,179 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_UNIQUE_TOKEN_HPP +#define KOKKOS_UNIQUE_TOKEN_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <Kokkos_Core_fwd.hpp> + +namespace Kokkos { +namespace Experimental { + +enum class UniqueTokenScope : int { Instance, Global }; + +/// \brief class to generate unique ids based on the required amount of +/// concurrency +/// +/// This object should behave like a ref-counted object, so that when the last +/// instance is destroyed, resources are freed if needed +template <typename ExecutionSpace, + UniqueTokenScope = UniqueTokenScope::Instance> +class UniqueToken { + public: + using execution_space = ExecutionSpace; + using size_type = typename execution_space::size_type; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken(execution_space const& = execution_space()); + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type size() const; + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type acquire() const; + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release(size_type) const; +}; + +/// \brief Instance scope UniqueToken allows for a max size other than +/// execution_space::concurrency() +/// +/// This object should behave like a ref-counted object, so that when the last +/// instance is destroyed, resources are freed if needed +template <typename ExecutionSpace> +class UniqueToken<ExecutionSpace, UniqueTokenScope::Instance> + : public UniqueToken<ExecutionSpace, UniqueTokenScope::Global> { + public: + using execution_space = ExecutionSpace; + using size_type = typename execution_space::size_type; + + /// \brief Create object with specified size + /// + /// It is required that max_size is >= the maximum number of concurrent + /// threads that will attempt to acquire the UniqueToken. This constructor is + /// most commonly useful when you: + /// 1) Have a loop bound that may be smaller than + /// execution_space::concurrency(). + /// 2) Want a per-team unique token in the range [0, + /// execution_space::concurrency() / team_size) + UniqueToken(size_type max_size, execution_space const& = execution_space()); +}; + +// NOTE There was an agreement amongst developers that "AcquireUniqueToken" is a +// bad name but at this time no one has suggested a better alternative. + +/// \brief RAII helper for per-thread unique token values. +/// +/// The token value will be acquired at construction and automatically +/// released at destruction. 
+template <typename ExecutionSpace, + UniqueTokenScope TokenScope = UniqueTokenScope::Instance> +class AcquireUniqueToken { + public: + using exec_space = ExecutionSpace; + using size_type = typename exec_space::size_type; + using token_type = UniqueToken<exec_space, TokenScope>; + + private: + token_type my_token; + size_type my_acquired_val; + + public: + KOKKOS_FUNCTION AcquireUniqueToken(token_type t) + : my_token(t), my_acquired_val(my_token.acquire()) {} + + KOKKOS_FUNCTION ~AcquireUniqueToken() { my_token.release(my_acquired_val); } + + KOKKOS_FUNCTION size_type value() const { return my_acquired_val; } +}; + +/// \brief RAII helper for per-team unique token values. +/// +/// The token value will be acquired at construction and automatically +/// released at destruction. All threads in a team will share the same +/// token value. +template <typename TeamPolicy> +class AcquireTeamUniqueToken { + public: + using exec_space = typename TeamPolicy::execution_space; + using token_type = UniqueToken<exec_space>; + using size_type = typename token_type::size_type; + using team_member_type = typename TeamPolicy::member_type; + using scratch_view = + Kokkos::View<size_type, typename exec_space::scratch_memory_space, + Kokkos::MemoryUnmanaged>; + + private: + token_type my_token; + size_type my_acquired_val; + scratch_view my_team_acquired_val; + team_member_type my_team; + + public: + // NOTE The implementations of the constructor and destructor use + // `Kokkos::single()` which is an inline function defined in each backend. + // This creates circular dependency issues. Moving them to a separate header + // is less than ideal and should be revisited later. Having a `UniqueToken` + // forward declaration was considered but the non-type template parameter + // makes things complicated because it would require moving the definition of + // `UniqueTokenScope` enumeration type and its enumerators away which would + // hurt readability. 
+ KOKKOS_FUNCTION AcquireTeamUniqueToken(token_type t, team_member_type team); + KOKKOS_FUNCTION ~AcquireTeamUniqueToken(); + KOKKOS_FUNCTION size_type value() const { return my_acquired_val; } + static std::size_t shmem_size() { return scratch_view::shmem_size(); } +}; + +} // namespace Experimental +} // namespace Kokkos + +#endif // KOKKOS_UNIQUE_TOKEN_HPP diff --git a/packages/kokkos/core/src/Kokkos_Vectorization.hpp b/packages/kokkos/core/src/Kokkos_Vectorization.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a232e5b3abc1191bc357940b30d9707613c10957 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Vectorization.hpp @@ -0,0 +1,58 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Vectorization.hpp +/// \brief Declaration and definition of Kokkos::Vectorization interface. +#ifndef KOKKOS_VECTORIZATION_HPP +#define KOKKOS_VECTORIZATION_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Vectorization.hpp> +#elif defined(KOKKOS_ENABLE_HIP) +#include <HIP/Kokkos_HIP_Vectorization.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1abe0a48df5eab32f01ef703e6d39921eb9c70c3 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_View.hpp @@ -0,0 +1,2107 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEW_HPP +#define KOKKOS_VIEW_HPP + +#include <type_traits> +#include <string> +#include <algorithm> +#include <initializer_list> + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <Kokkos_ExecPolicy.hpp> + +#include <impl/Kokkos_Tools.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class DataType> +struct ViewArrayAnalysis; + +template <class DataType, class ArrayLayout, + typename ValueType = + typename ViewArrayAnalysis<DataType>::non_const_value_type> +struct ViewDataAnalysis; + +template <class, class...> +class ViewMapping { + public: + enum : bool { is_assignable_data_type = false }; + enum : bool { is_assignable = false }; +}; + +template <typename IntType> +KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( + const IntType i0, const IntType i1, const IntType i2, const IntType i3, + const IntType i4, const IntType i5, const IntType i6, const IntType i7) { + static_assert(std::is_integral<IntType>::value, + "count_valid_integers() must have integer arguments."); + + return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + + (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + + (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + + (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); +} + +KOKKOS_INLINE_FUNCTION +void runtime_check_rank_device(const size_t dyn_rank, const bool is_void_spec, + const size_t i0, const size_t i1, + const size_t i2, const size_t i3, + const size_t i4, const size_t i5, + const size_t i6, const size_t i7) { + if (is_void_spec) { + const size_t num_passed_args = + count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + + 
if (num_passed_args != dyn_rank && is_void_spec) { + Kokkos::abort( + "Number of arguments passed to Kokkos::View() constructor must match " + "the dynamic rank of the view."); + } + } +} + +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +KOKKOS_INLINE_FUNCTION +void runtime_check_rank_host(const size_t dyn_rank, const bool is_void_spec, + const size_t i0, const size_t i1, const size_t i2, + const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, + const std::string& label) { + if (is_void_spec) { + const size_t num_passed_args = + count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + + if (num_passed_args != dyn_rank) { + const std::string message = + "Constructor for Kokkos View '" + label + + "' has mismatched number of arguments. Number of arguments = " + + std::to_string(num_passed_args) + + " but dynamic rank = " + std::to_string(dyn_rank) + " \n"; + Kokkos::abort(message.c_str()); + } + } +} +#endif + +} /* namespace Impl */ +} /* namespace Kokkos */ + +// Class to provide a uniform type +namespace Kokkos { +namespace Impl { +template <class ViewType, int Traits = 0> +struct ViewUniformType; +} +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. + * + * Template argument options: + * - View< DataType > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , ArrayLayout > + * - View< DataType , ArrayLayout , Space > + * - View< DataType , ArrayLayout , MemoryTraits > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + */ + +template <class DataType, class... 
Properties> +struct ViewTraits; + +template <> +struct ViewTraits<void> { + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = void; + using specialize = void; +}; + +template <class... Prop> +struct ViewTraits<void, void, Prop...> { + // Ignore an extraneous 'void' + using execution_space = typename ViewTraits<void, Prop...>::execution_space; + using memory_space = typename ViewTraits<void, Prop...>::memory_space; + using HostMirrorSpace = typename ViewTraits<void, Prop...>::HostMirrorSpace; + using array_layout = typename ViewTraits<void, Prop...>::array_layout; + using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; + using specialize = typename ViewTraits<void, Prop...>::specialize; +}; + +template <class ArrayLayout, class... Prop> +struct ViewTraits<typename std::enable_if< + Kokkos::Impl::is_array_layout<ArrayLayout>::value>::type, + ArrayLayout, Prop...> { + // Specify layout, keep subsequent space and memory traits arguments + + using execution_space = typename ViewTraits<void, Prop...>::execution_space; + using memory_space = typename ViewTraits<void, Prop...>::memory_space; + using HostMirrorSpace = typename ViewTraits<void, Prop...>::HostMirrorSpace; + using array_layout = ArrayLayout; + using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; + using specialize = typename ViewTraits<void, Prop...>::specialize; +}; + +template <class Space, class... Prop> +struct ViewTraits< + typename std::enable_if<Kokkos::Impl::is_space<Space>::value>::type, Space, + Prop...> { + // Specify Space, memory traits should be the only subsequent argument. 
+ + static_assert( + std::is_same<typename ViewTraits<void, Prop...>::execution_space, + void>::value && + std::is_same<typename ViewTraits<void, Prop...>::memory_space, + void>::value && + std::is_same<typename ViewTraits<void, Prop...>::HostMirrorSpace, + void>::value && + std::is_same<typename ViewTraits<void, Prop...>::array_layout, + void>::value, + "Only one View Execution or Memory Space template argument"); + + using execution_space = typename Space::execution_space; + using memory_space = typename Space::memory_space; + using HostMirrorSpace = + typename Kokkos::Impl::HostMirror<Space>::Space::memory_space; + using array_layout = typename execution_space::array_layout; + using memory_traits = typename ViewTraits<void, Prop...>::memory_traits; + using specialize = typename ViewTraits<void, Prop...>::specialize; +}; + +template <class MemoryTraits, class... Prop> +struct ViewTraits<typename std::enable_if<Kokkos::Impl::is_memory_traits< + MemoryTraits>::value>::type, + MemoryTraits, Prop...> { + // Specify memory trait, should not be any subsequent arguments + + static_assert( + std::is_same<typename ViewTraits<void, Prop...>::execution_space, + void>::value && + std::is_same<typename ViewTraits<void, Prop...>::memory_space, + void>::value && + std::is_same<typename ViewTraits<void, Prop...>::array_layout, + void>::value && + std::is_same<typename ViewTraits<void, Prop...>::memory_traits, + void>::value, + "MemoryTrait is the final optional template argument for a View"); + + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = MemoryTraits; + using specialize = void; +}; + +template <class DataType, class... 
Properties> +struct ViewTraits { + private: + // Unpack the properties arguments + using prop = ViewTraits<void, Properties...>; + + using ExecutionSpace = typename std::conditional< + !std::is_same<typename prop::execution_space, void>::value, + typename prop::execution_space, Kokkos::DefaultExecutionSpace>::type; + + using MemorySpace = typename std::conditional< + !std::is_same<typename prop::memory_space, void>::value, + typename prop::memory_space, typename ExecutionSpace::memory_space>::type; + + using ArrayLayout = typename std::conditional< + !std::is_same<typename prop::array_layout, void>::value, + typename prop::array_layout, typename ExecutionSpace::array_layout>::type; + + using HostMirrorSpace = typename std::conditional< + !std::is_same<typename prop::HostMirrorSpace, void>::value, + typename prop::HostMirrorSpace, + typename Kokkos::Impl::HostMirror<ExecutionSpace>::Space>::type; + + using MemoryTraits = typename std::conditional< + !std::is_same<typename prop::memory_traits, void>::value, + typename prop::memory_traits, typename Kokkos::MemoryManaged>::type; + + // Analyze data type's properties, + // May be specialized based upon the layout and value type + using data_analysis = Kokkos::Impl::ViewDataAnalysis<DataType, ArrayLayout>; + + public: + //------------------------------------ + // Data type traits: + + using data_type = typename data_analysis::type; + using const_data_type = typename data_analysis::const_type; + using non_const_data_type = typename data_analysis::non_const_type; + + //------------------------------------ + // Compatible array of trivial type traits: + + using scalar_array_type = typename data_analysis::scalar_array_type; + using const_scalar_array_type = + typename data_analysis::const_scalar_array_type; + using non_const_scalar_array_type = + typename data_analysis::non_const_scalar_array_type; + + //------------------------------------ + // Value type traits: + + using value_type = typename data_analysis::value_type; + 
using const_value_type = typename data_analysis::const_value_type; + using non_const_value_type = typename data_analysis::non_const_value_type; + + //------------------------------------ + // Mapping traits: + + using array_layout = ArrayLayout; + using dimension = typename data_analysis::dimension; + + using specialize = typename std::conditional< + std::is_same<typename data_analysis::specialize, void>::value, + typename prop::specialize, typename data_analysis::specialize>:: + type; /* mapping specialization tag */ + + enum { rank = dimension::rank }; + enum { rank_dynamic = dimension::rank_dynamic }; + + //------------------------------------ + // Execution space, memory space, memory access traits, and host mirror space. + + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + using device_type = Kokkos::Device<ExecutionSpace, MemorySpace>; + using memory_traits = MemoryTraits; + using host_mirror_space = HostMirrorSpace; + + using size_type = typename MemorySpace::size_type; + + enum { is_hostspace = std::is_same<MemorySpace, HostSpace>::value }; + enum { is_managed = MemoryTraits::is_unmanaged == 0 }; + enum { is_random_access = MemoryTraits::is_random_access == 1 }; + + //------------------------------------ +}; + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. + * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. 
When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. + * + * Valid ways in which template arguments may be specified: + * - View< DataType > + * - View< DataType , Layout > + * - View< DataType , Layout , Space > + * - View< DataType , Layout , Space , MemoryTraits > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, <tt>double*</tt> + * indicates a one-dimensional array of \c double with run-time + * dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Space (required) The memory space. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * <tt>Space</tt>. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. 
+ * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on + * Space + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Space types. For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. + * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View<double*, Cuda> out, + * View<const double*, Cuda> in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View<const double*, Cuda, RandomAccess> in_rr = in; + * // ... do something with in_rr and out ... + * } + * \endcode + */ + +} // namespace Kokkos + +namespace Kokkos { + +template <class T1, class T2> +struct is_always_assignable_impl; + +template <class... ViewTDst, class... 
ViewTSrc> +struct is_always_assignable_impl<Kokkos::View<ViewTDst...>, + Kokkos::View<ViewTSrc...>> { + using mapping_type = Kokkos::Impl::ViewMapping< + typename Kokkos::View<ViewTDst...>::traits, + typename Kokkos::View<ViewTSrc...>::traits, + typename Kokkos::View<ViewTDst...>::traits::specialize>; + + constexpr static bool value = + mapping_type::is_assignable && + static_cast<int>(Kokkos::View<ViewTDst...>::rank_dynamic) >= + static_cast<int>(Kokkos::View<ViewTSrc...>::rank_dynamic); +}; + +template <class View1, class View2> +using is_always_assignable = is_always_assignable_impl< + typename std::remove_reference<View1>::type, + typename std::remove_const< + typename std::remove_reference<View2>::type>::type>; + +#ifdef KOKKOS_ENABLE_CXX17 +template <class T1, class T2> +inline constexpr bool is_always_assignable_v = + is_always_assignable<T1, T2>::value; +#endif + +template <class... ViewTDst, class... ViewTSrc> +constexpr bool is_assignable(const Kokkos::View<ViewTDst...>& dst, + const Kokkos::View<ViewTSrc...>& src) { + using DstTraits = typename Kokkos::View<ViewTDst...>::traits; + using SrcTraits = typename Kokkos::View<ViewTSrc...>::traits; + using mapping_type = + Kokkos::Impl::ViewMapping<DstTraits, SrcTraits, + typename DstTraits::specialize>; + +#ifdef KOKKOS_ENABLE_CXX17 + return is_always_assignable_v<Kokkos::View<ViewTDst...>, + Kokkos::View<ViewTSrc...>> || +#else + return is_always_assignable<Kokkos::View<ViewTDst...>, + Kokkos::View<ViewTSrc...>>::value || +#endif + (mapping_type::is_assignable && + ((DstTraits::dimension::rank_dynamic >= 1) || + (dst.static_extent(0) == src.extent(0))) && + ((DstTraits::dimension::rank_dynamic >= 2) || + (dst.static_extent(1) == src.extent(1))) && + ((DstTraits::dimension::rank_dynamic >= 3) || + (dst.static_extent(2) == src.extent(2))) && + ((DstTraits::dimension::rank_dynamic >= 4) || + (dst.static_extent(3) == src.extent(3))) && + ((DstTraits::dimension::rank_dynamic >= 5) || + (dst.static_extent(4) == 
src.extent(4))) && + ((DstTraits::dimension::rank_dynamic >= 6) || + (dst.static_extent(5) == src.extent(5))) && + ((DstTraits::dimension::rank_dynamic >= 7) || + (dst.static_extent(6) == src.extent(6))) && + ((DstTraits::dimension::rank_dynamic >= 8) || + (dst.static_extent(7) == src.extent(7)))); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include <impl/Kokkos_ViewMapping.hpp> +#include <impl/Kokkos_ViewArray.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace { + +constexpr Kokkos::Impl::ALL_t ALL = Kokkos::Impl::ALL_t(); + +constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing = + Kokkos::Impl::WithoutInitializing_t(); + +constexpr Kokkos::Impl::AllowPadding_t AllowPadding = + Kokkos::Impl::AllowPadding_t(); + +} // namespace + +/** \brief Create View allocation parameter bundle from argument list. + * + * Valid argument list members are: + * 1) label as a "string" or std::string + * 2) memory space instance of the View::memory_space type + * 3) execution space instance compatible with the View::memory_space + * 4) Kokkos::WithoutInitializing to bypass initialization + * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory + * alignment + */ +template <class... Args> +inline Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...> +view_alloc(Args const&... args) { + using return_type = + Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...>; + + static_assert(!return_type::has_pointer, + "Cannot give pointer-to-memory for view allocation"); + + return return_type(args...); +} + +template <class... 
Args> +KOKKOS_INLINE_FUNCTION + Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...> + view_wrap(Args const&... args) { + using return_type = + Impl::ViewCtorProp<typename Impl::ViewCtorProp<void, Args>::type...>; + + static_assert(!return_type::has_memory_space && + !return_type::has_execution_space && + !return_type::has_label && return_type::has_pointer, + "Must only give pointer-to-memory for view wrapping"); + + return return_type(args...); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <class DataType, class... Properties> +class View; + +template <class> +struct is_view : public std::false_type {}; + +template <class D, class... P> +struct is_view<View<D, P...>> : public std::true_type {}; + +template <class D, class... P> +struct is_view<const View<D, P...>> : public std::true_type {}; + +template <class DataType, class... 
Properties> +class View : public ViewTraits<DataType, Properties...> { + private: + template <class, class...> + friend class View; + template <class, class...> + friend class Kokkos::Impl::ViewMapping; + + using view_tracker_type = Kokkos::Impl::ViewTracker<View>; + + public: + using traits = ViewTraits<DataType, Properties...>; + + private: + using map_type = + Kokkos::Impl::ViewMapping<traits, typename traits::specialize>; + template <typename V> + friend struct Kokkos::Impl::ViewTracker; + + view_tracker_type m_track; + map_type m_map; + + public: + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + using array_type = + View<typename traits::scalar_array_type, typename traits::array_layout, + typename traits::device_type, typename traits::memory_traits>; + + /** \brief Compatible view of const data type */ + using const_type = + View<typename traits::const_data_type, typename traits::array_layout, + typename traits::device_type, typename traits::memory_traits>; + + /** \brief Compatible view of non-const data type */ + using non_const_type = + View<typename traits::non_const_data_type, typename traits::array_layout, + typename traits::device_type, typename traits::memory_traits>; + + /** \brief Compatible HostMirror view */ + using HostMirror = + View<typename traits::non_const_data_type, typename traits::array_layout, + Device<DefaultHostExecutionSpace, + typename traits::host_mirror_space::memory_space>>; + + /** \brief Compatible HostMirror view */ + using host_mirror_type = + View<typename traits::non_const_data_type, typename traits::array_layout, + typename traits::host_mirror_space>; + + /** \brief Unified types */ + using uniform_type = typename Impl::ViewUniformType<View, 0>::type; + using uniform_const_type = + typename Impl::ViewUniformType<View, 0>::const_type; + using uniform_runtime_type = + typename Impl::ViewUniformType<View, 0>::runtime_type; + using uniform_runtime_const_type = + typename 
Impl::ViewUniformType<View, 0>::runtime_const_type; + using uniform_nomemspace_type = + typename Impl::ViewUniformType<View, 0>::nomemspace_type; + using uniform_const_nomemspace_type = + typename Impl::ViewUniformType<View, 0>::const_nomemspace_type; + using uniform_runtime_nomemspace_type = + typename Impl::ViewUniformType<View, 0>::runtime_nomemspace_type; + using uniform_runtime_const_nomemspace_type = + typename Impl::ViewUniformType<View, 0>::runtime_const_nomemspace_type; + + //---------------------------------------- + // Domain rank and extents + + enum { Rank = map_type::Rank }; + + /** \brief rank() to be implemented + */ + // KOKKOS_INLINE_FUNCTION + // static + // constexpr unsigned rank() { return map_type::Rank; } + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<std::is_integral<iType>::value, size_t>::type + extent(const iType& r) const noexcept { + return m_map.extent(r); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return map_type::static_extent(r); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<std::is_integral<iType>::value, int>::type + extent_int(const iType& r) const noexcept { + return static_cast<int>(m_map.extent(r)); + } + + KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() + const { + return m_map.layout(); + } + + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. 
+ */ + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { + return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * + m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * + m_map.dimension_6() * m_map.dimension_7(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_map.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_map.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_map.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_map.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_map.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_map.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_map.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_map.stride_7(); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<std::is_integral<iType>::value, size_t>::type + stride(iType r) const { + return ( + r == 0 + ? m_map.stride_0() + : (r == 1 + ? m_map.stride_1() + : (r == 2 + ? m_map.stride_2() + : (r == 3 + ? m_map.stride_3() + : (r == 4 + ? m_map.stride_4() + : (r == 5 + ? m_map.stride_5() + : (r == 6 + ? m_map.stride_6() + : m_map.stride_7()))))))); + } + + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + m_map.stride(s); + } + + //---------------------------------------- + // Range span is the span which contains all members. 
+ + using reference_type = typename map_type::reference_type; + using pointer_type = typename map_type::pointer_type; + + enum { + reference_type_is_lvalue_reference = + std::is_lvalue_reference<reference_type>::value + }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { + return m_map.span_is_contiguous(); + } + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return m_map.data() != nullptr; + } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { + return m_map.data(); + } + + //---------------------------------------- + // Allow specializations to query their specialized map + + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::ViewMapping<traits, typename traits::specialize>& + impl_map() const { + return m_map; + } + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::SharedAllocationTracker& impl_track() const { + return m_track.m_tracker; + } + //---------------------------------------- + + private: + static constexpr bool is_layout_left = + std::is_same<typename traits::array_layout, Kokkos::LayoutLeft>::value; + + static constexpr bool is_layout_right = + std::is_same<typename traits::array_layout, Kokkos::LayoutRight>::value; + + static constexpr bool is_layout_stride = + std::is_same<typename traits::array_layout, Kokkos::LayoutStride>::value; + + static constexpr bool is_default_map = + std::is_same<typename traits::specialize, void>::value && + (is_layout_left || is_layout_right || is_layout_stride); + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + +#define KOKKOS_IMPL_SINK(ARG) ARG + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \ + typename traits::memory_space>::check(); \ + Kokkos::Impl::view_verify_operator_bounds<typename traits::memory_space> ARG; + +#else + +#define KOKKOS_IMPL_SINK(ARG) + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ + 
Kokkos::Impl::verify_space<Kokkos::Impl::ActiveExecutionMemorySpace, \ + typename traits::memory_space>::check(); + +#endif + + public: + //------------------------------ + // Rank 0 operator() + + KOKKOS_FORCEINLINE_FUNCTION + reference_type operator()() const { return m_map.reference(); } + //------------------------------ + // Rank 1 operator() + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0)) + return m_map.reference(i0); + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && is_default_map && + !is_layout_stride), + reference_type>::type + operator()(const I0& i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0)) + return m_map.m_impl_handle[i0]; + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && is_default_map && + is_layout_stride), + reference_type>::type + operator()(const I0& i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0)) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + //------------------------------ + // Rank 1 operator[] + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && !is_default_map), + reference_type>::type + operator[](const I0& i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0)) + return m_map.reference(i0); + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && is_default_map && + !is_layout_stride), + reference_type>::type + operator[](const I0& i0) const { + 
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0)) + return m_map.m_impl_handle[i0]; + } + + template <typename I0> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0>::value && + (1 == Rank) && is_default_map && + is_layout_stride), + reference_type>::type + operator[](const I0& i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0)) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1)) + return m_map.reference(i0, i1); + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_left && (traits::rank_dynamic == 0)), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1)) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_left && (traits::rank_dynamic != 0)), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1)) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_right && (traits::rank_dynamic == 0)), + reference_type>::type + 
operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1)) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_right && (traits::rank_dynamic != 0)), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1)) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } + + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1>::value && + (2 == Rank) && is_default_map && + is_layout_stride), + reference_type>::type + operator()(const I0& i0, const I1& i1) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1)) + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } + + //------------------------------ + // Rank 3 + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1, I2>::value && + (3 == Rank) && is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2)) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; + } + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, I1, I2>::value && + (3 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2)) + return m_map.reference(i0, i1, i2); + } + + //------------------------------ + // Rank 4 + + template <typename I0, typename 
I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && (4 == Rank) && + is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3)) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; + } + + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3>::value && (4 == Rank) && + !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3)) + return m_map.reference(i0, i1, i2, i3); + } + + //------------------------------ + // Rank 5 + + template <typename I0, typename I1, typename I2, typename I3, typename I4> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && (5 == Rank) && + is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3, i4)) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4>::value && (5 == Rank) && + !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3, i4)) + return m_map.reference(i0, i1, i2, i3, i4); + } + + //------------------------------ + // Rank 6 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + 
KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value && + (6 == Rank) && is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3, i4, i5)) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5>::value && + (6 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((m_track, m_map, i0, i1, i2, i3, i4, i5)) + return m_map.reference(i0, i1, i2, i3, i4, i5); + } + + //------------------------------ + // Rank 7 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value && + (7 == Rank) && is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5, const I6& i6) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + (m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6>::value && + (7 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5, const I6& i6) const { + 
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + (m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + } + + //------------------------------ + // Rank 8 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value && + (8 == Rank) && is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + (m_track, m_map, i0, i1, i2, i3, i4, i5, i6, i7)) + return m_map + .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7>::value && + (8 == Rank) && !is_default_map), + reference_type>::type + operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + (m_track, m_map, i0, i1, i2, i3, i4, i5, i6, i7)) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); + } + + template <class... Args> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<Args...>::value && + (0 == Rank)), + reference_type>::type + access(Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, args...))) + return m_map.reference(); + } + + template <typename I0, class... Args> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, Args...>::value && + (1 == Rank) && !is_default_map), + reference_type>::type + access(const I0& i0, Args... 
KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, args...))) + return m_map.reference(i0); + } + + template <typename I0, class... Args> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, Args...>::value && + (1 == Rank) && is_default_map && + !is_layout_stride), + reference_type>::type + access(const I0& i0, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, args...))) + return m_map.m_impl_handle[i0]; + } + + template <typename I0, class... Args> + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if<(Kokkos::Impl::are_integral<I0, Args...>::value && + (1 == Rank) && is_default_map && + is_layout_stride), + reference_type>::type + access(const I0& i0, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, args...))) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + template <typename I0, typename I1, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) && + !is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...))) + return m_map.reference(i0, i1); + } + + template <typename I0, typename I1, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) && + is_default_map && is_layout_left && (traits::rank_dynamic == 0)), + reference_type>::type + access(const I0& i0, const I1& i1, Args... 
KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...))) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + } + + template <typename I0, typename I1, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) && + is_default_map && is_layout_left && (traits::rank_dynamic != 0)), + reference_type>::type + access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...))) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } + + template <typename I0, typename I1, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) && + is_default_map && is_layout_right && (traits::rank_dynamic == 0)), + reference_type>::type + access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...))) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + } + + template <typename I0, typename I1, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) && + is_default_map && is_layout_right && (traits::rank_dynamic != 0)), + reference_type>::type + access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...))) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } + + template <typename I0, typename I1, class... 
Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, Args...>::value && (2 == Rank) && + is_default_map && is_layout_stride), + reference_type>::type + access(const I0& i0, const I1& i1, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, args...))) + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } + + //------------------------------ + // Rank 3 + + template <typename I0, typename I1, typename I2, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, Args...>::value && (3 == Rank) && + is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, + Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, args...))) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; + } + + template <typename I0, typename I1, typename I2, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, Args...>::value && (3 == Rank) && + !is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, + Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, args...))) + return m_map.reference(i0, i1, i2); + } + + //------------------------------ + // Rank 4 + + template <typename I0, typename I1, typename I2, typename I3, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, Args...>::value && + (4 == Rank) && is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + Args... 
KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, args...))) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; + } + + template <typename I0, typename I1, typename I2, typename I3, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, Args...>::value && + (4 == Rank) && !is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, + Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, args...))) + return m_map.reference(i0, i1, i2, i3); + } + + //------------------------------ + // Rank 5 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, Args...>::value && + (5 == Rank) && is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4, + Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, args...))) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, Args...>::value && + (5 == Rank) && !is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4, + Args... 
KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, args...))) + return m_map.reference(i0, i1, i2, i3, i4); + } + + //------------------------------ + // Rank 6 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, Args...>::value && + (6 == Rank) && is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4, + const I5& i5, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, i5, args...))) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, Args...>::value && + (6 == Rank) && !is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4, + const I5& i5, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, i5, args...))) + return m_map.reference(i0, i1, i2, i3, i4, i5); + } + + //------------------------------ + // Rank 7 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, Args...>::value && + (7 == Rank) && is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4, + const I5& i5, const I6& i6, Args... 
KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, i5, i6, args...))) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, Args...>::value && + (7 == Rank) && !is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4, + const I5& i5, const I6& i6, Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( + KOKKOS_IMPL_SINK((m_track, m_map, i0, i1, i2, i3, i4, i5, i6, args...))) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + } + + //------------------------------ + // Rank 8 + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7, class... Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7, + Args...>::value && + (8 == Rank) && is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4, + const I5& i5, const I6& i6, const I7& i7, + Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(KOKKOS_IMPL_SINK( + (m_track, m_map, i0, i1, i2, i3, i4, i5, i6, i7, args...))) + return m_map + .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; + } + + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7, class... 
Args> + KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< + (Kokkos::Impl::are_integral<I0, I1, I2, I3, I4, I5, I6, I7, + Args...>::value && + (8 == Rank) && !is_default_map), + reference_type>::type + access(const I0& i0, const I1& i1, const I2& i2, const I3& i3, const I4& i4, + const I5& i5, const I6& i6, const I7& i7, + Args... KOKKOS_IMPL_SINK(args)) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(KOKKOS_IMPL_SINK( + (m_track, m_map, i0, i1, i2, i3, i4, i5, i6, i7, args...))) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); + } + +#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY + + //---------------------------------------- + // Standard destructor, constructors, and assignment operators + + KOKKOS_DEFAULTED_FUNCTION + ~View() = default; + + KOKKOS_DEFAULTED_FUNCTION + View() = default; + + KOKKOS_DEFAULTED_FUNCTION + View(const View&) = default; + + KOKKOS_DEFAULTED_FUNCTION + View(View&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + View& operator=(const View&) = default; + + KOKKOS_DEFAULTED_FUNCTION + View& operator=(View&&) = default; + + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + + template <class RT, class... RP> + KOKKOS_INLINE_FUNCTION View( + const View<RT, RP...>& rhs, + typename std::enable_if<Kokkos::Impl::ViewMapping< + traits, typename View<RT, RP...>::traits, + typename traits::specialize>::is_assignable_data_type>::type* = + nullptr) + : m_track(rhs), m_map() { + using SrcTraits = typename View<RT, RP...>::traits; + using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, + typename traits::specialize>; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + } + + template <class RT, class... 
RP> + KOKKOS_INLINE_FUNCTION typename std::enable_if< + Kokkos::Impl::ViewMapping< + traits, typename View<RT, RP...>::traits, + typename traits::specialize>::is_assignable_data_type, + View>::type& + operator=(const View<RT, RP...>& rhs) { + using SrcTraits = typename View<RT, RP...>::traits; + using Mapping = Kokkos::Impl::ViewMapping<traits, SrcTraits, + typename traits::specialize>; + static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + m_track.assign(rhs); + return *this; + } + + //---------------------------------------- + // Compatible subview constructor + // may assign unmanaged from managed. + + template <class RT, class... RP, class Arg0, class... Args> + KOKKOS_INLINE_FUNCTION View(const View<RT, RP...>& src_view, const Arg0 arg0, + Args... args) + : m_track(src_view), m_map() { + using SrcType = View<RT, RP...>; + + using Mapping = Kokkos::Impl::ViewMapping<void, typename SrcType::traits, + Arg0, Args...>; + + using DstType = typename Mapping::type; + + static_assert( + Kokkos::Impl::ViewMapping<traits, typename DstType::traits, + typename traits::specialize>::is_assignable, + "Subview construction requires compatible view and subview arguments"); + + Mapping::assign(m_map, src_view.m_map, arg0, args...); + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const { return m_track.m_tracker.use_count(); } + + inline const std::string label() const { + return m_track.m_tracker + .template get_label<typename traits::memory_space>(); + } + + //---------------------------------------- + // Allocation according to allocation properties and array layout + + template <class... 
P> + explicit inline View( + const Impl::ViewCtorProp<P...>& arg_prop, + typename std::enable_if<!Impl::ViewCtorProp<P...>::has_pointer, + typename traits::array_layout>::type const& + arg_layout) + : m_track(), m_map() { + // Append layout and spaces if not input + using alloc_prop_input = Impl::ViewCtorProp<P...>; + + // use 'std::integral_constant<unsigned,I>' for non-types + // to avoid duplicate class error. + using alloc_prop = Impl::ViewCtorProp< + P..., + typename std::conditional<alloc_prop_input::has_label, + std::integral_constant<unsigned int, 0>, + typename std::string>::type, + typename std::conditional< + alloc_prop_input::has_memory_space, + std::integral_constant<unsigned int, 1>, + typename traits::device_type::memory_space>::type, + typename std::conditional< + alloc_prop_input::has_execution_space, + std::integral_constant<unsigned int, 2>, + typename traits::device_type::execution_space>::type>; + + static_assert(traits::is_managed, + "View allocation constructor requires managed memory"); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. + Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + + // Copy the input allocation properties with possibly defaulted properties + alloc_prop prop_copy(arg_prop); + +//------------------------------------------------------------ +#if defined(KOKKOS_ENABLE_CUDA) + // If allocating in CudaUVMSpace must fence before and after + // the allocation to protect against possible concurrent access + // on the CPU and the GPU. + // Fence using the trait's execution space (which will be Kokkos::Cuda) + // to avoid incomplete type errors from using Kokkos::Cuda directly. 
+ if (std::is_same<Kokkos::CudaUVMSpace, + typename traits::device_type::memory_space>::value) { + typename traits::device_type::memory_space::execution_space().fence(); + } +#endif + //------------------------------------------------------------ + + Kokkos::Impl::SharedAllocationRecord<>* record = + m_map.allocate_shared(prop_copy, arg_layout); + +//------------------------------------------------------------ +#if defined(KOKKOS_ENABLE_CUDA) + if (std::is_same<Kokkos::CudaUVMSpace, + typename traits::device_type::memory_space>::value) { + typename traits::device_type::memory_space::execution_space().fence(); + } +#endif + //------------------------------------------------------------ + + // Setup and initialization complete, start tracking + m_track.m_tracker.assign_allocated_record_to_uninitialized(record); + } + + KOKKOS_INLINE_FUNCTION + void assign_data(pointer_type arg_data) { + m_track.m_tracker.clear(); + m_map.assign_data(arg_data); + } + + // Wrap memory according to properties and array layout + template <class... P> + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp<P...>& arg_prop, + typename std::enable_if<Impl::ViewCtorProp<P...>::has_pointer, + typename traits::array_layout>::type const& + arg_layout) + : m_track() // No memory tracking + , + m_map(arg_prop, arg_layout) { + static_assert( + std::is_same<pointer_type, + typename Impl::ViewCtorProp<P...>::pointer_type>::value, + "Constructing View to wrap user memory must supply matching pointer " + "type"); + } + + // Simple dimension-only layout + template <class... 
P> + explicit inline View( + const Impl::ViewCtorProp<P...>& arg_prop, + typename std::enable_if<!Impl::ViewCtorProp<P...>::has_pointer, + size_t>::type const arg_N0 = + KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + KOKKOS_IMPL_IF_ON_HOST + Impl::runtime_check_rank_host( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label()); + else Impl::runtime_check_rank_device( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + Impl::runtime_check_rank_host( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label()); +#else + Impl::runtime_check_rank_device( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); + +#endif + } + + template <class... 
P> + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp<P...>& arg_prop, + typename std::enable_if<Impl::ViewCtorProp<P...>::has_pointer, + size_t>::type const arg_N0 = + KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + KOKKOS_IMPL_IF_ON_HOST + Impl::runtime_check_rank_host( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label()); + else Impl::runtime_check_rank_device( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + Impl::runtime_check_rank_host( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label()); +#else + Impl::runtime_check_rank_device( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); + +#endif + } + + // Allocate with label and layout + template <typename Label> + explicit inline View( + const Label& arg_label, + typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value, + typename traits::array_layout>::type const& + arg_layout) + : View(Impl::ViewCtorProp<std::string>(arg_label), arg_layout) {} + + // Allocate label and layout, must disambiguate from subview constructor. 
+ template <typename Label> + explicit inline View( + const Label& arg_label, + typename std::enable_if<Kokkos::Impl::is_view_label<Label>::value, + const size_t>::type arg_N0 = + KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp<std::string>(arg_label), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not extent constructible. A layout object should " + "be passed too.\n"); + +#ifdef KOKKOS_ENABLE_OPENMPTARGET + KOKKOS_IMPL_IF_ON_HOST + Impl::runtime_check_rank_host( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label()); + else Impl::runtime_check_rank_device( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + Impl::runtime_check_rank_host( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label()); +#else + Impl::runtime_check_rank_device( + traits::rank_dynamic, + std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1, + arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); + +#endif + } + + // Construct view from ViewTracker and map + // This should be the preferred method because future extensions may need to + // use the ViewTracker class. 
+ template <class Traits> + KOKKOS_INLINE_FUNCTION View( + const view_tracker_type& track, + const Kokkos::Impl::ViewMapping<Traits, typename Traits::specialize>& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping<traits, Traits, typename traits::specialize>; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track.m_tracker); + } + + // Construct View from internal shared allocation tracker object and map + // This is here for backwards compatibility for classes that derive from + // Kokkos::View + template <class Traits> + KOKKOS_INLINE_FUNCTION View( + const typename view_tracker_type::track_type& track, + const Kokkos::Impl::ViewMapping<Traits, typename Traits::specialize>& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping<traits, Traits, typename traits::specialize>; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track); + } + + //---------------------------------------- + // Memory span required to wrap these dimensions. 
+  // Bytes needed to back a View with these runtime extents; usable before
+  // any allocation exists (e.g. to size a user-provided buffer).
+  static constexpr size_t required_allocation_size(
+      const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0,
+      const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0,
+      const size_t arg_N6 = 0, const size_t arg_N7 = 0) {
+    return map_type::memory_span(typename traits::array_layout(
+        arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7));
+  }
+
+  // Wrapping (unmanaged) constructor: adopt an existing pointer with up to
+  // eight runtime extents.  No allocation or reference counting occurs.
+  explicit KOKKOS_INLINE_FUNCTION View(
+      pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
+      : View(Impl::ViewCtorProp<pointer_type>(arg_ptr),
+             typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
+                                           arg_N4, arg_N5, arg_N6, arg_N7)) {
+    // Same rank/argument-count validation as the label constructor.
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    KOKKOS_IMPL_IF_ON_HOST
+    Impl::runtime_check_rank_host(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());
+    else Impl::runtime_check_rank_device(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+    Impl::runtime_check_rank_host(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());
+#else
+    Impl::runtime_check_rank_device(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+
+#endif
+  }
+
+  // Wrapping constructor taking an explicit layout object (required for
+  // LayoutStride, where extents alone are not enough).
+  explicit KOKKOS_INLINE_FUNCTION View(
+      pointer_type arg_ptr, const typename traits::array_layout& arg_layout)
+      : View(Impl::ViewCtorProp<pointer_type>(arg_ptr), arg_layout) {}
+
+  //----------------------------------------
+  // Shared scratch memory constructor
+
+  // Bytes of team/thread scratch needed for a View with these extents.
+  // Aborts for LayoutStride (pass a layout object instead) and when the
+  // number of valid extents differs from the dynamic rank.
+  static inline size_t shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX,
+                                  const size_t arg_N1 = KOKKOS_INVALID_INDEX,
+                                  const size_t arg_N2 = KOKKOS_INVALID_INDEX,
+                                  const size_t arg_N3 = KOKKOS_INVALID_INDEX,
+                                  const size_t arg_N4 = KOKKOS_INVALID_INDEX,
+                                  const size_t arg_N5 = KOKKOS_INVALID_INDEX,
+                                  const size_t arg_N6 = KOKKOS_INVALID_INDEX,
+                                  const size_t arg_N7 = KOKKOS_INVALID_INDEX) {
+    if (is_layout_stride) {
+      Kokkos::abort(
+          "Kokkos::View::shmem_size(extents...) doesn't work with "
+          "LayoutStride. Pass a LayoutStride object instead");
+    }
+    const size_t num_passed_args = Impl::count_valid_integers(
+        arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+
+    if (std::is_same<typename traits::specialize, void>::value &&
+        num_passed_args != traits::rank_dynamic) {
+      Kokkos::abort(
+          "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n");
+    }
+
+    return View::shmem_size(typename traits::array_layout(
+        arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7));
+  }
+
+  // NOTE(review): the extra sizeof(value_type) presumably provides headroom
+  // for the alignment adjustment done by get_shmem_aligned below -- confirm.
+  static inline size_t shmem_size(
+      typename traits::array_layout const& arg_layout) {
+    return map_type::memory_span(arg_layout) +
+           sizeof(typename traits::value_type);
+  }
+
+  // Scratch-memory constructor: carve this View's storage out of the
+  // execution space's scratch memory via get_shmem_aligned.
+  explicit KOKKOS_INLINE_FUNCTION View(
+      const typename traits::execution_space::scratch_memory_space& arg_space,
+      const typename traits::array_layout& arg_layout)
+      : View(Impl::ViewCtorProp<pointer_type>(
+                 reinterpret_cast<pointer_type>(arg_space.get_shmem_aligned(
+                     map_type::memory_span(arg_layout),
+                     sizeof(typename traits::value_type)))),
+             arg_layout) {}
+
+  // Scratch-memory constructor with runtime extents.
+  explicit KOKKOS_INLINE_FUNCTION View(
+      const typename traits::execution_space::scratch_memory_space& arg_space,
+      const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+      const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
+      : View(Impl::ViewCtorProp<pointer_type>(
+                 reinterpret_cast<pointer_type>(arg_space.get_shmem_aligned(
+                     map_type::memory_span(typename traits::array_layout(
+                         arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6,
+                         arg_N7)),
+                     sizeof(typename traits::value_type)))),
+             typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
+                                           arg_N4, arg_N5, arg_N6, arg_N7)) {
+    // Same rank/argument-count validation as the other extent constructors.
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    KOKKOS_IMPL_IF_ON_HOST
+    Impl::runtime_check_rank_host(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());
+    else Impl::runtime_check_rank_device(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+    Impl::runtime_check_rank_host(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7, label());
+#else
+    Impl::runtime_check_rank_device(
+        traits::rank_dynamic,
+        std::is_same<typename traits::specialize, void>::value, arg_N0, arg_N1,
+        arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7);
+
+#endif
+  }
+};
+
+/** \brief Temporary free function rank()
+ *         until rank() is implemented
+ *         in the View
+ */
+template <typename D, class...
P>
+KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View<D, P...>& V) {
+  return V.Rank;
+}  // Temporary until added to view
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Alias for the subview type deduced from a source view's traits and a set
+// of subsetting arguments (indices and/or index ranges).
+template <class V, class... Args>
+using Subview =
+    typename Kokkos::Impl::ViewMapping<void /* deduce subview type from source
+                                              view traits */
+                                       ,
+                                       typename V::traits, Args...>::type;
+
+// Take a subview of 'src'; exactly one argument per source rank is required
+// (enforced by the static_assert below).
+template <class D, class... P, class... Args>
+KOKKOS_INLINE_FUNCTION
+    typename Kokkos::Impl::ViewMapping<void /* deduce subview type from source
+                                              view traits */
+                                       ,
+                                       ViewTraits<D, P...>, Args...>::type
+    subview(const View<D, P...>& src, Args... args) {
+  static_assert(View<D, P...>::Rank == sizeof...(Args),
+                "subview requires one argument for each source View rank");
+
+  return typename Kokkos::Impl::ViewMapping<
+      void /* deduce subview type from source view traits */
+      ,
+      ViewTraits<D, P...>, Args...>::type(src, args...);
+}
+
+// Overload that additionally applies the given MemoryTraits to the result.
+template <class MemoryTraits, class D, class... P, class... Args>
+KOKKOS_INLINE_FUNCTION typename Kokkos::Impl::ViewMapping<
+    void /* deduce subview type from source view traits */
+    ,
+    ViewTraits<D, P...>, Args...>::template apply<MemoryTraits>::type
+subview(const View<D, P...>& src, Args... args) {
+  static_assert(View<D, P...>::Rank == sizeof...(Args),
+                "subview requires one argument for each source View rank");
+
+  return typename Kokkos::Impl::ViewMapping<
+      void /* deduce subview type from source view traits */
+      ,
+      ViewTraits<D, P...>,
+      Args...>::template apply<MemoryTraits>::type(src, args...);
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// Two views compare equal when their value/layout/memory-space types match,
+// their ranks match, and they reference the same data pointer, span, and
+// extents.  This is identity of the mapping, not element-wise comparison.
+template <class LT, class... LP, class RT, class... RP>
+KOKKOS_INLINE_FUNCTION bool operator==(const View<LT, LP...>& lhs,
+                                       const View<RT, RP...>& rhs) {
+  // Same data, layout, dimensions
+  using lhs_traits = ViewTraits<LT, LP...>;
+  using rhs_traits = ViewTraits<RT, RP...>;
+
+  return std::is_same<typename lhs_traits::const_value_type,
+                      typename rhs_traits::const_value_type>::value &&
+         std::is_same<typename lhs_traits::array_layout,
+                      typename rhs_traits::array_layout>::value &&
+         std::is_same<typename lhs_traits::memory_space,
+                      typename rhs_traits::memory_space>::value &&
+         unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) &&
+         lhs.data() == rhs.data() && lhs.span() == rhs.span() &&
+         lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) &&
+         lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) &&
+         lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) &&
+         lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7);
+}
+
+template <class LT, class... LP, class RT, class... RP>
+KOKKOS_INLINE_FUNCTION bool operator!=(const View<LT, LP...>& lhs,
+                                       const View<RT, RP...>& rhs) {
+  return !(operator==(lhs, rhs));
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Globally disable/enable shared-allocation (reference-count) tracking for
+// subsequently created allocations.
+inline void shared_allocation_tracking_disable() {
+  Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_disable();
+}
+
+inline void shared_allocation_tracking_enable() {
+  Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_enable();
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Common value type of two views' value types for a given specialization.
+template <class Specialize, typename A, typename B>
+struct CommonViewValueType;
+
+template <typename A, typename B>
+struct CommonViewValueType<void, A, B> {
+  // Unspecialized views: fall back to the standard common type.
+  using value_type = typename std::common_type<A, B>::type;
+};
+
+template <class Specialize, class ValueType>
+struct CommonViewAllocProp;
+
+template <class ValueType>
+struct CommonViewAllocProp<void, ValueType> {
+  using value_type        = ValueType;
+  using scalar_array_type = ValueType;
+
+  // Constructible from any set of views; carries only type information.
+  template <class... Views>
+  KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const Views&...) {}
+};
+
+template <class... Views>
+struct DeduceCommonViewAllocProp;
+
+// Base case must provide types for:
+// 1. specialize 2. value_type 3. is_view 4. prop_type
+template <class FirstView>
+struct DeduceCommonViewAllocProp<FirstView> {
+  using specialize = typename FirstView::traits::specialize;
+
+  using value_type = typename FirstView::traits::value_type;
+
+  enum : bool { is_view = is_view<FirstView>::value };
+
+  using prop_type = CommonViewAllocProp<specialize, value_type>;
+};
+
+// Recursive case: fold the first view's traits with those deduced from the
+// remaining views.
+template <class FirstView, class... NextViews>
+struct DeduceCommonViewAllocProp<FirstView, NextViews...> {
+  using NextTraits = DeduceCommonViewAllocProp<NextViews...>;
+
+  using first_specialize = typename FirstView::traits::specialize;
+  using first_value_type = typename FirstView::traits::value_type;
+
+  enum : bool { first_is_view = is_view<FirstView>::value };
+
+  using next_specialize = typename NextTraits::specialize;
+  using next_value_type = typename NextTraits::value_type;
+
+  enum : bool { next_is_view = NextTraits::is_view };
+
+  // common types
+
+  // determine specialize type
+  // if first and next specialize differ, but are not the same specialize, error
+  // out
+  static_assert(!(!std::is_same<first_specialize, next_specialize>::value &&
+                  !std::is_same<first_specialize, void>::value &&
+                  !std::is_same<void, next_specialize>::value),
+                "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void "
+                "specialize trait allowed");
+
+  // otherwise choose non-void specialize if either/both are non-void
+  using specialize = typename std::conditional<
+      std::is_same<first_specialize, next_specialize>::value, first_specialize,
+      typename std::conditional<(std::is_same<first_specialize, void>::value &&
+                                 !std::is_same<next_specialize, void>::value),
+                                next_specialize, first_specialize>::type>::type;
+
+  using value_type = typename CommonViewValueType<specialize, first_value_type,
+                                                  next_value_type>::value_type;
+
+  enum : bool { is_view = (first_is_view && next_is_view) };
+
+  using prop_type = CommonViewAllocProp<specialize, value_type>;
+};
+
+}  // end namespace Impl
+
+template <class... Views>
+using DeducedCommonPropsType =
+    typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type;
+
+// User function
+template <class... Views>
+KOKKOS_INLINE_FUNCTION DeducedCommonPropsType<Views...> common_view_alloc_prop(
+    Views const&... views) {
+  return DeducedCommonPropsType<Views...>(views...);
+}
+
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+using Kokkos::is_view;
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#include <impl/Kokkos_ViewUniformType.hpp>
+#include <impl/Kokkos_Atomic_View.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEW_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bdc8993c398f2dd6d6b581008d1f0c8d3535d860
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
@@ -0,0 +1,263 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_WORKGRAPHPOLICY_HPP +#define KOKKOS_WORKGRAPHPOLICY_HPP + +#include <impl/Kokkos_AnalyzePolicy.hpp> +#include <Kokkos_Crs.hpp> + +namespace Kokkos { +namespace Impl { + +template <class functor_type, class execution_space, class... 
policy_args>
+class WorkGraphExec;
+
+}
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+// Execution policy for directed-acyclic-graph ("work graph") parallelism:
+// each row of the Crs graph is a work item, and the entries of row w list
+// the items that become runnable only after w completes.
+template <class... Properties>
+class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
+ public:
+  using execution_policy = WorkGraphPolicy<Properties...>;
+  using self_type        = WorkGraphPolicy<Properties...>;
+  using traits           = Kokkos::Impl::PolicyTraits<Properties...>;
+  using index_type       = typename traits::index_type;
+  using member_type      = index_type;
+  using execution_space  = typename traits::execution_space;
+  using memory_space     = typename execution_space::memory_space;
+  using graph_type = Kokkos::Crs<index_type, execution_space, void, index_type>;
+
+  // Sentinels stored in / returned from the ready queue:
+  //   END_TOKEN       - slot not yet filled (no work available at this slot)
+  //   BEGIN_TOKEN     - slot's work item has been claimed by a worker
+  //   COMPLETED_TOKEN - pop_work's signal that the whole graph is done
+  enum : std::int32_t {
+    END_TOKEN       = -1,
+    BEGIN_TOKEN     = -2,
+    COMPLETED_TOKEN = -3
+  };
+
+ private:
+  using ints_type = Kokkos::View<std::int32_t*, memory_space>;
+
+  // Let N = m_graph.numRows(), the total work
+  // m_queue[ 0 ..  N-1] = the ready queue
+  // m_queue[ N .. 2*N-1] = the waiting queue counts
+  // m_queue[2*N .. 2*N+2] = the ready queue hints
+
+  graph_type const m_graph;
+  ints_type m_queue;
+
+  // Append work item 'w' at the end of the ready queue.  The end-hint slot
+  // index is claimed atomically; the claimed slot must still hold END_TOKEN,
+  // otherwise the queue state is corrupt and we abort.
+  KOKKOS_INLINE_FUNCTION
+  void push_work(const std::int32_t w) const noexcept {
+    const std::int32_t N = m_graph.numRows();
+
+    std::int32_t volatile* const ready_queue = &m_queue[0];
+    std::int32_t volatile* const end_hint    = &m_queue[2 * N + 1];
+
+    // Push work to end of queue
+    const std::int32_t j = atomic_fetch_add(end_hint, 1);
+
+    if ((N <= j) || (END_TOKEN != atomic_exchange(ready_queue + j, w))) {
+      // ERROR: past the end of queue or did not replace END_TOKEN
+      Kokkos::abort("WorkGraphPolicy push_work error");
+    }
+
+    memory_fence();
+  }
+
+ public:
+  /**\brief  Attempt to pop the work item at the head of the queue.
+   *
+   *  Find entry 'i' such that
+   *    ( m_queue[i] != BEGIN_TOKEN ) AND
+   *    ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
+   *  if found then
+   *    increment begin hint
+   *    return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
+   *  else if i < total work
+   *    return END_TOKEN
+   *  else
+   *    return COMPLETED_TOKEN
+   *
+   */
+  KOKKOS_INLINE_FUNCTION
+  std::int32_t pop_work() const noexcept {
+    const std::int32_t N = m_graph.numRows();
+
+    std::int32_t volatile* const ready_queue = &m_queue[0];
+    std::int32_t volatile* const begin_hint  = &m_queue[2 * N];
+
+    // begin hint is guaranteed to be less than or equal to
+    // actual begin location in the queue.
+
+    for (std::int32_t i = *begin_hint; i < N; ++i) {
+      const std::int32_t w = ready_queue[i];
+
+      if (w == END_TOKEN) {
+        return END_TOKEN;
+      }
+
+      if ((w != BEGIN_TOKEN) &&
+          (w == atomic_compare_exchange(ready_queue + i, w,
+                                        (std::int32_t)BEGIN_TOKEN))) {
+        // Attempt to claim ready work index succeeded,
+        // update the hint and return work index
+        atomic_increment(begin_hint);
+        return w;
+      }
+      // arrive here when ready_queue[i] == BEGIN_TOKEN
+    }
+
+    return COMPLETED_TOKEN;
+  }
+
+  // Mark work item 'w' completed: decrement the waiting count of each
+  // successor in the graph, pushing any successor whose count hits zero.
+  KOKKOS_INLINE_FUNCTION
+  void completed_work(std::int32_t w) const noexcept {
+    Kokkos::memory_fence();
+
+    // Make sure the completed work function's memory accesses are flushed.
+
+    const std::int32_t N = m_graph.numRows();
+
+    std::int32_t volatile* const count_queue = &m_queue[N];
+
+    const std::int32_t B = m_graph.row_map(w);
+    const std::int32_t E = m_graph.row_map(w + 1);
+
+    for (std::int32_t i = B; i < E; ++i) {
+      const std::int32_t j = m_graph.entries(i);
+      // atomic_fetch_add returns the previous value; 1 -> this was the
+      // last outstanding predecessor of j.
+      if (1 == atomic_fetch_add(count_queue + j, -1)) {
+        push_work(j);
+      }
+    }
+  }
+
+  struct TagInit {};
+  struct TagCount {};
+  struct TagReady {};
+
+  /**\brief  Initialize queue
+   *
+   *  m_queue[0..N-1] = END_TOKEN, the ready queue
+   *  m_queue[N..2*N-1] = 0, the waiting count queue
+   *  m_queue[2*N..2*N+1] = 0, begin/end hints for ready queue
+   */
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagInit, int i) const noexcept {
+    m_queue[i] = i < m_graph.numRows() ? END_TOKEN : 0;
+  }
+
+  // Accumulate, per work item, the number of predecessors it waits on.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagCount, int i) const noexcept {
+    std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()];
+
+    atomic_increment(count_queue + m_graph.entries[i]);
+  }
+
+  // Seed the ready queue with items that wait on no predecessor.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagReady, int w) const noexcept {
+    std::int32_t const* const count_queue = &m_queue[m_graph.numRows()];
+
+    if (0 == count_queue[w]) push_work(w);
+  }
+
+  execution_space space() const { return execution_space(); }
+
+  // Build the queue in three fenced passes: init tokens, accumulate
+  // predecessor counts, then push the initially-ready items.
+  WorkGraphPolicy(const graph_type& arg_graph)
+      : m_graph(arg_graph),
+        m_queue(view_alloc("queue", WithoutInitializing),
+                arg_graph.numRows() * 2 + 2) {
+    {  // Initialize
+      using policy_type  = RangePolicy<std::int32_t, execution_space, TagInit>;
+      using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+      const closure_type closure(*this, policy_type(0, m_queue.size()));
+      closure.execute();
+      execution_space().fence();
+    }
+
+    {  // execute-after counts
+      using policy_type  = RangePolicy<std::int32_t, execution_space, TagCount>;
+      using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+      const closure_type closure(*this, policy_type(0, m_graph.entries.size()));
+      closure.execute();
+      execution_space().fence();
+    }
+
+    {  // Scheduling ready tasks
+      using policy_type  = RangePolicy<std::int32_t, execution_space, TagReady>;
+      using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+      const closure_type closure(*this, policy_type(0, m_graph.numRows()));
+      closure.execute();
+      execution_space().fence();
+    }
+  }
+};
+
+}  // namespace Kokkos
+
+#ifdef KOKKOS_ENABLE_SERIAL
+#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+#include "OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp"
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+#include "Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp"
+#endif
+
+#ifdef KOKKOS_ENABLE_HIP
+#include "HIP/Kokkos_HIP_WorkGraphPolicy.hpp"
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+#include "Threads/Kokkos_Threads_WorkGraphPolicy.hpp"
+#endif
+
+#ifdef KOKKOS_ENABLE_HPX
+#include "HPX/Kokkos_HPX_WorkGraphPolicy.hpp"
+#endif
+
+#endif /* #define KOKKOS_WORKGRAPHPOLICY_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_hwloc.hpp b/packages/kokkos/core/src/Kokkos_hwloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..23fa0a0c67001161cb510e6904455091cc4b5986
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_hwloc.hpp
@@ -0,0 +1,144 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HWLOC_HPP +#define KOKKOS_HWLOC_HPP + +#include <Kokkos_Macros.hpp> + +#include <utility> + +namespace Kokkos { + +/** \brief Minimal subset of logical 'hwloc' functionality available + * from http://www.open-mpi.org/projects/hwloc/. + * + * The calls are NOT thread safe in order to avoid mutexes, + * memory allocations, or other actions which could give the + * runtime system an opportunity to migrate the threads or + * touch allocated memory during the function calls. + * + * All calls to these functions should be performed by a thread + * when it has guaranteed exclusive access; e.g., for OpenMP + * within a 'critical' region. 
+ */ +namespace hwloc { + +/** \brief Query if hwloc is available */ +bool available(); + +/** \brief Query number of available NUMA regions. + * This will be less than the hardware capacity + * if the MPI process is pinned to a NUMA region. + */ +unsigned get_available_numa_count(); + +/** \brief Query number of available cores per NUMA regions. + * This will be less than the hardware capacity + * if the MPI process is pinned to a set of cores. + */ +unsigned get_available_cores_per_numa(); + +/** \brief Query number of available "hard" threads per core; i.e., + * hyperthreads */ +unsigned get_available_threads_per_core(); + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Internal functions for binding persistent spawned threads. + +namespace Kokkos { +namespace hwloc { + +/** \brief Recommend mapping of threads onto cores. + * + * If thread_count == 0 then choose and set a value. + * If use_numa_count == 0 then choose and set a value. + * If use_cores_per_numa == 0 then choose and set a value. + * + * Return 0 if asynchronous, + * Return 1 if synchronous and threads_coord[0] is process core + */ +unsigned thread_mapping(const char* const label, const bool allow_async, + unsigned& thread_count, unsigned& use_numa_count, + unsigned& use_cores_per_numa, + std::pair<unsigned, unsigned> threads_coord[]); + +/** \brief Query core-coordinate of the current thread + * with respect to the core_topology. + * + * As long as the thread is running within the + * process binding the following condition holds. + * + * core_coordinate.first < core_topology.first + * core_coordinate.second < core_topology.second + */ +std::pair<unsigned, unsigned> get_this_thread_coordinate(); + +/** \brief Bind the current thread to a core. 
*/ +bool bind_this_thread(const std::pair<unsigned, unsigned>); + +/** \brief Can hwloc bind threads? */ +bool can_bind_threads(); + +/** \brief Bind the current thread to one of the cores in the list. + * Set that entry to (~0,~0) and return the index. + * If binding fails return ~0. + */ +unsigned bind_this_thread(const unsigned coordinate_count, + std::pair<unsigned, unsigned> coordinate[]); + +/** \brief Unbind the current thread back to the original process binding */ +bool unbind_this_thread(); + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_HWLOC_HPP */ diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e530612a57f81dace23777bdf98670dd73a9d026 --- /dev/null +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp @@ -0,0 +1,501 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_OPENMP)
+
+#include <cstdio>
+#include <cstdlib>
+
+#include <limits>
+#include <iostream>
+#include <vector>
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Tools.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+// Process-wide count of hardware threads available to OpenMP.
+int g_openmp_hardware_max_threads = 1;
+
+// Per-thread identity and per-thread pointer to the active OpenMP backend
+// instance (null outside an initialized instance / inside a parallel region).
+__thread int t_openmp_hardware_id = 0;
+__thread Impl::OpenMPExec *t_openmp_instance = nullptr;
+
+// Choose a (num_partitions, partition_size) pair for 'nthreads' threads.
+// Inputs < 1 mean "choose for me"; an over-subscribed request is shrunk.
+// In every search branch the pair minimizing idle (unused) threads wins.
+void OpenMPExec::validate_partition(const int nthreads, int &num_partitions,
+                                    int &partition_size) {
+  if (nthreads == 1) {
+    num_partitions = 1;
+    partition_size = 1;
+  } else if (num_partitions < 1 && partition_size < 1) {
+    int idle = nthreads;
+    for (int np = 2; np <= nthreads; ++np) {
+      for (int ps = 1; ps <= nthreads / np; ++ps) {
+        if (nthreads - np * ps < idle) {
+          idle           = nthreads - np * ps;
+          num_partitions = np;
+          partition_size = ps;
+        }
+        // NOTE(review): this 'break' exits only the inner loop; the outer
+        // loop keeps scanning even once idle == 0 -- confirm intended.
+        if (idle == 0) {
+          break;
+        }
+      }
+    }
+  } else if (num_partitions < 1 && partition_size > 0) {
+    if (partition_size <= nthreads) {
+      num_partitions = nthreads / partition_size;
+    } else {
+      num_partitions = 1;
+      partition_size = nthreads;
+    }
+  } else if (num_partitions > 0 && partition_size < 1) {
+    if (num_partitions <= nthreads) {
+      partition_size = nthreads / num_partitions;
+    } else {
+      num_partitions = nthreads;
+      partition_size = 1;
+    }
+  } else if (num_partitions * partition_size > nthreads) {
+    // Requested product exceeds the thread count: search downward from the
+    // request for the feasible pair with the fewest idle threads.
+    int idle     = nthreads;
+    const int NP = num_partitions;
+    const int PS = partition_size;
+    for (int np = NP; np > 0; --np) {
+      for (int ps = PS; ps > 0; --ps) {
+        if ((np * ps <= nthreads) && (nthreads - np * ps < idle)) {
+          idle           = nthreads - np * ps;
+          num_partitions = np;
+          partition_size = ps;
+        }
+        if (idle == 0) {
+          break;
+        }
+      }
+    }
+  }
+}
+
+// Throw (with 'label' in the message) unless called from an initialized
+// serial/master context.
+void OpenMPExec::verify_is_master(const char *const label) {
+  if (!t_openmp_instance) {
+    std::string msg(label);
+    msg.append(" ERROR: in parallel or not initialized");
+    Kokkos::Impl::throw_runtime_exception(msg);
+  }
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Release every thread's HostThreadTeamData scratch allocation; each pool
+// member frees its own entry from inside a parallel region.
+void OpenMPExec::clear_thread_data() {
+  const size_t member_bytes =
+      sizeof(int64_t) *
+      HostThreadTeamData::align_to_int64(sizeof(HostThreadTeamData));
+
+  const int old_alloc_bytes =
+      m_pool[0] ? (member_bytes + m_pool[0]->scratch_bytes()) : 0;
+
+  OpenMP::memory_space space;
+
+#pragma omp parallel num_threads(m_pool_size)
+  {
+    const int rank = omp_get_thread_num();
+
+    if (nullptr != m_pool[rank]) {
+      m_pool[rank]->disband_pool();
+
+      space.deallocate(m_pool[rank], old_alloc_bytes);
+
+      m_pool[rank] = nullptr;
+    }
+  }
+  /* END #pragma omp parallel */
+}
+
+// Grow (never shrink) each thread's scratch allocation so that all four
+// requested scratch regions fit; reallocation happens only when some region
+// is currently too small.
+void OpenMPExec::resize_thread_data(size_t pool_reduce_bytes,
+                                    size_t team_reduce_bytes,
+                                    size_t team_shared_bytes,
+                                    size_t thread_local_bytes) {
+  const size_t member_bytes =
+      sizeof(int64_t) *
+      HostThreadTeamData::align_to_int64(sizeof(HostThreadTeamData));
+
+  HostThreadTeamData *root = m_pool[0];
+
+  const size_t old_pool_reduce  = root ? root->pool_reduce_bytes() : 0;
+  const size_t old_team_reduce  = root ? root->team_reduce_bytes() : 0;
+  const size_t old_team_shared  = root ? root->team_shared_bytes() : 0;
+  const size_t old_thread_local = root ? root->thread_local_bytes() : 0;
+  const size_t old_alloc_bytes =
+      root ? (member_bytes + root->scratch_bytes()) : 0;
+
+  // Allocate if any of the old allocation is too small:
+
+  const bool allocate = (old_pool_reduce < pool_reduce_bytes) ||
+                        (old_team_reduce < team_reduce_bytes) ||
+                        (old_team_shared < team_shared_bytes) ||
+                        (old_thread_local < thread_local_bytes);
+
+  if (allocate) {
+    // Never shrink a region that was already larger than requested.
+    if (pool_reduce_bytes < old_pool_reduce) {
+      pool_reduce_bytes = old_pool_reduce;
+    }
+    if (team_reduce_bytes < old_team_reduce) {
+      team_reduce_bytes = old_team_reduce;
+    }
+    if (team_shared_bytes < old_team_shared) {
+      team_shared_bytes = old_team_shared;
+    }
+    if (thread_local_bytes < old_thread_local) {
+      thread_local_bytes = old_thread_local;
+    }
+
+    const size_t alloc_bytes =
+        member_bytes +
+        HostThreadTeamData::scratch_size(pool_reduce_bytes, team_reduce_bytes,
+                                         team_shared_bytes, thread_local_bytes);
+
+    OpenMP::memory_space space;
+
+    memory_fence();
+
+#pragma omp parallel num_threads(m_pool_size)
+    {
+      const int rank = omp_get_thread_num();
+
+      if (nullptr != m_pool[rank]) {
+        m_pool[rank]->disband_pool();
+
+        space.deallocate(m_pool[rank], old_alloc_bytes);
+      }
+
+      void *ptr = nullptr;
+      try {
+        ptr = space.allocate(alloc_bytes);
+      } catch (
+          Kokkos::Experimental::RawMemoryAllocationFailure const &failure) {
+        // For now, just rethrow the error message the existing way
+        Kokkos::Impl::throw_runtime_exception(failure.get_error_message());
+      }
+
+      // Placement-new the team-data header, then carve the scratch regions
+      // out of the remainder of the allocation.
+      m_pool[rank] = new (ptr) HostThreadTeamData();
+
+      m_pool[rank]->scratch_assign(((char *)ptr) + member_bytes, alloc_bytes,
+                                   pool_reduce_bytes, team_reduce_bytes,
+                                   team_shared_bytes, thread_local_bytes);
+
+      memory_fence();
+    }
+    /* END #pragma omp parallel */
+
+    HostThreadTeamData::organize_pool(m_pool, m_pool_size);
+  }
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
//----------------------------------------------------------------------------

// Count the usable OpenMP threads by entering a parallel region and
// counting participants, rather than trusting omp_get_max_threads()
// (see note below).
int OpenMP::impl_get_current_max_threads() noexcept {
  // Using omp_get_max_threads(); is problematic in conjunction with
  // Hwloc on Intel (essentially an initial call to the OpenMP runtime
  // without a parallel region before will set a process mask for a single core
  // The runtime will then bind threads for a parallel region to other cores on
  // the entering the first parallel region and make the process mask the
  // aggregate of the thread masks. The intent seems to be to make serial code
  // run fast, if you compile with OpenMP enabled but don't actually use
  // parallel regions or so
  // static int omp_max_threads = omp_get_max_threads();

  int count = 0;
#pragma omp parallel
  {
#pragma omp atomic
    ++count;
  }
  return count;
}

// Initialize the OpenMP backend.  thread_count semantics are documented
// inline below (negative / zero / positive cases).
void OpenMP::impl_initialize(int thread_count) {
  if (omp_in_parallel()) {
    std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel");
    Kokkos::Impl::throw_runtime_exception(msg);
  }

  // Re-initialization: tear down the previous instance first.
  if (Impl::t_openmp_instance) {
    finalize();
  }

  {
    if (Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND")) {
      printf(
          "Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment "
          "variable not set\n");
      printf(
          " In general, for best performance with OpenMP 4.0 or better set "
          "OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
      printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
      printf(" For unit testing set OMP_PROC_BIND=false\n");
    }

    OpenMP::memory_space space;

    // Before any other call to OMP query the maximum number of threads
    // and save the value for re-initialization unit testing.

    Impl::g_openmp_hardware_max_threads = impl_get_current_max_threads();

    int process_num_threads = Impl::g_openmp_hardware_max_threads;

    // Prefer the hwloc topology (numa x cores x hyperthreads) when
    // available.
    if (Kokkos::hwloc::available()) {
      process_num_threads = Kokkos::hwloc::get_available_numa_count() *
                            Kokkos::hwloc::get_available_cores_per_numa() *
                            Kokkos::hwloc::get_available_threads_per_core();
    }

    // if thread_count < 0, use g_openmp_hardware_max_threads;
    // if thread_count == 0, set g_openmp_hardware_max_threads to
    // process_num_threads
    // if thread_count > 0, set g_openmp_hardware_max_threads to thread_count
    if (thread_count < 0) {
      thread_count = Impl::g_openmp_hardware_max_threads;
    } else if (thread_count == 0 &&
               Impl::g_openmp_hardware_max_threads != process_num_threads) {
      Impl::g_openmp_hardware_max_threads = process_num_threads;
      omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
    } else {
      if (Kokkos::show_warnings() && thread_count > process_num_threads) {
        printf(
            "Kokkos::OpenMP::initialize WARNING: You are likely "
            "oversubscribing your CPU cores.\n");
        printf(" process threads available : %3d, requested thread : %3d\n",
               process_num_threads, thread_count);
      }
      Impl::g_openmp_hardware_max_threads = thread_count;
      omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
    }

// setup thread local
#pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads)
    {
      Impl::t_openmp_instance = nullptr;
      Impl::t_openmp_hardware_id = omp_get_thread_num();
      Impl::SharedAllocationRecord<void, void>::tracking_enable();
    }

    void *ptr = nullptr;
    try {
      ptr = space.allocate(sizeof(Impl::OpenMPExec));
    } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &f) {
      // For now, just rethrow the error message the existing way
      Kokkos::Impl::throw_runtime_exception(f.get_error_message());
    }

    Impl::t_openmp_instance =
        new (ptr) Impl::OpenMPExec(Impl::g_openmp_hardware_max_threads);

    // New, unified host thread team data:
    {
      size_t pool_reduce_bytes = 32 * thread_count;
      size_t team_reduce_bytes = 32 * thread_count;
      size_t team_shared_bytes = 1024 * thread_count;
      size_t thread_local_bytes = 1024;

      Impl::t_openmp_instance->resize_thread_data(
          pool_reduce_bytes, team_reduce_bytes, team_shared_bytes,
          thread_local_bytes);
    }
  }

  // Check for over-subscription
  if (Kokkos::show_warnings() &&
      (Impl::mpi_ranks_per_node() * long(thread_count) >
       Impl::processors_per_node())) {
    std::cerr << "Kokkos::OpenMP::initialize WARNING: You are likely "
                 "oversubscribing your CPU cores."
              << std::endl;
    std::cerr << " Detected: " << Impl::processors_per_node()
              << " cores per node." << std::endl;
    std::cerr << " Detected: " << Impl::mpi_ranks_per_node()
              << " MPI_ranks per node." << std::endl;
    std::cerr << " Requested: " << thread_count << " threads per process."
              << std::endl;
  }
  // Init the array used for arbitrarily sized atomics
  Impl::init_lock_array_host_space();
}

//----------------------------------------------------------------------------

// Tear down the backend: destroy the exec instance, reset the thread-local
// state on every thread, and re-enable allocation tracking on the master.
void OpenMP::impl_finalize() {
  if (omp_in_parallel()) {
    std::string msg("Kokkos::OpenMP::finalize ERROR ");
    if (!Impl::t_openmp_instance) msg.append(": not initialized");
    if (omp_in_parallel()) msg.append(": in parallel");
    Kokkos::Impl::throw_runtime_exception(msg);
  }

  if (Impl::t_openmp_instance) {
    // Silence Cuda Warning
    // nthreads = max(pool size, hardware max) so the parallel region below
    // reaches every thread that may hold thread-local state.
    const int nthreads = Impl::t_openmp_instance->m_pool_size <=
                                 Impl::g_openmp_hardware_max_threads
                             ? Impl::g_openmp_hardware_max_threads
                             : Impl::t_openmp_instance->m_pool_size;
    (void)nthreads;

    using Exec = Impl::OpenMPExec;
    Exec *instance = Impl::t_openmp_instance;
    instance->~Exec();

    OpenMP::memory_space space;
    space.deallocate(instance, sizeof(Exec));

#pragma omp parallel num_threads(nthreads)
    {
      Impl::t_openmp_hardware_id = 0;
      Impl::t_openmp_instance = nullptr;
      Impl::SharedAllocationRecord<void, void>::tracking_disable();
    }

    // allow main thread to track
    Impl::SharedAllocationRecord<void, void>::tracking_enable();

    Impl::g_openmp_hardware_max_threads = 1;
  }

  Kokkos::Profiling::finalize();
}

//----------------------------------------------------------------------------

// Print the backend configuration; the host pool is presented as a flat
// 1 x threads x 1 topology.
void OpenMP::print_configuration(std::ostream &s, const bool /*verbose*/) {
  s << "Kokkos::OpenMP";

  const bool is_initialized = Impl::t_openmp_instance != nullptr;

  if (is_initialized) {
    Impl::OpenMPExec::verify_is_master("OpenMP::print_configuration");

    const int numa_count = 1;
    const int core_per_numa = Impl::g_openmp_hardware_max_threads;
    const int thread_per_core = 1;

    s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa
      << " x " << thread_per_core << " ]" << std::endl;
  } else {
    s << " not initialized" << std::endl;
  }
}

std::vector<OpenMP> OpenMP::partition(...) { return std::vector<OpenMP>(1); }

OpenMP OpenMP::create_instance(...) { return OpenMP(); }

int OpenMP::concurrency() { return Impl::g_openmp_hardware_max_threads; }

void OpenMP::fence() const {}

namespace Impl {

int g_openmp_space_factory_initialized =
    initialize_space_factory<OpenMPSpaceInitializer>("050_OpenMP");

void OpenMPSpaceInitializer::initialize(const InitArguments &args) {
  // Prevent "unused variable" warning for 'args' input struct. If
  // Serial::initialize() ever needs to take arguments from the input
  // struct, you may remove this line of code.
  const int num_threads = args.num_threads;

  // Only initialize when OpenMP is the default or the host execution space.
  if (std::is_same<Kokkos::OpenMP, Kokkos::DefaultExecutionSpace>::value ||
      std::is_same<Kokkos::OpenMP, Kokkos::HostSpace::execution_space>::value) {
    Kokkos::OpenMP::impl_initialize(num_threads);
  } else {
    // std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not
    // initialized" << std::endl ;
  }
}

void OpenMPSpaceInitializer::finalize(const bool) {
  if (Kokkos::OpenMP::impl_is_initialized()) Kokkos::OpenMP::impl_finalize();
}

void OpenMPSpaceInitializer::fence() { Kokkos::OpenMP::impl_static_fence(); }

void OpenMPSpaceInitializer::print_configuration(std::ostream &msg,
                                                 const bool detail) {
  msg << "Host Parallel Execution Space:" << std::endl;
  msg << " KOKKOS_ENABLE_OPENMP: ";
  msg << "yes" << std::endl;

  msg << "OpenMP Atomics:" << std::endl;
  msg << " KOKKOS_ENABLE_OPENMP_ATOMICS: ";
#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
  msg << "yes" << std::endl;
#else
  msg << "no" << std::endl;
#endif

  msg << "\nOpenMP Runtime Configuration:" << std::endl;
  OpenMP::print_configuration(msg, detail);
}

}  // namespace Impl
}  // namespace Kokkos

#else
void KOKKOS_CORE_SRC_OPENMP_EXEC_PREVENT_LINK_ERROR() {}
#endif  // KOKKOS_ENABLE_OPENMP
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..82f049ed136119c28b4add24f1460831fec55b16
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@@ -0,0 +1,362 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_OPENMPEXEC_HPP
#define KOKKOS_OPENMPEXEC_HPP

#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_OPENMP)

#if !defined(_OPENMP) && !defined(__CUDA_ARCH__) && \
    !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
#error \
    "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
#endif

#include <Kokkos_OpenMP.hpp>

#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>

#include <Kokkos_Atomic.hpp>

#include <Kokkos_UniqueToken.hpp>
#include <impl/Kokkos_ConcurrentBitset.hpp>

#include <omp.h>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

class OpenMPExec;

// Maximum usable hardware threads, set by OpenMP::impl_initialize.
extern int g_openmp_hardware_max_threads;

// Per-thread hardware id and backend instance pointer; the instance
// pointer is non-null only on an initialized master thread.
extern __thread int t_openmp_hardware_id;
extern __thread OpenMPExec* t_openmp_instance;

//----------------------------------------------------------------------------
/** \brief Data for OpenMP thread execution */

class OpenMPExec {
 public:
  friend class Kokkos::OpenMP;

  enum { MAX_THREAD_COUNT = 512 };

  void clear_thread_data();

  static void validate_partition(const int nthreads, int& num_partitions,
                                 int& partition_size);

 private:
  OpenMPExec(int arg_pool_size)
      : m_pool_size{arg_pool_size}, m_level{omp_get_level()}, m_pool() {}

  ~OpenMPExec() { clear_thread_data(); }

  int m_pool_size;
  int m_level;  // omp nesting level at construction time

  HostThreadTeamData* m_pool[MAX_THREAD_COUNT];

 public:
  static void verify_is_master(const char* const);

  void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes,
                          size_t team_shared_bytes, size_t thread_local_bytes);

  // Pool entry of the calling thread: slot 0 when still at construction
  // nesting level (master), otherwise the omp thread rank.
  inline HostThreadTeamData* get_thread_data() const noexcept {
    return m_pool[m_level == omp_get_level() ? 0 : omp_get_thread_num()];
  }

  inline HostThreadTeamData* get_thread_data(int i) const noexcept {
    return m_pool[i];
  }
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {

inline bool OpenMP::impl_is_initialized() noexcept {
  return Impl::t_openmp_instance != nullptr;
}

inline bool OpenMP::in_parallel(OpenMP const&) noexcept {
  // t_openmp_instance is only non-null on a master thread
  return !Impl::t_openmp_instance ||
         Impl::t_openmp_instance->m_level < omp_get_level();
}

inline int OpenMP::impl_thread_pool_size() noexcept {
  return OpenMP::in_parallel() ? omp_get_num_threads()
                               : Impl::t_openmp_instance->m_pool_size;
}

KOKKOS_INLINE_FUNCTION
int OpenMP::impl_thread_pool_rank() noexcept {
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
  return Impl::t_openmp_instance ? 0 : omp_get_thread_num();
#else
  return -1;
#endif
}

inline void OpenMP::impl_static_fence(OpenMP const& /*instance*/) noexcept {}

inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept {
  return false;
}

// Run f(partition_rank, partition_size) on each of num_partitions master
// threads, each with its own temporary OpenMPExec instance.  Requires
// nested OpenMP parallelism; otherwise falls back to f(0, 1).
template <typename F>
void OpenMP::partition_master(F const& f, int num_partitions,
                              int partition_size) {
#if _OPENMP >= 201811
  // omp_get_nested() is deprecated as of OpenMP 5.0.
  if (omp_get_max_active_levels() > 1) {
#else
  if (omp_get_nested()) {
#endif
    using Exec = Impl::OpenMPExec;

    Exec* prev_instance = Impl::t_openmp_instance;

    Exec::validate_partition(prev_instance->m_pool_size, num_partitions,
                             partition_size);

    OpenMP::memory_space space;

#pragma omp parallel num_threads(num_partitions)
    {
      void* ptr = nullptr;
      try {
        ptr = space.allocate(sizeof(Exec));
      } catch (
          Kokkos::Experimental::RawMemoryAllocationFailure const& failure) {
        // For now, just rethrow the error message the existing way
        Kokkos::Impl::throw_runtime_exception(failure.get_error_message());
      }

      // Each partition master gets its own instance as its thread-local.
      Impl::t_openmp_instance = new (ptr) Exec(partition_size);

      size_t pool_reduce_bytes = 32 * partition_size;
      size_t team_reduce_bytes = 32 * partition_size;
      size_t team_shared_bytes = 1024 * partition_size;
      size_t thread_local_bytes = 1024;

      Impl::t_openmp_instance->resize_thread_data(
          pool_reduce_bytes, team_reduce_bytes, team_shared_bytes,
          thread_local_bytes);

      omp_set_num_threads(partition_size);
      f(omp_get_thread_num(), omp_get_num_threads());

      Impl::t_openmp_instance->~Exec();
      space.deallocate(Impl::t_openmp_instance, sizeof(Exec));
      Impl::t_openmp_instance = nullptr;
    }

    Impl::t_openmp_instance = prev_instance;
  } else {
    // nested openmp not enabled
    f(0, 1);
  }
}

namespace Experimental {

// RAII-free wrapper around an OpenMP runtime lock.
template <>
class MasterLock<OpenMP> {
 public:
  void lock() { omp_set_lock(&m_lock); }
  void unlock() { omp_unset_lock(&m_lock); }
  bool try_lock() { return static_cast<bool>(omp_test_lock(&m_lock)); }

  MasterLock() { omp_init_lock(&m_lock); }
  ~MasterLock() { omp_destroy_lock(&m_lock); }

  MasterLock(MasterLock const&) = delete;
  MasterLock(MasterLock&&) = delete;
  MasterLock& operator=(MasterLock const&) = delete;
  MasterLock& operator=(MasterLock&&) = delete;

 private:
  omp_lock_t m_lock;
};

template <>
class UniqueToken<OpenMP, UniqueTokenScope::Instance> {
 private:
  using buffer_type = Kokkos::View<uint32_t*, Kokkos::HostSpace>;
  int m_count;
  buffer_type m_buffer_view;
  uint32_t volatile* m_buffer;

 public:
  using execution_space = OpenMP;
  using size_type = int;

  /// \brief create object size for concurrency on the given instance
  ///
  /// This object should not be shared between instances
  UniqueToken(execution_space const& = execution_space()) noexcept
      : m_count(::Kokkos::OpenMP::impl_thread_pool_size()),
        m_buffer_view(buffer_type()),
        m_buffer(nullptr) {}

  UniqueToken(size_type max_size, execution_space const& = execution_space())
      : m_count(max_size),
        m_buffer_view("UniqueToken::m_buffer_view",
                      ::Kokkos::Impl::concurrent_bitset::buffer_bound(m_count)),
        m_buffer(m_buffer_view.data()) {}

  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
  KOKKOS_INLINE_FUNCTION
  int size() const noexcept {
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
    return m_count;
#else
    return 0;
#endif
  }

  /// \brief acquire value such that 0 <= value < size()
  KOKKOS_INLINE_FUNCTION
  int acquire() const noexcept {
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
    // When tokens cover the whole pool the thread rank is already unique;
    // otherwise take a slot from the shared bitset.
    if (m_count >= ::Kokkos::OpenMP::impl_thread_pool_size())
      return ::Kokkos::OpenMP::impl_thread_pool_rank();
    const ::Kokkos::pair<int, int> result =
        ::Kokkos::Impl::concurrent_bitset::acquire_bounded(
            m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count);

    if (result.first < 0) {
      ::Kokkos::abort(
          "UniqueToken<OpenMP> failure to acquire tokens, no tokens available");
    }

    return result.first;
#else
    return 0;
#endif
  }

  /// \brief release a value acquired by generate
  KOKKOS_INLINE_FUNCTION
  void release(int i) const noexcept {
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
    // Only bitset-acquired tokens need releasing (see acquire()).
    if (m_count < ::Kokkos::OpenMP::impl_thread_pool_size())
      ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
#else
    (void)i;
#endif
  }
};

template <>
class UniqueToken<OpenMP, UniqueTokenScope::Global> {
 public:
  using execution_space = OpenMP;
  using size_type = int;

  /// \brief create object size for concurrency on the given instance
  ///
  /// This object should not be shared between instances
  UniqueToken(execution_space const& = execution_space()) noexcept {}

  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
  KOKKOS_INLINE_FUNCTION
  int size() const noexcept {
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
    return Kokkos::Impl::g_openmp_hardware_max_threads;
#else
    return 0;
#endif
  }

  /// \brief acquire value such that 0 <= value < size()
  KOKKOS_INLINE_FUNCTION
  int acquire() const noexcept {
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
    return Kokkos::Impl::t_openmp_hardware_id;
#else
    return 0;
#endif
  }

  /// \brief release a value acquired by generate
  KOKKOS_INLINE_FUNCTION
  void release(int) const noexcept {}
};

}  // namespace Experimental

inline int OpenMP::impl_thread_pool_size(int depth) {
  return depth < 2 ? impl_thread_pool_size() : 1;
}

KOKKOS_INLINE_FUNCTION
int OpenMP::impl_hardware_thread_id() noexcept {
#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
  return Impl::t_openmp_hardware_id;
#else
  return -1;
#endif
}

inline int OpenMP::impl_max_hardware_threads() noexcept {
  return Impl::g_openmp_hardware_max_threads;
}

}  // namespace Kokkos

#endif
#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2fc522780a495971a1d6455e19260bad0b422207
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@@ -0,0 +1,1184 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1.
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_OPENMP_PARALLEL_HPP
#define KOKKOS_OPENMP_PARALLEL_HPP

#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_OPENMP)

#include <omp.h>
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>

#include <KokkosExp_MDRangePolicy.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// parallel_for over a 1-D RangePolicy on the OpenMP backend.  Work is
// distributed via per-thread partitions, with work stealing for dynamic
// schedules.
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::OpenMP> {
 private:
  using Policy = Kokkos::RangePolicy<Traits...>;
  using WorkTag = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member = typename Policy::member_type;

  OpenMPExec* m_instance;
  const FunctorType m_functor;
  const Policy m_policy;

  // Untagged functor: invoked as functor(i).
  template <class TagType>
  inline static
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_range(const FunctorType& functor, const Member ibeg,
                 const Member iend) {
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
    for (Member iwork = ibeg; iwork < iend; ++iwork) {
      functor(iwork);
    }
  }

  // Tagged functor: invoked as functor(tag, i).
  template <class TagType>
  inline static
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_range(const FunctorType& functor, const Member ibeg,
                 const Member iend) {
    const TagType t{};
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
    for (Member iwork = ibeg; iwork < iend; ++iwork) {
      functor(t, iwork);
    }
  }

 public:
  inline void execute() const {
    enum {
      is_dynamic = std::is_same<typename Policy::schedule_type::type,
                                Kokkos::Dynamic>::value
    };

    if (OpenMP::in_parallel()) {
      // Already inside a parallel region: run the whole range serially here.
      exec_range<WorkTag>(m_functor, m_policy.begin(), m_policy.end());
    } else {
      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");

#pragma omp parallel num_threads(OpenMP::impl_thread_pool_size())
      {
        HostThreadTeamData& data = *(m_instance->get_thread_data());

        data.set_work_partition(m_policy.end() - m_policy.begin(),
                                m_policy.chunk_size());

        if (is_dynamic) {
          // Make sure work partition is set before stealing
          if (data.pool_rendezvous()) data.pool_rendezvous_release();
        }

        std::pair<int64_t, int64_t> range(0, 0);

        // Static schedule executes one partition; dynamic schedules loop
        // stealing chunks until none remain (range.first < 0).
        do {
          range = is_dynamic ? data.get_work_stealing_chunk()
                             : data.get_work_partition();

          ParallelFor::template exec_range<WorkTag>(
              m_functor, range.first + m_policy.begin(),
              range.second + m_policy.begin());

        } while (is_dynamic && 0 <= range.first);
      }
    }
  }

  inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
      : m_instance(t_openmp_instance),
        m_functor(arg_functor),
        m_policy(arg_policy) {}
};

// MDRangePolicy impl
// parallel_for over an MDRangePolicy: tiles are enumerated by a flat
// RangePolicy and each tile is iterated by HostIterateTile.
template <class FunctorType, class... Traits>
class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                  Kokkos::OpenMP> {
 private:
  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
  using Policy = typename MDRangePolicy::impl_range_policy;
  using WorkTag = typename MDRangePolicy::work_tag;

  using WorkRange = typename Policy::WorkRange;
  using Member = typename Policy::member_type;

  using iterate_type = typename Kokkos::Impl::HostIterateTile<
      MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>;

  OpenMPExec* m_instance;
  const FunctorType m_functor;
  const MDRangePolicy m_mdr_policy;
  const Policy m_policy;  // construct as RangePolicy( 0, num_tiles
                          // ).set_chunk_size(1) in ctor

  inline static void exec_range(const MDRangePolicy& mdr_policy,
                                const FunctorType& functor, const Member ibeg,
                                const Member iend) {
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
    for (Member iwork = ibeg; iwork < iend; ++iwork) {
      iterate_type(mdr_policy, functor)(iwork);
    }
  }

 public:
  inline void execute() const {
    enum {
      is_dynamic = std::is_same<typename Policy::schedule_type::type,
                                Kokkos::Dynamic>::value
    };

    if (OpenMP::in_parallel()) {
      ParallelFor::exec_range(m_mdr_policy, m_functor, m_policy.begin(),
                              m_policy.end());
    } else {
      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");

#pragma omp parallel num_threads(OpenMP::impl_thread_pool_size())
      {
        HostThreadTeamData& data = *(m_instance->get_thread_data());

        data.set_work_partition(m_policy.end() - m_policy.begin(),
                                m_policy.chunk_size());

        if (is_dynamic) {
          // Make sure work partition is set before stealing
          if (data.pool_rendezvous()) data.pool_rendezvous_release();
        }

        std::pair<int64_t, int64_t> range(0, 0);

        do {
          range = is_dynamic ? data.get_work_stealing_chunk()
                             : data.get_work_partition();

          ParallelFor::exec_range(m_mdr_policy, m_functor,
                                  range.first + m_policy.begin(),
                                  range.second + m_policy.begin());

        } while (is_dynamic && 0 <= range.first);
      }
      // END #pragma omp parallel
    }
  }

  inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy)
      : m_instance(t_openmp_instance),
        m_functor(arg_functor),
        m_mdr_policy(arg_policy),
        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
  template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy&, const Functor&) {
    /**
     * 1024 here is just our guess for a reasonable max tile size,
     * it isn't a hardware constraint. If people see a use for larger
     * tile size products, we're happy to change this.
     */
    return 1024;
  }
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// parallel_reduce over a 1-D RangePolicy: each thread reduces into its
// pool_reduce_local scratch, then thread 0's slot is joined with all
// others and finalized.
template <class FunctorType, class ReducerType, class... Traits>
class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                     Kokkos::OpenMP> {
 private:
  using Policy = Kokkos::RangePolicy<Traits...>;

  using WorkTag = typename Policy::work_tag;
  using WorkRange = typename Policy::WorkRange;
  using Member = typename Policy::member_type;

  using Analysis =
      FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;

  // When no explicit reducer was given (ReducerType == InvalidType) the
  // functor itself supplies init/join/final.
  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  using WorkTagFwd =
      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                         void>;

  // Static Assert WorkTag void if ReducerType not InvalidType

  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;

  using pointer_type = typename Analysis::pointer_type;
  using reference_type = typename Analysis::reference_type;

  OpenMPExec* m_instance;
  const FunctorType m_functor;
  const Policy m_policy;
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;

  // Untagged functor: invoked as functor(i, update).
  template <class TagType>
  inline static
      typename std::enable_if<std::is_same<TagType, void>::value>::type
      exec_range(const FunctorType& functor, const Member ibeg,
                 const Member iend, reference_type update) {
    for (Member iwork = ibeg; iwork < iend; ++iwork) {
      functor(iwork, update);
    }
  }

  // Tagged functor: invoked as functor(tag, i, update).
  template <class TagType>
  inline static
      typename std::enable_if<!std::is_same<TagType, void>::value>::type
      exec_range(const FunctorType& functor, const Member ibeg,
                 const Member iend, reference_type update) {
    const TagType t{};
    for (Member iwork = ibeg; iwork < iend; ++iwork) {
      functor(t, iwork, update);
    }
  }

 public:
  inline void execute() const {
    // Empty range: produce the identity (init + final) and return.
    if (m_policy.end() <= m_policy.begin()) {
      if (m_result_ptr) {
        ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                        m_result_ptr);
        Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
            ReducerConditional::select(m_functor, m_reducer), m_result_ptr);
      }
      return;
    }
    enum {
      is_dynamic = std::is_same<typename Policy::schedule_type::type,
                                Kokkos::Dynamic>::value
    };

    OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");

    const size_t pool_reduce_bytes =
        Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));

    m_instance->resize_thread_data(pool_reduce_bytes, 0  // team_reduce_bytes
                                   ,
                                   0  // team_shared_bytes
                                   ,
                                   0  // thread_local_bytes
    );

    const int pool_size = OpenMP::impl_thread_pool_size();
#pragma omp parallel num_threads(pool_size)
    {
      HostThreadTeamData& data = *(m_instance->get_thread_data());

      data.set_work_partition(m_policy.end() - m_policy.begin(),
                              m_policy.chunk_size());

      if (is_dynamic) {
        // Make sure work partition is set before stealing
        if (data.pool_rendezvous()) data.pool_rendezvous_release();
      }

      reference_type update =
          ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
                          data.pool_reduce_local());

      std::pair<int64_t, int64_t> range(0, 0);

      do {
        range = is_dynamic ? data.get_work_stealing_chunk()
                           : data.get_work_partition();

        ParallelReduce::template exec_range<WorkTag>(
            m_functor, range.first + m_policy.begin(),
            range.second + m_policy.begin(), update);

      } while (is_dynamic && 0 <= range.first);
    }

    // Reduction:

    const pointer_type ptr =
        pointer_type(m_instance->get_thread_data(0)->pool_reduce_local());

    // Serially join every other thread's partial result into thread 0's.
    for (int i = 1; i < pool_size; ++i) {
      ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr,
                      m_instance->get_thread_data(i)->pool_reduce_local());
    }

    Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
        ReducerConditional::select(m_functor, m_reducer), ptr);

    if (m_result_ptr) {
      const int n = Analysis::value_count(
          ReducerConditional::select(m_functor, m_reducer));

      for (int j = 0; j < n; ++j) {
        m_result_ptr[j] = ptr[j];
      }
    }
  }

  //----------------------------------------

  template <class ViewType>
  inline ParallelReduce(
      const FunctorType& arg_functor, Policy arg_policy,
      const ViewType& arg_view,
      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void*>::type = nullptr)
      : m_instance(t_openmp_instance),
        m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_view.data()) {
    /*static_assert( std::is_same< typename ViewType::memory_space
                    , Kokkos::HostSpace >::value
      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
      );*/
  }

  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                        const ReducerType& reducer)
      : m_instance(t_openmp_instance),
        m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()) {
    /*static_assert( std::is_same< typename ViewType::memory_space
                    , Kokkos::HostSpace >::value
      , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace"
      );*/
  }
};

// MDRangePolicy impl
template <class
FunctorType, class ReducerType, class... Traits> +class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType, + Kokkos::OpenMP> { + private: + using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>; + using Policy = typename MDRangePolicy::impl_range_policy; + + using WorkTag = typename MDRangePolicy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE, + MDRangePolicy, FunctorType>; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag, + void>; + + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + + using pointer_type = typename Analysis::pointer_type; + using value_type = typename Analysis::value_type; + using reference_type = typename Analysis::reference_type; + + using iterate_type = + typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType, + WorkTag, reference_type>; + + OpenMPExec* m_instance; + const FunctorType m_functor; + const MDRangePolicy m_mdr_policy; + const Policy m_policy; // construct as RangePolicy( 0, num_tiles + // ).set_chunk_size(1) in ctor + const ReducerType m_reducer; + const pointer_type m_result_ptr; + + inline static void exec_range(const MDRangePolicy& mdr_policy, + const FunctorType& functor, const Member ibeg, + const Member iend, reference_type update) { + for (Member iwork = ibeg; iwork < iend; ++iwork) { + iterate_type(mdr_policy, functor, update)(iwork); + } + } + + public: + inline void execute() const { + enum { + is_dynamic = std::is_same<typename Policy::schedule_type::type, + Kokkos::Dynamic>::value + }; + + 
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); + + const size_t pool_reduce_bytes = + Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + + m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes + , + 0 // team_shared_bytes + , + 0 // thread_local_bytes + ); + + const int pool_size = OpenMP::impl_thread_pool_size(); +#pragma omp parallel num_threads(pool_size) + { + HostThreadTeamData& data = *(m_instance->get_thread_data()); + + data.set_work_partition(m_policy.end() - m_policy.begin(), + m_policy.chunk_size()); + + if (is_dynamic) { + // Make sure work partition is set before stealing + if (data.pool_rendezvous()) data.pool_rendezvous_release(); + } + + reference_type update = + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + data.pool_reduce_local()); + + std::pair<int64_t, int64_t> range(0, 0); + + do { + range = is_dynamic ? data.get_work_stealing_chunk() + : data.get_work_partition(); + + ParallelReduce::exec_range(m_mdr_policy, m_functor, + range.first + m_policy.begin(), + range.second + m_policy.begin(), update); + + } while (is_dynamic && 0 <= range.first); + } + // END #pragma omp parallel + + // Reduction: + + const pointer_type ptr = + pointer_type(m_instance->get_thread_data(0)->pool_reduce_local()); + + for (int i = 1; i < pool_size; ++i) { + ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr, + m_instance->get_thread_data(i)->pool_reduce_local()); + } + + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), ptr); + + if (m_result_ptr) { + const int n = Analysis::value_count( + ReducerConditional::select(m_functor, m_reducer)); + + for (int j = 0; j < n; ++j) { + m_result_ptr[j] = ptr[j]; + } + } + } + + //---------------------------------------- + + template <class ViewType> + inline ParallelReduce( + const FunctorType& arg_functor, MDRangePolicy arg_policy, + const ViewType& arg_view, + 
typename std::enable_if<Kokkos::is_view<ViewType>::value && + !Kokkos::is_reducer_type<ReducerType>::value, + void*>::type = nullptr) + : m_instance(t_openmp_instance), + m_functor(arg_functor), + m_mdr_policy(arg_policy), + m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)), + m_reducer(InvalidType()), + m_result_ptr(arg_view.data()) { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" + );*/ + } + + inline ParallelReduce(const FunctorType& arg_functor, + MDRangePolicy arg_policy, const ReducerType& reducer) + : m_instance(t_openmp_instance), + m_functor(arg_functor), + m_mdr_policy(arg_policy), + m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)), + m_reducer(reducer), + m_result_ptr(reducer.view().data()) { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" + );*/ + } + template <typename Policy, typename Functor> + static int max_tile_size_product(const Policy&, const Functor&) { + /** + * 1024 here is just our guess for a reasonable max tile size, + * it isn't a hardware constraint. If people see a use for larger + * tile size products, we're happy to change this. + */ + return 1024; + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... 
Traits> +class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::OpenMP> { + private: + using Policy = Kokkos::RangePolicy<Traits...>; + + using Analysis = + FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + OpenMPExec* m_instance; + const FunctorType m_functor; + const Policy m_policy; + + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const FunctorType& functor, const Member ibeg, + const Member iend, reference_type update, const bool final) { + for (Member iwork = ibeg; iwork < iend; ++iwork) { + functor(iwork, update, final); + } + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const FunctorType& functor, const Member ibeg, + const Member iend, reference_type update, const bool final) { + const TagType t{}; + for (Member iwork = ibeg; iwork < iend; ++iwork) { + functor(t, iwork, update, final); + } + } + + public: + inline void execute() const { + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan"); + + const int value_count = Analysis::value_count(m_functor); + const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor); + + m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes + , + 0 // team_shared_bytes + , + 0 // thread_local_bytes + ); + +#pragma omp parallel num_threads(OpenMP::impl_thread_pool_size()) + { + HostThreadTeamData& data = 
*(m_instance->get_thread_data()); + + const WorkRange range(m_policy, omp_get_thread_num(), + omp_get_num_threads()); + + reference_type update_sum = + ValueInit::init(m_functor, data.pool_reduce_local()); + + ParallelScan::template exec_range<WorkTag>( + m_functor, range.begin(), range.end(), update_sum, false); + + if (data.pool_rendezvous()) { + pointer_type ptr_prev = nullptr; + + const int n = omp_get_num_threads(); + + for (int i = 0; i < n; ++i) { + pointer_type ptr = + (pointer_type)data.pool_member(i)->pool_reduce_local(); + + if (i) { + for (int j = 0; j < value_count; ++j) { + ptr[j + value_count] = ptr_prev[j + value_count]; + } + ValueJoin::join(m_functor, ptr + value_count, ptr_prev); + } else { + ValueInit::init(m_functor, ptr + value_count); + } + + ptr_prev = ptr; + } + + data.pool_rendezvous_release(); + } + + reference_type update_base = ValueOps::reference( + ((pointer_type)data.pool_reduce_local()) + value_count); + + ParallelScan::template exec_range<WorkTag>( + m_functor, range.begin(), range.end(), update_base, true); + } + } + + //---------------------------------------- + + inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) + : m_instance(t_openmp_instance), + m_functor(arg_functor), + m_policy(arg_policy) {} + + //---------------------------------------- +}; + +template <class FunctorType, class ReturnType, class... 
Traits> +class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, + ReturnType, Kokkos::OpenMP> { + private: + using Policy = Kokkos::RangePolicy<Traits...>; + + using Analysis = + FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + OpenMPExec* m_instance; + const FunctorType m_functor; + const Policy m_policy; + ReturnType& m_returnvalue; + + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const FunctorType& functor, const Member ibeg, + const Member iend, reference_type update, const bool final) { + for (Member iwork = ibeg; iwork < iend; ++iwork) { + functor(iwork, update, final); + } + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const FunctorType& functor, const Member ibeg, + const Member iend, reference_type update, const bool final) { + const TagType t{}; + for (Member iwork = ibeg; iwork < iend; ++iwork) { + functor(t, iwork, update, final); + } + } + + public: + inline void execute() const { + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan"); + + const int value_count = Analysis::value_count(m_functor); + const size_t pool_reduce_bytes = 2 * Analysis::value_size(m_functor); + + m_instance->resize_thread_data(pool_reduce_bytes, 0 // team_reduce_bytes + , + 0 // team_shared_bytes + , + 0 // thread_local_bytes + ); + +#pragma omp parallel num_threads(OpenMP::impl_thread_pool_size()) 
+ { + HostThreadTeamData& data = *(m_instance->get_thread_data()); + + const WorkRange range(m_policy, omp_get_thread_num(), + omp_get_num_threads()); + reference_type update_sum = + ValueInit::init(m_functor, data.pool_reduce_local()); + + ParallelScanWithTotal::template exec_range<WorkTag>( + m_functor, range.begin(), range.end(), update_sum, false); + + if (data.pool_rendezvous()) { + pointer_type ptr_prev = nullptr; + + const int n = omp_get_num_threads(); + + for (int i = 0; i < n; ++i) { + pointer_type ptr = + (pointer_type)data.pool_member(i)->pool_reduce_local(); + + if (i) { + for (int j = 0; j < value_count; ++j) { + ptr[j + value_count] = ptr_prev[j + value_count]; + } + ValueJoin::join(m_functor, ptr + value_count, ptr_prev); + } else { + ValueInit::init(m_functor, ptr + value_count); + } + + ptr_prev = ptr; + } + + data.pool_rendezvous_release(); + } + + reference_type update_base = ValueOps::reference( + ((pointer_type)data.pool_reduce_local()) + value_count); + + ParallelScanWithTotal::template exec_range<WorkTag>( + m_functor, range.begin(), range.end(), update_base, true); + + if (omp_get_thread_num() == omp_get_num_threads() - 1) { + m_returnvalue = update_base; + } + } + } + + //---------------------------------------- + + inline ParallelScanWithTotal(const FunctorType& arg_functor, + const Policy& arg_policy, + ReturnType& arg_returnvalue) + : m_instance(t_openmp_instance), + m_functor(arg_functor), + m_policy(arg_policy), + m_returnvalue(arg_returnvalue) {} + + //---------------------------------------- +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... 
Properties> +class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, + Kokkos::OpenMP> { + private: + enum { TEAM_REDUCE_SIZE = 512 }; + + using Policy = + Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP, Properties...>; + using WorkTag = typename Policy::work_tag; + using SchedTag = typename Policy::schedule_type::type; + using Member = typename Policy::member_type; + + OpenMPExec* m_instance; + const FunctorType m_functor; + const Policy m_policy; + const int m_shmem_size; + + template <class TagType> + inline static + typename std::enable_if<(std::is_same<TagType, void>::value)>::type + exec_team(const FunctorType& functor, HostThreadTeamData& data, + const int league_rank_begin, const int league_rank_end, + const int league_size) { + for (int r = league_rank_begin; r < league_rank_end;) { + functor(Member(data, r, league_size)); + + if (++r < league_rank_end) { + // Don't allow team members to lap one another + // so that they don't overwrite shared memory. + if (data.team_rendezvous()) { + data.team_rendezvous_release(); + } + } + } + } + + template <class TagType> + inline static + typename std::enable_if<(!std::is_same<TagType, void>::value)>::type + exec_team(const FunctorType& functor, HostThreadTeamData& data, + const int league_rank_begin, const int league_rank_end, + const int league_size) { + const TagType t{}; + + for (int r = league_rank_begin; r < league_rank_end;) { + functor(t, Member(data, r, league_size)); + + if (++r < league_rank_end) { + // Don't allow team members to lap one another + // so that they don't overwrite shared memory. 
+ if (data.team_rendezvous()) { + data.team_rendezvous_release(); + } + } + } + } + + public: + inline void execute() const { + enum { is_dynamic = std::is_same<SchedTag, Kokkos::Dynamic>::value }; + + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); + + const size_t pool_reduce_size = 0; // Never shrinks + const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size(); + const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); + const size_t thread_local_size = 0; // Never shrinks + + m_instance->resize_thread_data(pool_reduce_size, team_reduce_size, + team_shared_size, thread_local_size); + +#pragma omp parallel num_threads(OpenMP::impl_thread_pool_size()) + { + HostThreadTeamData& data = *(m_instance->get_thread_data()); + + const int active = data.organize_team(m_policy.team_size()); + + if (active) { + data.set_work_partition( + m_policy.league_size(), + (0 < m_policy.chunk_size() ? m_policy.chunk_size() + : m_policy.team_iter())); + } + + if (is_dynamic) { + // Must synchronize to make sure each team has set its + // partition before beginning the work stealing loop. + if (data.pool_rendezvous()) data.pool_rendezvous_release(); + } + + if (active) { + std::pair<int64_t, int64_t> range(0, 0); + + do { + range = is_dynamic ? 
data.get_work_stealing_chunk() + : data.get_work_partition(); + + ParallelFor::template exec_team<WorkTag>(m_functor, data, range.first, + range.second, + m_policy.league_size()); + + } while (is_dynamic && 0 <= range.first); + } + + data.disband_team(); + } + } + + inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_instance(t_openmp_instance), + m_functor(arg_functor), + m_policy(arg_policy), + m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + arg_functor, arg_policy.team_size())) {} +}; + +//---------------------------------------------------------------------------- + +template <class FunctorType, class ReducerType, class... Properties> +class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>, + ReducerType, Kokkos::OpenMP> { + private: + enum { TEAM_REDUCE_SIZE = 512 }; + + using Policy = + Kokkos::Impl::TeamPolicyInternal<Kokkos::OpenMP, Properties...>; + + using Analysis = + FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>; + + using WorkTag = typename Policy::work_tag; + using SchedTag = typename Policy::schedule_type::type; + using Member = typename Policy::member_type; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag, + void>; + + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + OpenMPExec* m_instance; + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const int m_shmem_size; + + template <class 
TagType> + inline static + typename std::enable_if<(std::is_same<TagType, void>::value)>::type + exec_team(const FunctorType& functor, HostThreadTeamData& data, + reference_type& update, const int league_rank_begin, + const int league_rank_end, const int league_size) { + for (int r = league_rank_begin; r < league_rank_end;) { + functor(Member(data, r, league_size), update); + + if (++r < league_rank_end) { + // Don't allow team members to lap one another + // so that they don't overwrite shared memory. + if (data.team_rendezvous()) { + data.team_rendezvous_release(); + } + } + } + } + + template <class TagType> + inline static + typename std::enable_if<(!std::is_same<TagType, void>::value)>::type + exec_team(const FunctorType& functor, HostThreadTeamData& data, + reference_type& update, const int league_rank_begin, + const int league_rank_end, const int league_size) { + const TagType t{}; + + for (int r = league_rank_begin; r < league_rank_end;) { + functor(t, Member(data, r, league_size), update); + + if (++r < league_rank_end) { + // Don't allow team members to lap one another + // so that they don't overwrite shared memory. 
+ if (data.team_rendezvous()) { + data.team_rendezvous_release(); + } + } + } + } + + public: + inline void execute() const { + enum { is_dynamic = std::is_same<SchedTag, Kokkos::Dynamic>::value }; + + if (m_policy.league_size() * m_policy.team_size() == 0) { + if (m_result_ptr) { + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), m_result_ptr); + } + return; + } + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); + + const size_t pool_reduce_size = + Analysis::value_size(ReducerConditional::select(m_functor, m_reducer)); + + const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size(); + const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); + const size_t thread_local_size = 0; // Never shrinks + + m_instance->resize_thread_data(pool_reduce_size, team_reduce_size, + team_shared_size, thread_local_size); + + const int pool_size = OpenMP::impl_thread_pool_size(); +#pragma omp parallel num_threads(pool_size) + { + HostThreadTeamData& data = *(m_instance->get_thread_data()); + + const int active = data.organize_team(m_policy.team_size()); + + if (active) { + data.set_work_partition( + m_policy.league_size(), + (0 < m_policy.chunk_size() ? m_policy.chunk_size() + : m_policy.team_iter())); + } + + if (is_dynamic) { + // Must synchronize to make sure each team has set its + // partition before beginning the work stealing loop. + if (data.pool_rendezvous()) data.pool_rendezvous_release(); + } + + if (active) { + reference_type update = + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + data.pool_reduce_local()); + + std::pair<int64_t, int64_t> range(0, 0); + + do { + range = is_dynamic ? 
data.get_work_stealing_chunk() + : data.get_work_partition(); + + ParallelReduce::template exec_team<WorkTag>(m_functor, data, update, + range.first, range.second, + m_policy.league_size()); + + } while (is_dynamic && 0 <= range.first); + } else { + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + data.pool_reduce_local()); + } + + data.disband_team(); + + // This thread has updated 'pool_reduce_local()' with its + // contributions to the reduction. The parallel region is + // about to terminate and the master thread will load and + // reduce each 'pool_reduce_local()' contribution. + // Must 'memory_fence()' to guarantee that storing the update to + // 'pool_reduce_local()' will complete before this thread + // exits the parallel region. + + memory_fence(); + } + + // Reduction: + + const pointer_type ptr = + pointer_type(m_instance->get_thread_data(0)->pool_reduce_local()); + + for (int i = 1; i < pool_size; ++i) { + ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr, + m_instance->get_thread_data(i)->pool_reduce_local()); + } + + Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final( + ReducerConditional::select(m_functor, m_reducer), ptr); + + if (m_result_ptr) { + const int n = Analysis::value_count( + ReducerConditional::select(m_functor, m_reducer)); + + for (int j = 0; j < n; ++j) { + m_result_ptr[j] = ptr[j]; + } + } + } + + //---------------------------------------- + + template <class ViewType> + inline ParallelReduce( + const FunctorType& arg_functor, const Policy& arg_policy, + const ViewType& arg_result, + typename std::enable_if<Kokkos::is_view<ViewType>::value && + !Kokkos::is_reducer_type<ReducerType>::value, + void*>::type = nullptr) + : m_instance(t_openmp_instance), + m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result.data()), + m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + 
arg_functor, arg_policy.team_size())) {} + + inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy, + const ReducerType& reducer) + : m_instance(t_openmp_instance), + m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + arg_functor, arg_policy.team_size())) { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" + );*/ + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif +#endif /* KOKKOS_OPENMP_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f7338819af53300839895021ada12c11323a2f82 --- /dev/null +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp @@ -0,0 +1,110 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_OPENMP) && defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core.hpp> + +#include <impl/Kokkos_TaskQueue_impl.hpp> +#include <impl/Kokkos_HostThreadTeam.hpp> +#include <OpenMP/Kokkos_OpenMP_Task.hpp> +#include <cassert> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template class TaskQueue<Kokkos::OpenMP, typename Kokkos::OpenMP::memory_space>; + +HostThreadTeamData& HostThreadTeamDataSingleton::singleton() { + static HostThreadTeamDataSingleton s; + return s; +} + +HostThreadTeamDataSingleton::HostThreadTeamDataSingleton() + : HostThreadTeamData() { + Kokkos::OpenMP::memory_space space; + const size_t num_pool_reduce_bytes = 32; + const size_t num_team_reduce_bytes = 32; + const size_t num_team_shared_bytes = 1024; + const size_t num_thread_local_bytes = 1024; + const size_t alloc_bytes = HostThreadTeamData::scratch_size( + num_pool_reduce_bytes, num_team_reduce_bytes, num_team_shared_bytes, + num_thread_local_bytes); + + void* ptr = nullptr; + try { + ptr = space.allocate(alloc_bytes); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& f) { + // For now, just rethrow the error message with a note + // Note that this could, in turn, trigger an out of memory exception, + // but it's pretty unlikely, so we won't worry about it for now. 
+ // TODO reasonable error message when `std::string` causes OOM error + Kokkos::Impl::throw_runtime_exception( + std::string("Failure to allocate scratch memory: ") + + f.get_error_message()); + } + + HostThreadTeamData::scratch_assign( + ptr, alloc_bytes, num_pool_reduce_bytes, num_team_reduce_bytes, + num_team_shared_bytes, num_thread_local_bytes); +} + +HostThreadTeamDataSingleton::~HostThreadTeamDataSingleton() { + Kokkos::OpenMP::memory_space space; + space.deallocate(HostThreadTeamData::scratch_buffer(), + static_cast<size_t>(HostThreadTeamData::scratch_bytes())); +} + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +#else +void KOKKOS_CORE_SRC_OPENMP_KOKKOS_OPENMP_TASK_PREVENT_LINK_ERROR() {} +#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( \ + KOKKOS_ENABLE_TASKDAG ) */ diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2a4a7b1d53bd4785f26508fbc990148291bd9763 --- /dev/null +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -0,0 +1,390 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP +#define KOKKOS_IMPL_OPENMP_TASK_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_OPENMP) && defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> + +#include <impl/Kokkos_HostThreadTeam.hpp> +#include <Kokkos_OpenMP.hpp> + +#include <type_traits> +#include <cassert> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +class HostThreadTeamDataSingleton : private HostThreadTeamData { + private: + HostThreadTeamDataSingleton(); + ~HostThreadTeamDataSingleton(); + + public: + static HostThreadTeamData& singleton(); +}; + +// Hack this as a partial specialization for now +// TODO @tasking @cleanup DSH Make this the general class template and make the +// old code the partial specialization +template <class QueueType> +class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::OpenMP, QueueType> > { + public: + using execution_space = Kokkos::OpenMP; + using scheduler_type = SimpleTaskScheduler<Kokkos::OpenMP, QueueType>; + using member_type = + TaskTeamMemberAdapter<Kokkos::Impl::HostThreadTeamMember<execution_space>, + scheduler_type>; + using memory_space = Kokkos::HostSpace; + + enum : int { max_league_size = HostThreadTeamData::max_pool_members }; + + // Must provide task queue execution function + static void execute(scheduler_type const& scheduler) { + using task_base_type = typename scheduler_type::task_base_type; + + // Unused; ChaseLev queue still needs worker ID even in single case (so we + // need to use the thread data from inside of the parallel region. 
Team + // size is fixed at 1 for now anyway + // HostThreadTeamData& team_data_single = + // HostThreadTeamDataSingleton::singleton(); + + // TODO @tasking @generalization DSH use + // scheduler.get_execution_space().impl() (or something like that) instead + // of the thread-local variable + Impl::OpenMPExec* instance = t_openmp_instance; + const int pool_size = get_max_team_count(scheduler.get_execution_space()); + + // TODO @tasking @new_feature DSH allow team sizes other than 1 + const int team_size = 1; // Threads per core + instance->resize_thread_data(0, /* global reduce buffer */ + 512 * team_size, /* team reduce buffer */ + 0, /* team shared buffer */ + 0 /* thread local buffer */ + ); + assert(pool_size % team_size == 0); + + auto& queue = scheduler.queue(); + + // queue.initialize_team_queues(pool_size / team_size); + +#pragma omp parallel num_threads(pool_size) + { + Impl::HostThreadTeamData& self = *(instance->get_thread_data()); + + // Organizing threads into a team performs a barrier across the + // entire pool to insure proper initialization of the team + // rendezvous mechanism before a team rendezvous can be performed. + + // organize_team() returns true if this is an active team member + if (self.organize_team(team_size)) { + member_type single_exec(scheduler, self); + member_type team_exec(scheduler, self); + + auto& team_scheduler = team_exec.scheduler(); + + auto current_task = OptionalRef<task_base_type>(nullptr); + + while (!queue.is_done()) { + // Each team lead attempts to acquire either a thread team task + // or a single thread task for the team. 
+ if (team_exec.team_rank() == 0) { + // loop while both: + // - the queue is not done + // - the most recently popped task is a single task or empty + while (!queue.is_done()) { + current_task = + queue.pop_ready_task(team_scheduler.team_scheduler_info()); + + if (current_task) { + if (current_task->is_team_runnable()) { + // break out of the team leader loop to run the team task + break; + } else { + KOKKOS_ASSERT(current_task->is_single_runnable()); + current_task->as_runnable_task().run(single_exec); + // Respawns are handled in the complete function + queue.complete((*std::move(current_task)).as_runnable_task(), + team_scheduler.team_scheduler_info()); + } + + } // end if current_task is not null + + current_task = nullptr; + + } // end team leader loop + } + + // Otherwise, make sure everyone in the team has the same task + team_exec.team_broadcast(current_task, 0); + + if (current_task) { + KOKKOS_ASSERT(current_task->is_team_runnable()); + current_task->as_runnable_task().run(team_exec); + + if (team_exec.team_rank() == 0) { + // Respawns are handled in the complete function + queue.complete((*std::move(current_task)).as_runnable_task(), + team_scheduler.team_scheduler_info()); + } + } + } + } + self.disband_team(); + } // end pragma omp parallel + } + + static uint32_t get_max_team_count(execution_space const& espace) { + return static_cast<uint32_t>(espace.impl_thread_pool_size()); + } + + // TODO @tasking @optimization DSH specialize this for trivially destructible + // types + template <typename TaskType> + static void get_function_pointer(typename TaskType::function_type& ptr, + typename TaskType::destroy_type& dtor) { + ptr = TaskType::apply; + dtor = TaskType::destroy; + } +}; + +template <class Scheduler> +class TaskQueueSpecializationConstrained< + Scheduler, + typename std::enable_if<std::is_same<typename Scheduler::execution_space, + Kokkos::OpenMP>::value>::type> { + public: + using execution_space = Kokkos::OpenMP; + using scheduler_type = 
Scheduler; + using member_type = + TaskTeamMemberAdapter<Kokkos::Impl::HostThreadTeamMember<execution_space>, + scheduler_type>; + using memory_space = Kokkos::HostSpace; + + enum : int { max_league_size = HostThreadTeamData::max_pool_members }; + + static void iff_single_thread_recursive_execute( + scheduler_type const& scheduler) { + using task_base_type = typename scheduler_type::task_base; + using queue_type = typename scheduler_type::queue_type; + + if (1 == OpenMP::impl_thread_pool_size()) { + task_base_type* const end = (task_base_type*)task_base_type::EndTag; + + HostThreadTeamData& team_data_single = + HostThreadTeamDataSingleton::singleton(); + + member_type single_exec(scheduler, team_data_single); + + task_base_type* task = end; + + do { + task = end; + + // Loop by priority and then type + for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { + for (int j = 0; j < 2 && end == task; ++j) { + task = + queue_type::pop_ready_task(&scheduler.m_queue->m_ready[i][j]); + } + } + + if (end == task) break; + + (*task->m_apply)(task, &single_exec); + + scheduler.m_queue->complete(task); + + } while (1); + } + } + + // Must provide task queue execution function + static void execute(scheduler_type const& scheduler) { + using task_base_type = typename scheduler_type::task_base; + using queue_type = typename scheduler_type::queue_type; + + static task_base_type* const end = (task_base_type*)task_base_type::EndTag; + + constexpr task_base_type* no_more_tasks_sentinel = nullptr; + + HostThreadTeamData& team_data_single = + HostThreadTeamDataSingleton::singleton(); + + Impl::OpenMPExec* instance = t_openmp_instance; + const int pool_size = OpenMP::impl_thread_pool_size(); + + const int team_size = 1; // Threads per core + instance->resize_thread_data(0 /* global reduce buffer */ + , + 512 * team_size /* team reduce buffer */ + , + 0 /* team shared buffer */ + , + 0 /* thread local buffer */ + ); + assert(pool_size % team_size == 0); + auto& queue = 
scheduler.queue(); + queue.initialize_team_queues(pool_size / team_size); + +#pragma omp parallel num_threads(pool_size) + { + Impl::HostThreadTeamData& self = *(instance->get_thread_data()); + + // Organizing threads into a team performs a barrier across the + // entire pool to insure proper initialization of the team + // rendezvous mechanism before a team rendezvous can be performed. + + // organize_team() returns true if this is an active team member + if (self.organize_team(team_size)) { + member_type single_exec(scheduler, team_data_single); + member_type team_exec(scheduler, self); + + auto& team_queue = team_exec.scheduler().queue(); + + // Loop until all queues are empty and no tasks in flight + + task_base_type* task = no_more_tasks_sentinel; + + do { + // Each team lead attempts to acquire either a thread team task + // or a single thread task for the team. + + if (0 == team_exec.team_rank()) { + bool leader_loop = false; + + do { + if (task != no_more_tasks_sentinel && task != end) { + // team member #0 completes the previously executed task, + // completion may delete the task + team_queue.complete(task); + } + + // If 0 == m_ready_count then set task = 0 + + if (*((volatile int*)&team_queue.m_ready_count) > 0) { + task = end; + // Attempt to acquire a task + // Loop by priority and then type + for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { + for (int j = 0; j < 2 && end == task; ++j) { + task = + queue_type::pop_ready_task(&team_queue.m_ready[i][j]); + } + } + } else { + // returns nullptr if and only if all other queues have a ready + // count of 0 also. 
Otherwise, returns a task from another queue + // or `end` if one couldn't be popped + task = team_queue.attempt_to_steal_task(); +#if 0 + if(task != no_more_tasks_sentinel && task != end) { + std::printf("task stolen on rank %d\n", team_exec.league_rank()); + } +#endif + } + + // If still tasks are still executing + // and no task could be acquired + // then continue this leader loop + if (task == end) { + // this means that the ready task count was not zero, but we + // couldn't pop a task (because, for instance, someone else + // got there before us + leader_loop = true; + } else if ((task != no_more_tasks_sentinel) && + (task_base_type::TaskSingle == task->m_task_type)) { + // if a single thread task then execute now + + (*task->m_apply)(task, &single_exec); + + leader_loop = true; + } else { + leader_loop = false; + } + } while (leader_loop); + } + + // Team lead either found 0 == m_ready_count or a team task + // Team lead broadcast acquired task: + + team_exec.team_broadcast(task, 0); + + if (task != no_more_tasks_sentinel) { // Thread Team Task + + (*task->m_apply)(task, &team_exec); + + // The m_apply function performs a barrier + } + } while (task != no_more_tasks_sentinel); + } + self.disband_team(); + } // end pragma omp parallel + } + + template <typename TaskType> + static void get_function_pointer(typename TaskType::function_type& ptr, + typename TaskType::destroy_type& dtor) { + ptr = TaskType::apply; + dtor = TaskType::destroy; + } +}; + +extern template class TaskQueue<Kokkos::OpenMP, + typename Kokkos::OpenMP::memory_space>; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */ diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp 
b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp new file mode 100644 index 0000000000000000000000000000000000000000..be7afd32883df4869a8919eb7460e812f3be6e0e --- /dev/null +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp @@ -0,0 +1,359 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_TEAM_HPP +#define KOKKOS_OPENMP_TEAM_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_OPENMP) + +#include <OpenMP/Kokkos_OpenMP_Exec.hpp> + +namespace Kokkos { +namespace Impl { + +template <class... Properties> +class TeamPolicyInternal<Kokkos::OpenMP, Properties...> + : public PolicyTraits<Properties...> { + public: + //! Tag this class as a kokkos execution policy + using execution_policy = TeamPolicyInternal<OpenMP, Properties...>; + + using traits = PolicyTraits<Properties...>; + + const typename traits::execution_space& space() const { + static typename traits::execution_space m_space; + return m_space; + } + + template <class ExecSpace, class... OtherProperties> + friend class TeamPolicyInternal; + + template <class... 
OtherProperties> + TeamPolicyInternal( + const TeamPolicyInternal<Kokkos::OpenMP, OtherProperties...>& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_team_alloc = p.m_team_alloc; + m_team_iter = p.m_team_iter; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + m_tune_team = p.m_tune_team; + m_tune_vector = p.m_tune_vector; + } + //---------------------------------------- + + template <class FunctorType> + int team_size_max(const FunctorType&, const ParallelForTag&) const { + int pool_size = traits::execution_space::impl_thread_pool_size(1); + int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + return pool_size < max_host_team_size ? pool_size : max_host_team_size; + } + + int impl_vector_length() const { return 1; } + + template <class FunctorType> + int team_size_max(const FunctorType&, const ParallelReduceTag&) const { + int pool_size = traits::execution_space::impl_thread_pool_size(1); + int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + return pool_size < max_host_team_size ? 
pool_size : max_host_team_size; + } + template <class FunctorType, class ReducerType> + inline int team_size_max(const FunctorType& f, const ReducerType&, + const ParallelReduceTag& t) const { + return team_size_max(f, t); + } + template <class FunctorType> + int team_size_recommended(const FunctorType&, const ParallelForTag&) const { + return traits::execution_space::impl_thread_pool_size(2); + } + template <class FunctorType> + int team_size_recommended(const FunctorType&, + const ParallelReduceTag&) const { + return traits::execution_space::impl_thread_pool_size(2); + } + template <class FunctorType, class ReducerType> + inline int team_size_recommended(const FunctorType& f, const ReducerType&, + const ParallelReduceTag& t) const { + return team_size_recommended(f, t); + } + + inline static int vector_length_max() { + return 1024; + } // Use arbitrary large number, is meant as a vectorizable length + + inline static int scratch_size_max(int level) { + return (level == 0 ? 1024 * 32 : // Roughly L1 size + 20 * 1024 * 1024); // Limit to keep compatibility with CUDA + } + + //---------------------------------------- + + private: + int m_league_size; + int m_team_size; + int m_team_alloc; + int m_team_iter; + + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; + + int m_chunk_size; + + bool m_tune_team; + bool m_tune_vector; + + inline void init(const int league_size_request, const int team_size_request) { + const int pool_size = traits::execution_space::impl_thread_pool_size(0); + const int team_grain = traits::execution_space::impl_thread_pool_size(2); + const int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + const int team_max = + ((pool_size < max_host_team_size) ? pool_size : max_host_team_size); + + m_league_size = league_size_request; + + if (team_size_request > team_max) + Kokkos::abort("Kokkos::abort: Requested Team Size is too large!"); + m_team_size = team_size_request < team_max ? 
team_size_request : team_max; + + // Round team size up to a multiple of 'team_gain' + const int team_size_grain = + team_grain * ((m_team_size + team_grain - 1) / team_grain); + const int team_count = pool_size / team_size_grain; + + // Constraint : pool_size = m_team_alloc * team_count + m_team_alloc = pool_size / team_count; + + // Maxumum number of iterations each team will take: + m_team_iter = (m_league_size + team_count - 1) / team_count; + + set_auto_chunk_size(); + } + + public: + inline int team_size() const { return m_team_size; } + inline int league_size() const { return m_league_size; } + inline bool impl_auto_team_size() const { return m_tune_team; } + inline bool impl_auto_vector_length() const { return m_tune_vector; } + inline void impl_set_team_size(size_t new_team_size) { + m_team_size = new_team_size; + } + inline void impl_set_vector_length(size_t) {} + inline size_t scratch_size(const int& level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, int team_size_request, + int /* vector_length_request */ = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0), + m_tune_team(false), + m_tune_vector(false) { + init(league_size_request, team_size_request); + } + + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int /* vector_length_request */ = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0), + m_tune_team(true), + m_tune_vector(false) { + init(league_size_request, + traits::execution_space::impl_thread_pool_size(2)); + } + + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, + const Kokkos::AUTO_t& /* 
team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0), + m_tune_team(true), + m_tune_vector(true) { + init(league_size_request, + traits::execution_space::impl_thread_pool_size(2)); + } + + TeamPolicyInternal(const typename traits::execution_space&, + int league_size_request, const int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0), + m_tune_team(false), + m_tune_vector(true) { + init(league_size_request, team_size_request); + } + + TeamPolicyInternal(int league_size_request, int team_size_request, + int /* vector_length_request */ = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0), + m_tune_team(false), + m_tune_vector(false) { + init(league_size_request, team_size_request); + } + + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int /* vector_length_request */ = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0), + m_tune_team(true), + m_tune_vector(false) { + init(league_size_request, + traits::execution_space::impl_thread_pool_size(2)); + } + + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0), + m_tune_team(true), + m_tune_vector(true) { + init(league_size_request, + traits::execution_space::impl_thread_pool_size(2)); + } + + TeamPolicyInternal(int league_size_request, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0), + m_tune_team(true), + m_tune_vector(true) { + init(league_size_request, team_size_request); + } + + inline int team_alloc() const { return 
m_team_alloc; } + inline int team_iter() const { return m_team_iter; } + + inline int chunk_size() const { return m_chunk_size; } + + /** \brief set chunk_size to a discrete value*/ + inline TeamPolicyInternal& set_chunk_size( + typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size(const int& level, + const PerTeamValue& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerThreadValue& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + /** \brief set per thread and per team scratch size for a specific level of + * the scratch hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerTeamValue& per_team, + const PerThreadValue& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + private: + /** \brief finalize chunk_size if it was set to AUTO*/ + inline void set_auto_chunk_size() { + int concurrency = + traits::execution_space::impl_thread_pool_size(0) / m_team_alloc; + if (concurrency == 0) concurrency = 1; + + if (m_chunk_size > 0) { + if (!Impl::is_integral_power_of_two(m_chunk_size)) + Kokkos::abort("TeamPolicy blocking granularity must be power of two"); + } + + int new_chunk_size = 1; + while (new_chunk_size * 100 * concurrency < m_league_size) + new_chunk_size *= 2; + if (new_chunk_size < 128) { + new_chunk_size = 1; + while ((new_chunk_size * 40 * concurrency < m_league_size) && + (new_chunk_size < 128)) + new_chunk_size *= 2; + } + m_chunk_size = new_chunk_size; + } + + public: + using member_type = 
Impl::HostThreadTeamMember<Kokkos::OpenMP>; +}; + +} // namespace Impl +} // namespace Kokkos + +#endif +#endif /* KOKKOS_OPENMP_TEAM_HPP */ diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..92e4ee636a8a0d4e2c196672b81c4720376ce21c --- /dev/null +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp @@ -0,0 +1,99 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP +#define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP + +#include <Kokkos_OpenMP.hpp> + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... Traits> +class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>, + Kokkos::OpenMP> { + private: + using Policy = Kokkos::WorkGraphPolicy<Traits...>; + + Policy m_policy; + FunctorType m_functor; + + template <class TagType> + typename std::enable_if<std::is_same<TagType, void>::value>::type exec_one( + const std::int32_t w) const noexcept { + m_functor(w); + } + + template <class TagType> + typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_one( + const std::int32_t w) const noexcept { + const TagType t{}; + m_functor(t, w); + } + + public: + inline void execute() { +#pragma omp parallel num_threads(OpenMP::impl_thread_pool_size()) + { + // Spin until COMPLETED_TOKEN. + // END_TOKEN indicates no work is currently available. 
+ + for (std::int32_t w = Policy::END_TOKEN; + Policy::COMPLETED_TOKEN != (w = m_policy.pop_work());) { + if (Policy::END_TOKEN != w) { + exec_one<typename Policy::work_tag>(w); + m_policy.completed_work(w); + } + } + } + } + + inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_policy(arg_policy), m_functor(arg_functor) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fbb4245b8fb8b1e354452727ce9862c85a147c8 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -0,0 +1,253 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <algorithm> +#include <omp.h> + +/*--------------------------------------------------------------------------*/ + +#include <stdlib.h> +#include <stdint.h> +#include <memory.h> + +#include <iostream> +#include <sstream> +#include <cstring> + +#include <Kokkos_OpenMPTarget.hpp> +#include <Kokkos_OpenMPTargetSpace.hpp> +#include <impl/Kokkos_Error.hpp> +#include <Kokkos_Atomic.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { +/* Default allocation mechanism */ +OpenMPTargetSpace::OpenMPTargetSpace() {} + +void *OpenMPTargetSpace::allocate(const size_t arg_alloc_size) const { + static_assert(sizeof(void *) == sizeof(uintptr_t), + "Error sizeof(void*) != sizeof(uintptr_t)"); + + void *ptr; + + ptr = omp_target_alloc(arg_alloc_size, omp_get_default_device()); + + return ptr; +} + +void OpenMPTargetSpace::deallocate(void *const arg_alloc_ptr, + const size_t 
/*arg_alloc_size*/) const { + if (arg_alloc_ptr) { + omp_target_free(arg_alloc_ptr, omp_get_default_device()); + } +} +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +#ifdef KOKKOS_ENABLE_DEBUG +SharedAllocationRecord<void, void> SharedAllocationRecord< + Kokkos::Experimental::OpenMPTargetSpace, void>::s_root_record; +#endif + +SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, + void>::~SharedAllocationRecord() { + m_space.deallocate(SharedAllocationRecord<void, void>::m_alloc_ptr, + SharedAllocationRecord<void, void>::m_alloc_size); +} + +// TODO: Implement deep copy back see CudaSpace +std::string SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, + void>::get_label() const { + return std::string("OpenMPTargetAllocation"); +} + +SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>:: + SharedAllocationRecord( + const Kokkos::Experimental::OpenMPTargetSpace &arg_space, + const std::string &arg_label, const size_t arg_alloc_size, + const SharedAllocationRecord<void, void>::function_type arg_dealloc) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : base_t( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, + void>::s_root_record, +#endif + reinterpret_cast<SharedAllocationHeader *>(arg_space.allocate( + sizeof(SharedAllocationHeader) + arg_alloc_size)), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), + m_space(arg_space) { + SharedAllocationHeader header; + + this->base_t::_fill_host_accessible_header_info(header, arg_label); + + // TODO DeepCopy + // DeepCopy + Kokkos::Impl::DeepCopy<Experimental::OpenMPTargetSpace, HostSpace>( + RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); 
+} + +//---------------------------------------------------------------------------- + +void *SharedAllocationRecord<Kokkos::Experimental::OpenMPTargetSpace, void>:: + reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { + SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); + SharedAllocationRecord *const r_new = + allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); + + // Kokkos::Impl::DeepCopy<OpenMPTargetSpace,OpenMPTargetSpace>( r_new->data() + // , r_old->data() + // , std::min( r_old->size() , + // r_new->size() ) ); + + RecordBase::increment(r_new); + RecordBase::decrement(r_old); + + return r_new->data(); +} + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +template <class> +struct ViewOperatorBoundsErrorAbort; + +template <> +struct ViewOperatorBoundsErrorAbort<Kokkos::Experimental::OpenMPTargetSpace> { + static void apply(const size_t rank, const size_t n0, const size_t n1, + const size_t n2, const size_t n3, const size_t n4, + const size_t n5, const size_t n6, const size_t n7, + const size_t i0, const size_t i1, const size_t i2, + const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7); +}; + +void ViewOperatorBoundsErrorAbort<Kokkos::Experimental::OpenMPTargetSpace>:: + apply(const size_t rank, const size_t n0, const size_t n1, const size_t n2, + const size_t n3, const size_t n4, const size_t n5, const size_t n6, + const size_t n7, const size_t i0, const size_t i1, const size_t i2, + const size_t i3, const size_t i4, const size_t i5, const size_t i6, + const size_t i7) { + printf( + "View operator bounds error : rank(%lu) " + "dim(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu) " + "index(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu)", + rank, n0, n1, n2, n3, n4, n5, n6, n7, i0, i1, i2, i3, i4, i5, i6, i7); 
+ // Kokkos::Impl::throw_runtime_exception( buffer ); +} + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ +/* +namespace Kokkos { +namespace { + const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF; + const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39; + static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1]; +} + +namespace Impl { +void init_lock_array_host_space() { + static int is_initialized = 0; + if(! is_initialized) + for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++) + HOST_SPACE_ATOMIC_LOCKS[i] = 0; +} + +bool lock_address_host_space(void* ptr) { + return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[ + (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ +HOST_SPACE_ATOMIC_XOR_MASK] , 0 , 1); +} + +void unlock_address_host_space(void* ptr) { + atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[ + (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ +HOST_SPACE_ATOMIC_XOR_MASK] , 0); +} + +} +}*/ + +//============================================================================== +// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1 + +#include <impl/Kokkos_SharedAlloc_timpl.hpp> + +namespace Kokkos { +namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) not +// performance sensitive, we explicitly instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. 
+template class HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::OpenMPTargetSpace>; +template class SharedAllocationRecordCommon< + Kokkos::Experimental::OpenMPTargetSpace>; + +} // end namespace Impl +} // end namespace Kokkos + +// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Abort.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Abort.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ff07ce4f3ae86a32e0272891f5113536af38df07 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Abort.hpp @@ -0,0 +1,63 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGET_ABORT_HPP +#define KOKKOS_OPENMPTARGET_ABORT_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_OPENMPTARGET + +namespace Kokkos { +namespace Impl { + +KOKKOS_INLINE_FUNCTION void OpenMPTarget_abort(char const *msg) { + fprintf(stderr, "%s.\n", msg); + std::abort(); +} + +} // namespace Impl +} // namespace Kokkos + +#endif +#endif diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1ca30631af920badd089559874a7d24a7cfb63f7 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp @@ -0,0 +1,73 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGET_ERROR_HPP +#define KOKKOS_OPENMPTARGET_ERROR_HPP + +#include <impl/Kokkos_Error.hpp> +#include <sstream> + +namespace Kokkos { +namespace Impl { + +inline void ompt_internal_safe_call(int e, const char* name, + const char* file = nullptr, + const int line = 0) { + if (e != 0) { + std::ostringstream out; + out << name << " return value of " << e << " indicates failure"; + if (file) { + out << " " << file << ":" << line; + } + throw_runtime_exception(out.str()); + } +} + +#define OMPT_SAFE_CALL(call) \ + Kokkos::Impl::ompt_internal_safe_call(call, #call, __FILE__, __LINE__) + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f13875b440b63b729a64615a20da0f597a85cf6e --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -0,0 +1,169 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdio.h> +#include <limits> +#include <iostream> +#include <vector> +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Error.hpp> +#include <iostream> +#include <impl/Kokkos_CPUDiscovery.hpp> +#include <impl/Kokkos_Tools.hpp> + +#ifdef KOKKOS_ENABLE_OPENMPTARGET + +// FIXME_OPENMPTARGET currently unused +/* +namespace Kokkos { +namespace Impl { +namespace { + +KOKKOS_INLINE_FUNCTION +int kokkos_omp_in_parallel(); + +KOKKOS_INLINE_FUNCTION +int kokkos_omp_in_parallel() { return omp_in_parallel(); } + +bool s_using_hwloc = false; + +} // namespace +} // namespace Impl +} // namespace Kokkos +*/ + +namespace Kokkos { +namespace Impl { + +void OpenMPTargetExec::verify_is_process(const char* const label) { + if (omp_in_parallel()) { + std::string msg(label); + msg.append(" ERROR: in parallel"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetExec::verify_initialized(const char* const label) { + if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { + std::string msg(label); + msg.append(" ERROR: not initialized"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void* OpenMPTargetExec::m_scratch_ptr = nullptr; +int64_t OpenMPTargetExec::m_scratch_size = 0; +int* OpenMPTargetExec::m_lock_array = nullptr; +int64_t OpenMPTargetExec::m_lock_size = 0; +uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; + +void OpenMPTargetExec::clear_scratch() { + Kokkos::Experimental::OpenMPTargetSpace space; + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_ptr = nullptr; + m_scratch_size = 0; +} + +void OpenMPTargetExec::clear_lock_array() { + if (m_lock_array != nullptr) { + Kokkos::Experimental::OpenMPTargetSpace space; + space.deallocate(m_lock_array, m_lock_size); + m_lock_array = nullptr; + m_lock_size = 0; + } +} + +void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; } 
+ +void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, + int64_t shmem_size_L1) { + Kokkos::Experimental::OpenMPTargetSpace space; + const int64_t shmem_size = + shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. + const int64_t padding = shmem_size * 10 / 100; // Padding per team. + // Total amount of scratch memory allocated is dependent + // on the maximum number of in-flight teams possible. + int64_t total_size = + (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * + (MAX_ACTIVE_THREADS / team_size); + + if (total_size > m_scratch_size) { + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_size = total_size; + m_scratch_ptr = space.allocate(total_size); + } +} + +int* OpenMPTargetExec::get_lock_array(int num_teams) { + Kokkos::Experimental::OpenMPTargetSpace space; + int max_active_league_size = MAX_ACTIVE_THREADS / 32; + int lock_array_elem = + (num_teams > max_active_league_size) ? num_teams : max_active_league_size; + if (m_lock_size < (lock_array_elem * sizeof(int))) { + space.deallocate(m_lock_array, m_lock_size); + m_lock_size = lock_array_elem * sizeof(int); + m_lock_array = static_cast<int*>(space.allocate(m_lock_size)); + + // FIXME_OPENMPTARGET - Creating a target region here to initialize the + // lock_array with 0's fails. Hence creating an equivalent host array to + // achieve the same. Values of the host array are then copied to the lock_array. 
+ int* h_lock_array = static_cast<int*>( + omp_target_alloc(m_lock_size, omp_get_initial_device())); + + for (int i = 0; i < lock_array_elem; ++i) h_lock_array[i] = 0; + + OMPT_SAFE_CALL(omp_target_memcpy(m_lock_array, h_lock_array, m_lock_size, 0, + 0, omp_get_default_device(), + omp_get_initial_device())); + + omp_target_free(h_lock_array, omp_get_initial_device()); + } + + return m_lock_array; +} + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_ENABLE_OPENMPTARGET diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0b65e0d4a4b2270fdf577b4fffc1a10835467a47 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp @@ -0,0 +1,1729 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGETEXEC_HPP +#define KOKKOS_OPENMPTARGETEXEC_HPP + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Spinwait.hpp> + +#include <Kokkos_Atomic.hpp> +#include "Kokkos_OpenMPTarget_Abort.hpp" + +// FIXME_OPENMPTARGET - Using this macro to implement a workaround for +// hierarchical reducers. It avoids hitting the code path which we wanted to +// write but doesn't work. undef'ed at the end. 
+#define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class Reducer> +struct OpenMPTargetReducerWrapper { + using value_type = typename Reducer::value_type; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + KOKKOS_INLINE_FUNCTION + static void join(value_type&, const value_type&) { + printf( + "Using a generic unknown Reducer for the OpenMPTarget backend is not " + "implemented."); + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type&, const volatile value_type&) { + printf( + "Using a generic unknown Reducer for the OpenMPTarget backend is not " + "implemented."); + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type&) { + printf( + "Using a generic unknown Reducer for the OpenMPTarget backend is not " + "implemented."); + } +#pragma omp end declare target +}; + +template <class Scalar, class Space> +struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> { + public: + // Required + using value_type = typename std::remove_cv<Scalar>::type; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { dest += src; } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest += src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity<value_type>::sum(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Space> +struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> { + public: + // Required + using value_type = 
typename std::remove_cv<Scalar>::type; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { dest *= src; } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest *= src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity<value_type>::prod(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Space> +struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> { + public: + // Required + using value_type = typename std::remove_cv<Scalar>::type; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src < dest) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src < dest) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity<value_type>::min(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Space> +struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> { + public: + // Required + using value_type = typename std::remove_cv<Scalar>::type; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src > dest) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src > dest) dest = src; + } + + // Required + 
KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity<value_type>::max(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Space> +struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> { + public: + // Required + using value_type = typename std::remove_cv<Scalar>::type; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest = dest && src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest = dest && src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity<value_type>::land(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Space> +struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> { + public: + // Required + using value_type = typename std::remove_cv<Scalar>::type; + + using result_view_type = Kokkos::View<value_type, Space>; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest = dest || src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest = dest || src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity<value_type>::lor(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Space> +struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> { + public: + // Required + using value_type = typename std::remove_cv<Scalar>::type; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// 
fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest = dest & src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest = dest & src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity<value_type>::band(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Space> +struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> { + public: + // Required + using value_type = typename std::remove_cv<Scalar>::type; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest = dest | src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + dest = dest | src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val = reduction_identity<value_type>::bor(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Index, class Space> +struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> { + private: + using scalar_type = typename std::remove_cv<Scalar>::type; + using index_type = typename std::remove_cv<Index>::type; + + public: + // Required + using value_type = ValLocScalar<scalar_type, index_type>; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.val < dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.val < dest.val) dest = src; + } + + 
KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.val = reduction_identity<scalar_type>::min(); + val.loc = reduction_identity<index_type>::min(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Index, class Space> +struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> { + private: + using scalar_type = typename std::remove_cv<Scalar>::type; + using index_type = typename std::remove_cv<Index>::type; + + public: + // Required + using value_type = ValLocScalar<scalar_type, index_type>; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.val > dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.val > dest.val) dest = src; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.val = reduction_identity<scalar_type>::max(); + val.loc = reduction_identity<index_type>::min(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Space> +struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> { + private: + using scalar_type = typename std::remove_cv<Scalar>::type; + + public: + // Required + using value_type = MinMaxScalar<scalar_type>; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + } + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = 
src.min_val; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + } + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.max_val = reduction_identity<scalar_type>::max(); + val.min_val = reduction_identity<scalar_type>::min(); + } +#pragma omp end declare target +}; + +template <class Scalar, class Index, class Space> +struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> { + private: + using scalar_type = typename std::remove_cv<Scalar>::type; + using index_type = typename std::remove_cv<Index>::type; + + public: + // Required + using value_type = MinMaxLocScalar<scalar_type, index_type>; + +// WORKAROUND OPENMPTARGET +// This pragma omp declare target should not be necessary, but Intel compiler +// fails without it +#pragma omp declare target + // Required + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + dest.max_loc = src.max_loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& dest, const volatile value_type& src) { + if (src.min_val < dest.min_val) { + dest.min_val = src.min_val; + dest.min_loc = src.min_loc; + } + if (src.max_val > dest.max_val) { + dest.max_val = src.max_val; + dest.max_loc = src.max_loc; + } + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.max_val = reduction_identity<scalar_type>::max(); + val.min_val = reduction_identity<scalar_type>::min(); + val.max_loc = reduction_identity<index_type>::min(); + val.min_loc = reduction_identity<index_type>::min(); + } +#pragma omp end declare target +}; +/* +template<class ReducerType> +class OpenMPTargetReducerWrapper { + public: + const ReducerType& reducer; + using value_type = typename ReducerType::value_type; + value_type& value; + + KOKKOS_INLINE_FUNCTION + void join(const value_type& upd) 
{ + reducer.join(value,upd); + } + + KOKKOS_INLINE_FUNCTION + void init(const value_type& upd) { + reducer.init(value,upd); + } +};*/ + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +/** \brief Data for OpenMPTarget thread execution */ + +class OpenMPTargetExec { + public: + // FIXME_OPENMPTARGET - Currently the maximum number of + // teams possible is calculated based on NVIDIA's Volta GPU. In + // future this value should be based on the chosen architecture for the + // OpenMPTarget backend. + enum { MAX_ACTIVE_THREADS = 2080 * 80 }; + enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS / 32 }; + + private: + static void* scratch_ptr; + + public: + static void verify_is_process(const char* const); + static void verify_initialized(const char* const); + + static int* get_lock_array(int num_teams); + static void* get_scratch_ptr(); + static void clear_scratch(); + static void clear_lock_array(); + static void resize_scratch(int64_t team_reduce_bytes, + int64_t team_shared_bytes, + int64_t thread_local_bytes); + + static void* m_scratch_ptr; + static int64_t m_scratch_size; + static int* m_lock_array; + static int64_t m_lock_size; + static uint32_t* m_uniquetoken_ptr; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +class OpenMPTargetExecTeamMember { + public: + enum { TEAM_REDUCE_SIZE = 512 }; + + /** \brief Thread states for team synchronization */ + enum { Active = 0, Rendezvous = 1 }; + + using execution_space = Kokkos::Experimental::OpenMPTarget; + using scratch_memory_space = execution_space::scratch_memory_space; + + scratch_memory_space m_team_shared; + int m_team_scratch_size[2]; + int m_team_rank; + int m_team_size; + int m_league_rank; + int 
m_league_size; + int m_vector_length; + int m_vector_lane; + int m_shmem_block_index; + void* m_glb_scratch; + void* m_reduce_scratch; + + /* + // Fan-in team threads, root of the fan-in which does not block returns true + inline + bool team_fan_in() const + { + memory_fence(); + for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( + m_team_rank_rev & n ) ; n <<= 1 ) { + + m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active ); + } + + if ( m_team_rank_rev ) { + m_exec.state_set( Rendezvous ); + memory_fence(); + m_exec.state_wait( Rendezvous ); + } + + return 0 == m_team_rank_rev ; + } + + inline + void team_fan_out() const + { + memory_fence(); + for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( + m_team_rank_rev & n ) ; n <<= 1 ) { m_exec.pool_rev( m_team_base_rev + j + )->state_set( Active ); memory_fence(); + } + } + */ + public: + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_shmem() const { + return m_team_shared.set_team_thread_mode(0, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_scratch(int level) const { + return m_team_shared.set_team_thread_mode(level, 1, + m_team_scratch_size[level]); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& thread_scratch(int level) const { + return m_team_shared.set_team_thread_mode(level, team_size(), team_rank()); + } + + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; } + KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; } + KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; } + KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const { + return m_reduce_scratch; + } + + KOKKOS_INLINE_FUNCTION void team_barrier() const { +#pragma omp barrier + } + + template <class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value, + 
int thread_id) const { + // Make sure there is enough scratch space: + using type = + typename std::conditional<(sizeof(ValueType) < TEAM_REDUCE_SIZE), + ValueType, void>::type; + type* team_scratch = reinterpret_cast<type*>( + ((char*)(m_glb_scratch) + TEAM_REDUCE_SIZE * omp_get_team_num())); +#pragma omp barrier + if (team_rank() == thread_id) *team_scratch = value; +#pragma omp barrier + value = *team_scratch; + } + + template <class Closure, class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value, + const int& thread_id) const { + f(value); + team_broadcast(value, thread_id); + } + + template <class ValueType, class JoinOp> + KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value, + const JoinOp& op_in) const { +#pragma omp barrier + + using value_type = ValueType; + const JoinLambdaAdapter<value_type, JoinOp> op(op_in); + + // Make sure there is enough scratch space: + using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE), + value_type, void>; + + const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type); + type* team_scratch = + (type*)((char*)m_glb_scratch + TEAM_REDUCE_SIZE * omp_get_team_num()); + for (int i = m_team_rank; i < n_values; i += m_team_size) { + team_scratch[i] = value_type(); + } + +#pragma omp barrier + + for (int k = 0; k < m_team_size; k += n_values) { + if ((k <= m_team_rank) && (k + n_values > m_team_rank)) + team_scratch[m_team_rank % n_values] += value; +#pragma omp barrier + } + + for (int d = 1; d < n_values; d *= 2) { + if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) { + team_scratch[m_team_rank] += team_scratch[m_team_rank + d]; + } +#pragma omp barrier + } + return team_scratch[0]; + } + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. 
+ * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template <typename ArgType> + KOKKOS_INLINE_FUNCTION ArgType + team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { + // FIXME_OPENMPTARGET + /* // Make sure there is enough scratch space: + using type = + std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>; + + volatile type * const work_value = ((type*) m_exec.scratch_thread()); + + *work_value = value ; + + memory_fence(); + + if ( team_fan_in() ) { + // The last thread to synchronize returns true, all other threads wait + for team_fan_out() + // m_team_base[0] == highest ranking team member + // m_team_base[ m_team_size - 1 ] == lowest ranking team member + // + // 1) copy from lower to higher rank, initialize lowest rank to zero + // 2) prefix sum from lowest to highest rank, skipping lowest rank + + type accum = 0 ; + + if ( global_accum ) { + for ( int i = m_team_size ; i-- ; ) { + type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i + )->scratch_thread()); accum += val ; + } + accum = atomic_fetch_add( global_accum , accum ); + } + + for ( int i = m_team_size ; i-- ; ) { + type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i + )->scratch_thread()); const type offset = accum ; accum += val ; val = + offset ; + } + + memory_fence(); + } + + team_fan_out(); + + return *work_value ;*/ + return ArgType(); + } + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. 
+ * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template <typename Type> + KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const { + return this->template team_scan<Type>(value, 0); + } + + //---------------------------------------- + // Private for the driver + + private: + using space = execution_space::scratch_memory_space; + + public: + // FIXME_OPENMPTARGET - 512(16*32) bytes at the begining of the scratch space + // for each league is saved for reduction. It should actually be based on the + // ValueType of the reduction variable. + inline OpenMPTargetExecTeamMember( + const int league_rank, const int league_size, const int team_size, + const int vector_length // const TeamPolicyInternal< OpenMPTarget, + // Properties ...> & team + , + void* const glb_scratch, const int shmem_block_index, + const int shmem_size_L0, const int shmem_size_L1) + : m_team_scratch_size{shmem_size_L0, shmem_size_L1}, + m_team_rank(0), + m_team_size(team_size), + m_league_rank(league_rank), + m_league_size(league_size), + m_vector_length(vector_length), + m_shmem_block_index(shmem_block_index), + m_glb_scratch(glb_scratch) { + const int omp_tid = omp_get_thread_num(); + m_team_shared = scratch_memory_space( + ((char*)glb_scratch + + m_shmem_block_index * + (shmem_size_L0 + shmem_size_L1 + + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE)), + shmem_size_L0, + ((char*)glb_scratch + + m_shmem_block_index * (shmem_size_L0 + shmem_size_L1 + + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + + TEAM_REDUCE_SIZE)) + + shmem_size_L0 + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + + TEAM_REDUCE_SIZE, + shmem_size_L1); + m_reduce_scratch = + (char*)glb_scratch + + shmem_block_index * + (shmem_size_L0 + shmem_size_L1 + + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE); + m_league_rank = league_rank; + m_team_rank = omp_tid; + m_vector_lane = 0; + } + + static inline int 
team_reduce_size() { return TEAM_REDUCE_SIZE; } +}; + +template <class... Properties> +class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...> + : public PolicyTraits<Properties...> { + public: + //! Tag this class as a kokkos execution policy + using execution_policy = TeamPolicyInternal; + + using traits = PolicyTraits<Properties...>; + + //---------------------------------------- + + template <class FunctorType> + inline static int team_size_max(const FunctorType&, const ParallelForTag&) { + return 256; + } + + template <class FunctorType> + inline static int team_size_max(const FunctorType&, + const ParallelReduceTag&) { + return 256; + } + + template <class FunctorType, class ReducerType> + inline static int team_size_max(const FunctorType&, const ReducerType&, + const ParallelReduceTag&) { + return 256; + } + + template <class FunctorType> + inline static int team_size_recommended(const FunctorType&, + const ParallelForTag&) { + return 128; + } + + template <class FunctorType> + inline static int team_size_recommended(const FunctorType&, + const ParallelReduceTag&) { + return 128; + } + + template <class FunctorType, class ReducerType> + inline static int team_size_recommended(const FunctorType&, + const ReducerType&, + const ParallelReduceTag&) { + return 128; + } + + //---------------------------------------- + + private: + int m_league_size; + int m_team_size; + int m_vector_length; + int m_team_alloc; + int m_team_iter; + std::array<size_t, 2> m_team_scratch_size; + std::array<size_t, 2> m_thread_scratch_size; + bool m_tune_team_size; + bool m_tune_vector_length; + constexpr const static size_t default_team_size = 256; + int m_chunk_size; + + inline void init(const int league_size_request, const int team_size_request, + const int vector_length_request) { + m_league_size = league_size_request; + + // Minimum team size should be 32 for OpenMPTarget backend. 
+ if (team_size_request < 32) { + Kokkos::Impl::OpenMPTarget_abort( + "OpenMPTarget backend requires a minimum of 32 threads per team.\n"); + } else + m_team_size = team_size_request; + + m_vector_length = vector_length_request; + set_auto_chunk_size(); + } + + template <typename ExecSpace, typename... OtherProperties> + friend class TeamPolicyInternal; + + public: + inline bool impl_auto_team_size() const { return m_tune_team_size; } + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline void impl_set_team_size(const size_t size) { m_team_size = size; } + inline void impl_set_vector_length(const size_t length) { + m_tune_vector_length = length; + } + inline int impl_vector_length() const { return m_vector_length; } + KOKKOS_DEPRECATED inline int vector_length() const { + return impl_vector_length(); + } + inline int team_size() const { return m_team_size; } + inline int league_size() const { return m_league_size; } + inline size_t scratch_size(const int& level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + inline Kokkos::Experimental::OpenMPTarget space() const { + return Kokkos::Experimental::OpenMPTarget(); + } + + template <class... 
OtherProperties> + TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p) + : m_league_size(p.m_league_size), + m_team_size(p.m_team_size), + m_vector_length(p.m_vector_length), + m_team_alloc(p.m_team_alloc), + m_team_iter(p.m_team_iter), + m_team_scratch_size(p.m_team_scratch_size), + m_thread_scratch_size(p.m_thread_scratch_size), + m_tune_team_size(p.m_tune_team_size), + m_tune_vector_length(p.m_tune_vector_length), + m_chunk_size(p.m_chunk_size) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(typename traits::execution_space&, int league_size_request, + int team_size_request, int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, team_size_request, vector_length_request); + } + + TeamPolicyInternal(typename traits::execution_space&, int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, default_team_size / vector_length_request, + vector_length_request); + } + + TeamPolicyInternal(typename traits::execution_space&, int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, default_team_size, 1); + } + TeamPolicyInternal(typename traits::execution_space&, int league_size_request, + int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, 
team_size_request, 1); + } + + TeamPolicyInternal(int league_size_request, int team_size_request, + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, team_size_request, vector_length_request); + } + + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int vector_length_request = 1) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(false), + m_chunk_size(0) { + init(league_size_request, default_team_size / vector_length_request, + vector_length_request); + } + + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(true), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, default_team_size, 1); + } + TeamPolicyInternal(int league_size_request, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */) + : m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_tune_team_size(false), + m_tune_vector_length(true), + m_chunk_size(0) { + init(league_size_request, team_size_request, 1); + } + inline static size_t vector_length_max() { + return 32; /* TODO: this is bad. 
Need logic that is compiler and backend + aware */ + } + inline int team_alloc() const { return m_team_alloc; } + inline int team_iter() const { return m_team_iter; } + + inline int chunk_size() const { return m_chunk_size; } + + /** \brief set chunk_size to a discrete value*/ + inline TeamPolicyInternal& set_chunk_size( + typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size(const int& level, + const PerTeamValue& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerThreadValue& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + /** \brief set per thread and per team scratch size for a specific level of + * the scratch hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerTeamValue& per_team, + const PerThreadValue& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + private: + /** \brief finalize chunk_size if it was set to AUTO*/ + inline void set_auto_chunk_size() { + int concurrency = 2048 * 128; + + if (concurrency == 0) concurrency = 1; + + if (m_chunk_size > 0) { + if (!Impl::is_integral_power_of_two(m_chunk_size)) + Kokkos::abort("TeamPolicy blocking granularity must be power of two"); + } + + int new_chunk_size = 1; + while (new_chunk_size * 100 * concurrency < m_league_size) + new_chunk_size *= 2; + if (new_chunk_size < 128) { + new_chunk_size = 1; + while ((new_chunk_size * 40 * concurrency < m_league_size) && + (new_chunk_size < 128)) + new_chunk_size *= 2; + } + m_chunk_size = new_chunk_size; + } + + public: + using member_type 
= Impl::OpenMPTargetExecTeamMember; +}; +} // namespace Impl + +} // namespace Kokkos + +namespace Kokkos { + +template <typename iType> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember> +TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType& count) { + return Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, + Impl::OpenMPTargetExecTeamMember> +TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType1& begin, const iType2& end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin), + iType(end)); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember> +ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType& count) { + return Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, + Impl::OpenMPTargetExecTeamMember> +ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType1& arg_begin, const iType2& arg_end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), + iType(arg_end)); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember> +TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& 
thread, + const iType& count) { + return Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, + Impl::OpenMPTargetExecTeamMember> +TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, + const iType1& arg_begin, const iType2& arg_end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), + iType(arg_end)); +} + +KOKKOS_INLINE_FUNCTION +Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam( + const Impl::OpenMPTargetExecTeamMember& thread) { + return Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread); +} + +KOKKOS_INLINE_FUNCTION +Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread( + const Impl::OpenMPTargetExecTeamMember& thread) { + return Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread); +} +} // namespace Kokkos + +namespace Kokkos { + +/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each + * i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. + */ +template <typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda) { +#pragma omp for nowait schedule(static, 1) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); +} + +/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team + * and a summation of val is performed and put into result. 
+ */ + +template <typename iType, class Lambda, typename ValueType> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<!Kokkos::is_reducer_type<ValueType>::value> + parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. + static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + ValueType* TeamThread_scratch = + static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp barrier + TeamThread_scratch[0] = ValueType(); +#pragma omp barrier + + if constexpr (std::is_arithmetic<ValueType>::value) { +#pragma omp for reduction(+ : TeamThread_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + TeamThread_scratch[0] += tmp; + } + } else { +#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) + +#pragma omp for reduction(custom : TeamThread_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + TeamThread_scratch[0] += tmp; + } + } + + result = TeamThread_scratch[0]; +} + +#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND) +// For some reason the actual version we wanted to write doesn't work +// and crashes. 
We should try this with every new compiler +// This is the variant we actually wanted to write +template <typename iType, class Lambda, typename ReducerType> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value> + parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ReducerType result) { + using ValueType = typename ReducerType::value_type; + +#pragma omp declare reduction( \ + custominner:ValueType \ + : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) + + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. + static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + ValueType* TeamThread_scratch = + static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp barrier + // These three lines all cause crash + Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]); +// result.init(TeamThread_scratch[0]); +// Impl::OpenMPTargetReducerWrapper<ReducerType> red; +// red.init(TeamThread_scratch[0]); +#pragma omp barrier + +#pragma omp for reduction(custominner : TeamThread_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp; + result.init(tmp); + lambda(i, tmp); + // This line causes a crash + Impl::OpenMPTargetReducerWrapper<ReducerType>::join(TeamThread_scratch[0], + tmp); + } + result.reference() = TeamThread_scratch[0]; +} +#else +template <typename iType, class Lambda, typename ReducerType> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value> + parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct< + iType, 
Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ReducerType result) { + using ValueType = typename ReducerType::value_type; + + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. + static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + + ValueType* TeamThread_scratch = + static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp declare reduction( \ + omp_red_teamthread_reducer:ValueType \ + : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) + +#pragma omp barrier + ValueType tmp; + result.init(tmp); + TeamThread_scratch[0] = tmp; +#pragma omp barrier + + iType team_size = iType(omp_get_num_threads()); +#pragma omp for reduction(omp_red_teamthread_reducer \ + : TeamThread_scratch[:1]) schedule(static, 1) + for (iType t = 0; t < team_size; t++) { + ValueType tmp2; + result.init(tmp2); + + for (iType i = loop_boundaries.start + t; i < loop_boundaries.end; + i += team_size) { + lambda(i, tmp2); + } + TeamThread_scratch[0] = tmp2; + } + + result.reference() = TeamThread_scratch[0]; +} +#endif // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread + * and a reduction of val is performed using JoinType(ValueType& val, const + * ValueType& update) and put into init_result. The input value of init_result + * is used as initializer for temporary variables of ValueType. Therefore the + * input value should be the neutral element with respect to the join operation + * (e.g. '0 for +-' or '1 for *'). 
+ */ +template <typename iType, class Lambda, typename ValueType, class JoinType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, const JoinType& join, ValueType& init_result) { + ValueType* TeamThread_scratch = + static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch()); + + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. + static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + +#pragma omp barrier + TeamThread_scratch[0] = init_result; +#pragma omp barrier + + if constexpr (std::is_arithmetic<ValueType>::value) { +#pragma omp for reduction(+ : TeamThread_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + TeamThread_scratch[0] += tmp; + } + } else { +#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) + +#pragma omp for reduction(custom : TeamThread_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + join(TeamThread_scratch[0], tmp); + } + } + + init_result = TeamThread_scratch[0]; +} + +// This is largely the same code as in HIP and CUDA except for the member name +template <typename iType, class FunctorType> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds, + const FunctorType& lambda) { + // Extract value_type from lambda + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, + FunctorType>::value_type; + + const auto start = loop_bounds.start; + const auto end = loop_bounds.end; + // Note this 
thing is called .member in the CUDA specialization of + // TeamThreadRangeBoundariesStruct + auto& member = loop_bounds.team; + const auto team_size = member.team_size(); + const auto team_rank = member.team_rank(); + const auto nchunk = (end - start + team_size - 1) / team_size; + value_type accum = 0; + // each team has to process one or more chunks of the prefix scan + for (iType i = 0; i < nchunk; ++i) { + auto ii = start + i * team_size + team_rank; + // local accumulation for this chunk + value_type local_accum = 0; + // user updates value with prefix value + if (ii < loop_bounds.end) lambda(ii, local_accum, false); + // perform team scan + local_accum = member.team_scan(local_accum); + // add this blocks accum to total accumulation + auto val = accum + local_accum; + // user updates their data with total accumulation + if (ii < loop_bounds.end) lambda(ii, val, true); + // the last value needs to be propogated to next chunk + if (team_rank == team_size - 1) accum = val; + // broadcast last value to rest of the team + member.team_broadcast(accum, team_size - 1); + } +} + +} // namespace Kokkos +#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND + +namespace Kokkos { +/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each + * i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread. + */ +template <typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda) { +#pragma omp simd + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread + * and a summation of val is performed and put into result. 
+ */ +template <typename iType, class Lambda, typename ValueType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + ValueType vector_reduce = ValueType(); + + if constexpr (std::is_arithmetic<ValueType>::value) { +#pragma omp simd reduction(+ : vector_reduce) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + vector_reduce += tmp; + } + } else { +#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) + +#pragma omp simd reduction(custom : vector_reduce) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, vector_reduce); + } + } + + result = vector_reduce; +} + +template <typename iType, class Lambda, typename ReducerType> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value> + parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ReducerType const& result) { + using ValueType = typename ReducerType::value_type; + +#pragma omp declare reduction( \ + custom:ValueType \ + : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) + + ValueType vector_reduce; + Impl::OpenMPTargetReducerWrapper<ReducerType>::init(vector_reduce); + +#pragma omp simd reduction(custom : vector_reduce) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + lambda(i, vector_reduce); + } + + result.reference() = vector_reduce; +} + +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. 
+ * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread + * and a reduction of val is performed using JoinType(ValueType& val, const + * ValueType& update) and put into init_result. The input value of init_result + * is used as initializer for temporary variables of ValueType. Therefore the + * input value should be the neutral element with respect to the join operation + * (e.g. '0 for +-' or '1 for *'). + */ +template <typename iType, class Lambda, typename ValueType, class JoinType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, const JoinType& join, ValueType& init_result) { + ValueType result = init_result; + + // FIXME_OPENMPTARGET think about omp simd + // join does not work with omp reduction clause + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + join(result, tmp); + } + + init_result = result; +} + +/** \brief Intra-thread vector parallel exclusive prefix sum. Executes + * lambda(iType i, ValueType & val, bool final) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan + * operation is performed. Depending on the target execution space the operator + * might be called twice: once with final=false and once with final=true. When + * final==true val contains the prefix sum value. The contribution of this "i" + * needs to be added to val no matter whether final==true or not. In a serial + * execution (i.e. team_size==1) the operator is only called once with + * final==true. Scan_val will be set to the final sum value over all vector + * lanes. 
+ */ +template <typename iType, class FunctorType> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const FunctorType& lambda) { + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>; + using value_type = typename ValueTraits::value_type; + + value_type scan_val = value_type(); + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, scan_val, true); + } +} + +} // namespace Kokkos + +namespace Kokkos { +/** \brief Intra-team vector parallel_for. Executes lambda(iType i) for each + * i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling team. + */ +template <typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda) { +#pragma omp for simd nowait schedule(static, 1) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); +} + +/** \brief Intra-team vector parallel_reduce. Executes lambda(iType i, + * ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling team + * and a summation of val is performed and put into result. + */ +template <typename iType, class Lambda, typename ValueType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ValueType& result) { + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. 
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  // Team-shared scratch slot used as the reduction target for all threads.
+  ValueType* TeamVector_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+  // Zero the shared slot; the barriers order this write against the
+  // reduction loop below.
+#pragma omp barrier
+  TeamVector_scratch[0] = ValueType();
+#pragma omp barrier
+
+  if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp for simd reduction(+ : TeamVector_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamVector_scratch[0] += tmp;
+    }
+  } else {
+    // Non-arithmetic value types: declare a custom '+=' reduction so the
+    // clause can combine the per-thread private copies.
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+
+#pragma omp for simd reduction(custom : TeamVector_scratch[:1])
+    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+      ValueType tmp = ValueType();
+      lambda(i, tmp);
+      TeamVector_scratch[0] += tmp;
+    }
+  }
+
+  result = TeamVector_scratch[0];
+}
+
+#if !defined(KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND)
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value>
+    parallel_reduce(
+        const Impl::TeamVectorRangeBoundariesStruct<
+            iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+        const Lambda& lambda, ReducerType const& result) {
+  using ValueType = typename ReducerType::value_type;
+
+  // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of
+  // elements in the array <= 32. For reduction we allocate, 16 bytes per
+  // element in the scratch space, hence, 16*32 = 512.
+ static_assert(sizeof(ValueType) <= + Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); + +#pragma omp declare reduction( \ + custom:ValueType \ + : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ + initializer( \ + Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) + + ValueType* TeamVector_scratch = + static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch()); + +#pragma omp barrier + Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamVector_scratch[0]); +#pragma omp barrier + +#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { + ValueType tmp = ValueType(); + lambda(i, tmp); + TeamVector_scratch[0] += tmp; + } + + result.reference() = TeamVector_scratch[0]; +} +#else +template <typename iType, class Lambda, typename ReducerType> +KOKKOS_INLINE_FUNCTION + std::enable_if_t<Kokkos::is_reducer_type<ReducerType>::value> + parallel_reduce( + const Impl::TeamVectorRangeBoundariesStruct< + iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, + const Lambda& lambda, ReducerType const& result) { + using ValueType = typename ReducerType::value_type; + + // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of + // elements in the array <= 32. For reduction we allocate, 16 bytes per + // element in the scratch space, hence, 16*32 = 512. 
+  static_assert(sizeof(ValueType) <=
+                Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
+
+  ValueType* TeamVector_scratch =
+      static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
+
+  // User-defined reduction built from the reducer's join/init so the OpenMP
+  // reduction clause can combine the per-thread private copies.
+#pragma omp declare reduction(                                         \
+    omp_red_teamthread_reducer:ValueType                               \
+    : Impl::OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(                                                       \
+        Impl::OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp barrier
+  ValueType tmp;
+  result.init(tmp);
+  TeamVector_scratch[0] = tmp;
+#pragma omp barrier
+
+  // schedule(static, 1) with exactly team_size iterations hands each thread
+  // one 't'; that thread then strides over the full range below.
+  iType team_size = iType(omp_get_num_threads());
+#pragma omp for simd reduction(omp_red_teamthread_reducer \
+                               : TeamVector_scratch[:1]) schedule(static, 1)
+  for (iType t = 0; t < team_size; t++) {
+    ValueType tmp2;
+    result.init(tmp2);
+
+    for (iType i = loop_boundaries.start + t; i < loop_boundaries.end;
+         i += team_size) {
+      lambda(i, tmp2);
+    }
+    TeamVector_scratch[0] = tmp2;
+  }
+
+  result.reference() = TeamVector_scratch[0];
+}
+#endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+}  // namespace Kokkos
+
+#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+
+namespace Kokkos {
+
+// Vector-level single: the lambda is executed unconditionally by the caller.
+template <class FunctorType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
+    /*single_struct*/,
+    const FunctorType& lambda) {
+  lambda();
+}
+
+// Thread-level single: only team rank 0 executes the lambda.
+template <class FunctorType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
+        single_struct,
+    const FunctorType& lambda) {
+  if (single_struct.team_member.team_rank() == 0) lambda();
+}
+
+// Vector-level single with a result value: the lambda fills val directly.
+template <class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>&
+    /*single_struct*/,
+    const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+
+// Thread-level single with a result value.
+template <class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION void single(
+    const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>&
single_struct, + const FunctorType& lambda, ValueType& val) { + if (single_struct.team_member.team_rank() == 0) { + lambda(val); + } + single_struct.team_member.team_broadcast(val, 0); +} +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4a79b72732dafb9bd93613723551ec7a9b01ddd1 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -0,0 +1,209 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) + +// FIXME_OPENMPTARGET - macro for workaround implementation in UniqueToken +// constructor. undef'ed at the end +#define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND + +#include <Kokkos_OpenMPTarget.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp> + +#include <sstream> + +namespace Kokkos { +namespace Experimental { +namespace Impl { +void OpenMPTargetInternal::fence() {} +int OpenMPTargetInternal::concurrency() { return 128000; } +const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; } +void OpenMPTargetInternal::print_configuration(std::ostream& /*stream*/, + const bool) { + // FIXME_OPENMPTARGET + printf("Using OpenMPTarget\n"); +} + +void OpenMPTargetInternal::impl_finalize() { + m_is_initialized = false; + Kokkos::Impl::OpenMPTargetExec space; + if (space.m_lock_array != nullptr) space.clear_lock_array(); + + if (space.m_uniquetoken_ptr != nullptr) + Kokkos::kokkos_free<Kokkos::Experimental::OpenMPTargetSpace>( + space.m_uniquetoken_ptr); +} +void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; } +int OpenMPTargetInternal::impl_is_initialized() { + return m_is_initialized ? 
1 : 0; +} + +OpenMPTargetInternal* OpenMPTargetInternal::impl_singleton() { + static OpenMPTargetInternal self; + return &self; +} + +} // Namespace Impl + +OpenMPTarget::OpenMPTarget() + : m_space_instance(Impl::OpenMPTargetInternal::impl_singleton()) {} + +const char* OpenMPTarget::name() { + return Impl::OpenMPTargetInternal::impl_singleton()->name(); +} +void OpenMPTarget::print_configuration(std::ostream& stream, + const bool detail) { + m_space_instance->print_configuration(stream, detail); +} + +int OpenMPTarget::concurrency() { + return Impl::OpenMPTargetInternal::impl_singleton()->concurrency(); +} +void OpenMPTarget::fence() { + Impl::OpenMPTargetInternal::impl_singleton()->fence(); +} + +void OpenMPTarget::impl_initialize() { m_space_instance->impl_initialize(); } +void OpenMPTarget::impl_finalize() { m_space_instance->impl_finalize(); } +int OpenMPTarget::impl_is_initialized() { + return Impl::OpenMPTargetInternal::impl_singleton()->impl_is_initialized(); +} +} // Namespace Experimental + +namespace Impl { +int g_openmptarget_space_factory_initialized = + Kokkos::Impl::initialize_space_factory<OpenMPTargetSpaceInitializer>( + "160_OpenMPTarget"); + +void OpenMPTargetSpaceInitializer::initialize(const InitArguments& args) { + // Prevent "unused variable" warning for 'args' input struct. If + // Serial::initialize() ever needs to take arguments from the input + // struct, you may remove this line of code. 
+  (void)args;
+
+  // Initialize the backend here only when OpenMPTarget is the default
+  // execution space.
+  if (std::is_same<Kokkos::Experimental::OpenMPTarget,
+                   Kokkos::DefaultExecutionSpace>::value) {
+    Kokkos::Experimental::OpenMPTarget().impl_initialize();
+    // std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized"
+    // << std::endl ;
+  } else {
+    // std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not
+    // initialized" << std::endl ;
+  }
+}
+
+// Finalize if OpenMPTarget is the default execution space or all spaces are
+// being torn down, and only when it was actually initialized.
+void OpenMPTargetSpaceInitializer::finalize(const bool all_spaces) {
+  if (std::is_same<Kokkos::Experimental::OpenMPTarget,
+                   Kokkos::DefaultExecutionSpace>::value ||
+      all_spaces) {
+    if (Kokkos::Experimental::OpenMPTarget().impl_is_initialized())
+      Kokkos::Experimental::OpenMPTarget().impl_finalize();
+  }
+}
+
+void OpenMPTargetSpaceInitializer::fence() {
+  Kokkos::Experimental::OpenMPTarget::fence();
+}
+
+void OpenMPTargetSpaceInitializer::print_configuration(std::ostream& msg,
+                                                       const bool detail) {
+  msg << "OpenMPTarget Execution Space:" << std::endl;
+  msg << " KOKKOS_ENABLE_OPENMPTARGET: ";
+  msg << "yes" << std::endl;
+
+  msg << "\nOpenMPTarget Runtime Configuration:" << std::endl;
+  Kokkos::Experimental::OpenMPTarget().print_configuration(msg, detail);
+}
+
+}  // namespace Impl
+}  // Namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+// Global-scope UniqueToken: lazily allocates one zero-initialized uint32_t
+// slot per unit of concurrency in device memory and caches the allocation in
+// OpenMPTargetExec::m_uniquetoken_ptr.
+UniqueToken<Kokkos::Experimental::OpenMPTarget,
+            Kokkos::Experimental::UniqueTokenScope::Global>::
+    UniqueToken(Kokkos::Experimental::OpenMPTarget const&) {
+#ifdef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND
+  uint32_t* ptr = Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr;
+  int count     = Kokkos::Experimental::OpenMPTarget().concurrency();
+  if (ptr == nullptr) {
+    // NOTE(review): 'size' is an int; fine for current concurrency() values
+    // but would overflow for counts near INT_MAX/4 -- confirm acceptable.
+    int size = count * sizeof(uint32_t);
+    ptr      = static_cast<uint32_t*>(
+        Kokkos::kokkos_malloc<Kokkos::Experimental::OpenMPTargetSpace>(
+            "Kokkos::OpenMPTarget::m_uniquetoken_ptr", size));
+    // Zero-fill the device allocation by copying a host buffer of zeros.
+    std::vector<uint32_t> h_buf(count, 0);
+    OMPT_SAFE_CALL(omp_target_memcpy(ptr, h_buf.data(), size, 0, 0,
+                                     omp_get_default_device(),
+                                     omp_get_initial_device()));
+
Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr = ptr; + } +#else +// FIXME_OPENMPTARGET - 2 versions of non-working implementations to fill `ptr` +// with 0's +// Version 1 - Creating a target region and filling the +// pointer Error - CUDA error: named symbol not found +#pragma omp target teams distribute parallel for is_device_ptr(ptr) \ + map(to \ + : size) + for (int i = 0; i < count; ++i) ptr[i] = 0; + + // Version 2 : Allocating a view on the device and filling it with a scalar + // value of 0. + Kokkos::View<uint32_t*, Kokkos::Experimental::OpenMPTargetSpace> ptr_view( + ptr, count); + Kokkos::deep_copy(ptr_view, 0); +#endif + m_buffer = ptr; + m_count = count; +} +} // namespace Experimental +} // namespace Kokkos + +#undef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND +#endif // defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a1caf90c195b98511b4476db73345816af2b4669 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp @@ -0,0 +1,89 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGET_INSTANCE_HPP +#define KOKKOS_OPENMPTARGET_INSTANCE_HPP + +#include <Kokkos_Core.hpp> + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +class OpenMPTargetInternal { + private: + OpenMPTargetInternal() = default; + OpenMPTargetInternal(const OpenMPTargetInternal&) = default; + OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default; + + public: + void fence(); + + /** \brief Return the maximum amount of concurrency. */ + int concurrency(); + + //! Print configuration information to the given output stream. 
+ void print_configuration(std::ostream&, const bool detail = false); + + static const char* name(); + + //! Free any resources being consumed by the device. + void impl_finalize(); + + //! Has been initialized + int impl_is_initialized(); + + //! Initialize, telling the CUDA run-time library which device to use. + void impl_initialize(); + + static OpenMPTargetInternal* impl_singleton(); + + private: + bool m_is_initialized = false; +}; +} // Namespace Impl +} // Namespace Experimental +} // Namespace Kokkos + +#endif // KOKKOS_OPENMPTARGET_INSTANCE_HPP diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a4092c3a37a7e9a1493576c5efe783334982a391 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -0,0 +1,939 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGET_PARALLEL_HPP +#define KOKKOS_OPENMPTARGET_PARALLEL_HPP + +#include <omp.h> +#include <sstream> +#include <Kokkos_Parallel.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +#define KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... 
Traits> +class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::RangePolicy<Traits...>; + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + + public: + inline void execute() const { execute_impl<WorkTag>(); } + /* + template <class TagType> + inline typename std::enable_if<std::is_same<TagType, void>::value>::type + execute_impl() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const typename Policy::member_type begin = m_policy.begin(); + const typename Policy::member_type end = m_policy.end(); + + #pragma omp target teams distribute parallel for map(to: this->m_functor) + for (int i = begin; i < end; i++) m_functor(i); + } + */ + template <class TagType> + inline void execute_impl() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto begin = m_policy.begin(); + const auto end = m_policy.end(); + + if (end <= begin) return; + + FunctorType a_functor(m_functor); + + if constexpr (std::is_same<TagType, void>::value) { +#pragma omp target teams distribute parallel for map(to : a_functor) + for (auto i = begin; i < end; i++) a_functor(i); + } else { +#pragma omp target teams distribute parallel for map(to : a_functor) + for (auto i = begin; i < end; i++) a_functor(TagType(), i); + } + } + + inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class PolicyType, class ReducerType, + class PointerType, class ValueType, bool FunctorHasJoin, + bool UseReducerType> +struct ParallelReduceSpecialize { + static inline void execute(const FunctorType& /*f*/, const PolicyType& /*p*/, + PointerType /*result_ptr*/) { + std::stringstream error_message; + error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' ' + << UseReducerType << '\n'; + // FIXME_OPENMPTARGET + OpenMPTarget_abort(error_message.str().c_str()); + } +}; + +template <class FunctorType, class ReducerType, class PointerType, + class ValueType, class... PolicyArgs> +struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, + ReducerType, PointerType, ValueType, false, + false> { + using PolicyType = Kokkos::RangePolicy<PolicyArgs...>; + template <class TagType> + inline static void execute_impl(const FunctorType& f, const PolicyType& p, + PointerType result_ptr) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto begin = p.begin(); + const auto end = p.end(); + + if (end <= begin) return; + + ValueType result = ValueType(); + if constexpr (std::is_same<TagType, void>::value) { +#pragma omp target teams distribute parallel for num_teams(512) \ + map(to:f) map(tofrom:result) reduction(+: result) + for (auto i = begin; i < end; i++) f(i, result); + } else { +#pragma omp target teams distribute parallel for num_teams(512) \ + map(to:f) map(tofrom:result) reduction(+: result) + for (auto i = begin; i < end; i++) f(TagType(), i, result); + } + + *result_ptr = result; + } + + inline static void execute(const FunctorType& f, const PolicyType& p, + PointerType ptr) { + execute_impl<typename PolicyType::work_tag>(f, p, ptr); + } 
+}; + +template <class FunctorType, class PolicyType, class ReducerType, + class PointerType, class ValueType> +struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType, + PointerType, ValueType, false, true> { +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) + + template <class TagType> + inline static void execute_impl(const FunctorType& f, const PolicyType& p, + PointerType result_ptr) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const typename PolicyType::member_type begin = p.begin(); + const typename PolicyType::member_type end = p.end(); + + if (end <= begin) return; + + ValueType result = ValueType(); + OpenMPTargetReducerWrapper<ReducerType>::init(result); + + if constexpr (std::is_same<TagType, void>::value) { +#pragma omp target teams distribute parallel for num_teams(512) map(to \ + : f) \ + reduction(custom \ + : result) + for (auto i = begin; i < end; i++) f(i, result); + *result_ptr = result; + } else { +#pragma omp target teams distribute parallel for num_teams(512) map(to \ + : f) \ + reduction(custom \ + : result) + for (auto i = begin; i < end; i++) f(TagType(), i, result); + *result_ptr = result; + } + } + + inline static void execute(const FunctorType& f, const PolicyType& p, + PointerType ptr) { + execute_impl<typename PolicyType::work_tag>(f, p, ptr); + } +}; + +template <class FunctorType, class ReducerType, class... 
Traits> +class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::RangePolicy<Traits...>; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag, + void>; + + // Static Assert WorkTag void if ReducerType not InvalidType + + using ValueTraits = + Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + + enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value }; + enum { UseReducer = is_reducer_type<ReducerType>::value }; + + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + + using ParReduceSpecialize = + ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type, + typename ValueTraits::value_type, HasJoin, + UseReducer>; + + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + + public: + inline void execute() const { + ParReduceSpecialize::execute(m_functor, m_policy, m_result_ptr); + } + + template <class ViewType> + inline ParallelReduce( + const FunctorType& arg_functor, Policy arg_policy, + const ViewType& arg_result_view, + typename std::enable_if<Kokkos::is_view<ViewType>::value && + !Kokkos::is_reducer_type<ReducerType>::value, + void*>::type = nullptr) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result_view.data()) {} + + 
inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy, + const ReducerType& reducer) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()) {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... Traits> +class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Experimental::OpenMPTarget> { + protected: + using Policy = Kokkos::RangePolicy<Traits...>; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + using idx_type = typename Policy::index_type; + + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + using value_type = typename ValueTraits::value_type; + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + + template <class TagType> + inline typename std::enable_if<std::is_same<TagType, void>::value>::type + call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val, + const bool& is_final) const { + f(idx, val, is_final); + } + template <class TagType> + inline typename std::enable_if<!std::is_same<TagType, void>::value>::type + call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val, + const bool& is_final) const { + f(WorkTag(), idx, val, is_final); + } + + public: + inline void impl_execute( + Kokkos::View<value_type**, Kokkos::LayoutRight, + 
Kokkos::Experimental::OpenMPTargetSpace> + element_values, + Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace> + chunk_values, + Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count) + const { + const idx_type N = m_policy.end() - m_policy.begin(); + const idx_type chunk_size = 128; + const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + idx_type nteams = n_chunks > 512 ? 512 : n_chunks; + idx_type team_size = 128; + + FunctorType a_functor(m_functor); +#pragma omp target teams distribute map(to \ + : a_functor) num_teams(nteams) \ + thread_limit(team_size) + for (idx_type team_id = 0; team_id < n_chunks; team_id++) { +#pragma omp parallel num_threads(team_size) + { + const idx_type local_offset = team_id * chunk_size; + +#pragma omp for + for (idx_type i = 0; i < chunk_size; i++) { + const idx_type idx = local_offset + i; + value_type val; + ValueInit::init(a_functor, &val); + if (idx < N) call_with_tag<WorkTag>(a_functor, idx, val, false); + element_values(team_id, i) = val; + } +#pragma omp barrier + if (omp_get_thread_num() == 0) { + value_type sum; + ValueInit::init(a_functor, &sum); + for (idx_type i = 0; i < chunk_size; i++) { + ValueJoin::join(a_functor, &sum, &element_values(team_id, i)); + element_values(team_id, i) = sum; + } + chunk_values(team_id) = sum; + } +#pragma omp barrier + if (omp_get_thread_num() == 0) { + if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) { + value_type sum; + ValueInit::init(a_functor, &sum); + for (idx_type i = 0; i < n_chunks; i++) { + ValueJoin::join(a_functor, &sum, &chunk_values(i)); + chunk_values(i) = sum; + } + } + } + } + } + +#pragma omp target teams distribute map(to \ + : a_functor) num_teams(nteams) \ + thread_limit(team_size) + for (idx_type team_id = 0; team_id < n_chunks; team_id++) { +#pragma omp parallel num_threads(team_size) + { + const idx_type local_offset = team_id * chunk_size; + value_type offset_value; + if (team_id > 0) + offset_value = 
chunk_values(team_id - 1); + else + ValueInit::init(a_functor, &offset_value); + +#pragma omp for + for (idx_type i = 0; i < chunk_size; i++) { + const idx_type idx = local_offset + i; + value_type local_offset_value; + if (i > 0) { + local_offset_value = element_values(team_id, i - 1); + ValueJoin::join(a_functor, &local_offset_value, &offset_value); + } else + local_offset_value = offset_value; + if (idx < N) + call_with_tag<WorkTag>(a_functor, idx, local_offset_value, true); + } + } + } + } + + inline void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const idx_type N = m_policy.end() - m_policy.begin(); + const idx_type chunk_size = 128; + const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; + + // This could be scratch memory per team + Kokkos::View<value_type**, Kokkos::LayoutRight, + Kokkos::Experimental::OpenMPTargetSpace> + element_values("element_values", n_chunks, chunk_size); + Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace> + chunk_values("chunk_values", n_chunks); + Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count( + "Count"); + + impl_execute(element_values, chunk_values, count); + } + + //---------------------------------------- + + inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + + //---------------------------------------- +}; + +template <class FunctorType, class ReturnType, class... 
Traits> +class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, + ReturnType, Kokkos::Experimental::OpenMPTarget> + : public ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Experimental::OpenMPTarget> { + using base_t = ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Experimental::OpenMPTarget>; + using value_type = typename base_t::value_type; + value_type& m_returnvalue; + + public: + inline void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); + const int chunk_size = 128; + const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; + + if (N > 0) { + // This could be scratch memory per team + Kokkos::View<value_type**, Kokkos::LayoutRight, + Kokkos::Experimental::OpenMPTargetSpace> + element_values("element_values", n_chunks, chunk_size); + Kokkos::View<value_type*, Kokkos::Experimental::OpenMPTargetSpace> + chunk_values("chunk_values", n_chunks); + Kokkos::View<int64_t, Kokkos::Experimental::OpenMPTargetSpace> count( + "Count"); + + base_t::impl_execute(element_values, chunk_values, count); + + const int size = base_t::ValueTraits::value_size(base_t::m_functor); + DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace>( + &m_returnvalue, chunk_values.data() + (n_chunks - 1), size); + } else { + m_returnvalue = 0; + } + } + + ParallelScanWithTotal(const FunctorType& arg_functor, + const typename base_t::Policy& arg_policy, + ReturnType& arg_returnvalue) + : base_t(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {} +}; +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + 
+template <class FunctorType, class... Properties> +class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = + Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, + Properties...>; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + const int m_shmem_size; + + public: + inline void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + execute_impl<WorkTag>(); + } + + private: + template <class TagType> + inline void execute_impl() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const auto league_size = m_policy.league_size(); + const auto team_size = m_policy.team_size(); + const auto vector_length = m_policy.impl_vector_length(); + + const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); + const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); + OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1); + + void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + FunctorType a_functor(m_functor); + + // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the + // scratch implementation does not work in the Release or RelWithDebugInfo + // mode but works in the Debug mode. + + // Maximum active teams possible. + int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; + // nteams should not exceed the maximum in-flight teams possible. + const auto nteams = + league_size < max_active_teams ? 
league_size : max_active_teams; + +#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL +// Performing our own scheduling of teams to avoid separation of code between +// teams-distribute and parallel. Gave a 2x performance boost in test cases with +// the clang compiler. atomic_compare_exchange can be avoided since the standard +// guarantees that the number of teams specified in the `num_teams` clause is +// always less than or equal to the maximum concurrently running teams. +#pragma omp target teams num_teams(nteams) thread_limit(team_size) \ + map(to \ + : a_functor) is_device_ptr(scratch_ptr) +#pragma omp parallel + { + const int blockIdx = omp_get_team_num(); + const int gridDim = omp_get_num_teams(); + + // Iterate through the number of teams until league_size and assign the + // league_id accordingly + // Guarantee that the compilers respect the `num_teams` clause + if (gridDim <= nteams) { + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename Policy::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_same<TagType, void>::value) + m_functor(team); + else + m_functor(TagType(), team); + } + } else + Kokkos::abort("`num_teams` clause was not respected.\n"); + } + +#else +// Saving the older implementation that uses `atomic_compare_exchange` to +// calculate the shared memory block index and `distribute` clause to distribute +// teams. +#pragma omp target teams distribute map(to \ + : a_functor) \ + is_device_ptr(scratch_ptr, lock_array) num_teams(nteams) \ + thread_limit(team_size) + for (int i = 0; i < league_size; i++) { + int shmem_block_index = -1, lock_team = 99999, iter = -1; + iter = (omp_get_team_num() % max_active_teams); + + // Loop as long as a shmem_block_index is not found. + while (shmem_block_index == -1) { + // Try and acquire a lock on the index. 
+ lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1); + + // If lock is acquired assign it to the block index. + // lock_team = 0, implies atomic_compare_exchange is successfull. + if (lock_team == 0) + shmem_block_index = iter; + else + iter = ++iter % max_active_teams; + } + +#pragma omp parallel num_threads(team_size) + { + typename Policy::member_type team( + i, league_size, team_size, vector_length, scratch_ptr, + shmem_block_index, shmem_size_L0, shmem_size_L1); + m_functor(team); + } + + // Free the locked block and increment the number of available free + // blocks. + lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0); + } +#endif + } + + public: + inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + arg_functor, arg_policy.team_size())) {} +}; + +template <class FunctorType, class ReducerType, class PointerType, + class ValueType, class... 
PolicyArgs> +struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, + ReducerType, PointerType, ValueType, false, + false> { + using PolicyType = TeamPolicyInternal<PolicyArgs...>; + + template <class TagType> + inline static void execute_impl(const FunctorType& f, const PolicyType& p, + PointerType result_ptr) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + + const int league_size = p.league_size(); + const int team_size = p.team_size(); + const int vector_length = p.impl_vector_length(); + + const size_t shmem_size_L0 = p.scratch_size(0, team_size); + const size_t shmem_size_L1 = p.scratch_size(1, team_size); + OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, + shmem_size_L0, shmem_size_L1); + void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + + ValueType result = ValueType(); + + // Maximum active teams possible. + int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; + const auto nteams = + league_size < max_active_teams ? 
league_size : max_active_teams; + +#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL +#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ + : f) \ + is_device_ptr(scratch_ptr) reduction(+: result) +#pragma omp parallel reduction(+ : result) + { + const int blockIdx = omp_get_team_num(); + const int gridDim = omp_get_num_teams(); + + // Guarantee that the compilers respect the `num_teams` clause + if (gridDim <= nteams) { + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_same<TagType, void>::value) + f(team, result); + else + f(TagType(), team, result); + } + } else + Kokkos::abort("`num_teams` clause was not respected.\n"); + } + + *result_ptr = result; +#else +// Saving the older implementation that uses `atomic_compare_exchange` to +// calculate the shared memory block index and `distribute` clause to distribute +// teams. +#pragma omp target teams distribute num_teams(nteams) thread_limit(team_size) \ + map(to:f) map(tofrom:result) reduction(+: result) \ + is_device_ptr(scratch_ptr, lock_array) + for (int i = 0; i < league_size; i++) { + ValueType inner_result = ValueType(); + int shmem_block_index = -1, lock_team = 99999, iter = -1; + iter = (omp_get_team_num() % max_active_teams); + + // Loop as long as a shmem_block_index is not found. + while (shmem_block_index == -1) { + // Try and acquire a lock on the index. + lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1); + + // If lock is acquired assign it to the block index. + // lock_team = 0, implies atomic_compare_exchange is successfull. 
+ if (lock_team == 0) + shmem_block_index = iter; + else + iter = ++iter % max_active_teams; + } +#pragma omp parallel num_threads(team_size) reduction(+ : inner_result) + { + typename PolicyType::member_type team( + i, league_size, team_size, vector_length, scratch_ptr, + shmem_block_index, shmem_size_L0, shmem_size_L1); + f(team, inner_result); + } + result = inner_result; + + // Free the locked block and increment the number of available free + // blocks. + lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0); + } + + *result_ptr = result; +#endif + } + + inline static void execute(const FunctorType& f, const PolicyType& p, + PointerType ptr) { + execute_impl<typename PolicyType::work_tag>(f, p, ptr); + } +}; + +template <class FunctorType, class ReducerType, class PointerType, + class ValueType, class... PolicyArgs> +struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, + ReducerType, PointerType, ValueType, false, + true> { + using PolicyType = TeamPolicyInternal<PolicyArgs...>; + template <class TagType> + inline static void execute_impl(const FunctorType& f, const PolicyType& p, + PointerType result_ptr) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + +#pragma omp declare reduction( \ + custom:ValueType \ + : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv)) + const int league_size = p.league_size(); + const int team_size = p.team_size(); + const int vector_length = p.impl_vector_length(); + const size_t shmem_size_L0 = p.scratch_size(0, team_size); + const size_t shmem_size_L1 = p.scratch_size(1, team_size); + OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1); + void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + + ValueType result = ValueType(); + + // 
Maximum active teams possible. + int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size; + const auto nteams = + league_size < max_active_teams ? league_size : max_active_teams; + +#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ + : f) \ + is_device_ptr(scratch_ptr) reduction(custom \ + : result) +#pragma omp parallel reduction(custom : result) + { + const int blockIdx = omp_get_team_num(); + const int gridDim = omp_get_num_teams(); + + // Guarantee that the compilers respect the `num_teams` clause + if (gridDim <= nteams) { + for (int league_id = blockIdx; league_id < league_size; + league_id += gridDim) { + typename PolicyType::member_type team( + league_id, league_size, team_size, vector_length, scratch_ptr, + blockIdx, shmem_size_L0, shmem_size_L1); + if constexpr (std::is_same<TagType, void>::value) + f(team, result); + else + f(TagType(), team, result); + } + } else + Kokkos::abort("`num_teams` clause was not respected.\n"); + } + + *result_ptr = result; + } + + inline static void execute(const FunctorType& f, const PolicyType& p, + PointerType ptr) { + execute_impl<typename PolicyType::work_tag>(f, p, ptr); + } +}; + +template <class FunctorType, class ReducerType, class... 
          Properties>
class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                     ReducerType, Kokkos::Experimental::OpenMPTarget> {
 private:
  using Policy =
      Kokkos::Impl::TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget,
                                       Properties...>;

  using WorkTag = typename Policy::work_tag;
  using Member  = typename Policy::member_type;

  // When no reducer is given, the functor itself supplies the value traits.
  using ReducerConditional =
      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                         FunctorType, ReducerType>;
  using ReducerTypeFwd = typename ReducerConditional::type;
  using WorkTagFwd =
      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                         void>;

  using ValueTraits =
      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;

  using pointer_type   = typename ValueTraits::pointer_type;
  using reference_type = typename ValueTraits::reference_type;
  using value_type     = typename ValueTraits::value_type;

  // These two flags select which ParallelReduceSpecialize is instantiated.
  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
  enum { UseReducer = is_reducer_type<ReducerType>::value };

  using ParForSpecialize =
      ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
                               typename ValueTraits::value_type, HasJoin,
                               UseReducer>;

  const FunctorType m_functor;
  const Policy m_policy;
  const ReducerType m_reducer;
  const pointer_type m_result_ptr;
  const int m_shmem_size;

 public:
  // Delegates the whole reduction to the selected specialization.
  inline void execute() const {
    ParForSpecialize::execute(m_functor, m_policy, m_result_ptr);
  }

  // Constructor for the result-in-a-View form (no reducer object).
  template <class ViewType>
  inline ParallelReduce(
      const FunctorType& arg_functor, const Policy& arg_policy,
      const ViewType& arg_result,
      typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                  !Kokkos::is_reducer_type<ReducerType>::value,
                              void*>::type = nullptr)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(InvalidType()),
        m_result_ptr(arg_result.data()),
        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                     FunctorTeamShmemSize<FunctorType>::value(
                         arg_functor, arg_policy.team_size())) {}

  // Constructor for the reducer-object form.
  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                        const ReducerType& reducer)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_reducer(reducer),
        m_result_ptr(reducer.view().data()),
        m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                     FunctorTeamShmemSize<FunctorType>::value(
                         arg_functor, arg_policy.team_size())) {}
};

}  // namespace Impl
}  // namespace Kokkos

namespace Kokkos {
namespace Impl {

// Iteration bounds for TeamThreadRange on the OpenMPTarget backend.
template <typename iType>
struct TeamThreadRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
  using index_type = iType;
  const iType start;
  const iType end;
  const OpenMPTargetExecTeamMember& team;

  inline TeamThreadRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, iType count)
      : start(0), end(count), team(thread_) {}
  inline TeamThreadRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, iType begin_, iType end_)
      : start(begin_), end(end_), team(thread_) {}
};

// Iteration bounds for ThreadVectorRange on the OpenMPTarget backend.
template <typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
  using index_type = iType;
  const index_type start;
  const index_type end;
  const OpenMPTargetExecTeamMember& team;

  inline ThreadVectorRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, index_type count)
      : start(0), end(count), team(thread_) {}
  inline ThreadVectorRangeBoundariesStruct(
      const OpenMPTargetExecTeamMember& thread_, index_type begin_,
      index_type end_)
      : start(begin_), end(end_), team(thread_) {}
};

// Iteration bounds for TeamVectorRange on the OpenMPTarget backend.
template <typename iType>
struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
  using index_type = iType;
  const index_type start;
  const index_type end;
  const OpenMPTargetExecTeamMember& team;

  inline
TeamVectorRangeBoundariesStruct( + const OpenMPTargetExecTeamMember& thread_, index_type count) + : start(0), end(count), team(thread_) {} + inline TeamVectorRangeBoundariesStruct( + const OpenMPTargetExecTeamMember& thread_, index_type begin_, + index_type end_) + : start(begin_), end(end_), team(thread_) {} +}; + +} // namespace Impl + +} // namespace Kokkos +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#undef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL +#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3dfad2bb856e0bb65a48dfd70b3458cee4c9beb5 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp @@ -0,0 +1,849 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP +#define KOKKOS_OPENMPTARGET_PARALLEL_MDRANGE_HPP + +#include <omp.h> +#include <Kokkos_Parallel.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +// WORKAROUND OPENMPTARGET: sometimes tile sizes don't make it correctly, +// this was tracked down to a bug in clang with regards of mapping structs +// with arrays of long in it. Arrays of int might be fine though ... +#define KOKKOS_IMPL_MDRANGE_USE_NO_TILES // undef EOF + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... 
Traits> +class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::MDRangePolicy<Traits...>; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + + public: + inline void execute() const { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + FunctorType functor(m_functor); + Policy policy = m_policy; + +#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES + typename Policy::point_type unused; + + execute_tile<Policy::rank>(unused, functor, policy); +#else + const int64_t begin = 0; + const int64_t end = m_policy.m_num_tiles; + +#pragma omp target teams distribute map(to : functor) num_teams(end - begin) + { + for (ptrdiff_t tile_idx = begin; tile_idx < end; tile_idx++) { + +#pragma omp parallel + { + typename Policy::point_type offset; + if (Policy::outer_direction == Policy::Left) { + for (int i = 0; i < Policy::rank; ++i) { + offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + + policy.m_lower[i]; + tile_idx /= policy.m_tile_end[i]; + } + } else { + for (int i = Policy::rank - 1; i >= 0; --i) { + offset[i] = (tile_idx % policy.m_tile_end[i]) * policy.m_tile[i] + + policy.m_lower[i]; + tile_idx /= policy.m_tile_end[i]; + } + } + execute_tile<Policy::rank>(offset, functor, policy); + } + } + } +#endif + } + + template <int Rank> + inline typename std::enable_if<Rank == 1>::type execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy) const { +#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES + (void)offset; + const auto begin_0 = policy.m_lower[0]; + + const auto end_0 = policy.m_upper[0]; + +#pragma omp target teams distribute parallel for map(to : functor) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) { + 
functor(i0); + } +#else + const ptrdiff_t begin_0 = offset[0]; + ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; + end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; +#pragma omp for + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) { + functor(i0); + } +#endif + } + + template <int Rank> + inline typename std::enable_if<Rank == 2>::type execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy) const { +#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES + (void)offset; + const auto begin_0 = policy.m_lower[0]; + const auto begin_1 = policy.m_lower[1]; + + const auto end_0 = policy.m_upper[0]; + const auto end_1 = policy.m_upper[1]; + +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; i0++) { + for (auto i1 = begin_1; i1 < end_1; i1++) { + if constexpr (std::is_same<typename Policy::work_tag, void>::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } + } +#else + const ptrdiff_t begin_0 = offset[0]; + ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; + end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; + + const ptrdiff_t begin_1 = offset[1]; + ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; + end_1 = end_1 < policy.m_upper[1] ? 
end_1 : policy.m_upper[1]; + +#pragma omp for collapse(2) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) + for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) { + if constexpr (std::is_same<typename Policy::work_tag, void>::value) + functor(i0, i1); + else + functor(typename Policy::work_tag(), i0, i1); + } +#endif + } + + template <int Rank> + inline typename std::enable_if<Rank == 3>::type execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy) const { +#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES + (void)offset; + const auto begin_0 = policy.m_lower[0]; + const auto begin_1 = policy.m_lower[1]; + const auto begin_2 = policy.m_lower[2]; + + const auto end_0 = policy.m_upper[0]; + const auto end_1 = policy.m_upper[1]; + const auto end_2 = policy.m_upper[2]; + +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; i0++) { + for (auto i1 = begin_1; i1 < end_1; i1++) { + for (auto i2 = begin_2; i2 < end_2; i2++) { + if constexpr (std::is_same<typename Policy::work_tag, void>::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } + } +#else + const ptrdiff_t begin_0 = offset[0]; + ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; + end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; + + const ptrdiff_t begin_1 = offset[1]; + ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; + end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; + + const ptrdiff_t begin_2 = offset[2]; + ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; + end_2 = end_2 < policy.m_upper[2] ? 
end_2 : policy.m_upper[2]; + +#pragma omp for collapse(3) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) + for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) + for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) { + if constexpr (std::is_same<typename Policy::work_tag, void>::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } +#endif + } + + template <int Rank> + inline typename std::enable_if<Rank == 4>::type execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy) const { +#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES + (void)offset; + const auto begin_0 = policy.m_lower[0]; + const auto begin_1 = policy.m_lower[1]; + const auto begin_2 = policy.m_lower[2]; + const auto begin_3 = policy.m_lower[3]; + + const auto end_0 = policy.m_upper[0]; + const auto end_1 = policy.m_upper[1]; + const auto end_2 = policy.m_upper[2]; + const auto end_3 = policy.m_upper[3]; + +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; i0++) { + for (auto i1 = begin_1; i1 < end_1; i1++) { + for (auto i2 = begin_2; i2 < end_2; i2++) { + for (auto i3 = begin_3; i3 < end_3; i3++) { + if constexpr (std::is_same<typename Policy::work_tag, void>::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } + } + } + } +#else + const ptrdiff_t begin_0 = offset[0]; + ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; + end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; + + const ptrdiff_t begin_1 = offset[1]; + ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; + end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; + + const ptrdiff_t begin_2 = offset[2]; + ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; + end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; + + const ptrdiff_t begin_3 = offset[3]; + ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; + end_3 = end_3 < policy.m_upper[3] ? 
end_3 : policy.m_upper[3]; + +#pragma omp for collapse(4) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) + for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) + for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) + for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) { + if constexpr (std::is_same<typename Policy::work_tag, void>::value) + functor(i0, i1, i2, i3); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3); + } +#endif + } + + template <int Rank> + inline typename std::enable_if<Rank == 5>::type execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy) const { +#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES + (void)offset; + const auto begin_0 = policy.m_lower[0]; + const auto begin_1 = policy.m_lower[1]; + const auto begin_2 = policy.m_lower[2]; + const auto begin_3 = policy.m_lower[3]; + const auto begin_4 = policy.m_lower[4]; + + const auto end_0 = policy.m_upper[0]; + const auto end_1 = policy.m_upper[1]; + const auto end_2 = policy.m_upper[2]; + const auto end_3 = policy.m_upper[3]; + const auto end_4 = policy.m_upper[4]; + +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; i0++) { + for (auto i1 = begin_1; i1 < end_1; i1++) { + for (auto i2 = begin_2; i2 < end_2; i2++) { + for (auto i3 = begin_3; i3 < end_3; i3++) { + for (auto i4 = begin_4; i4 < end_4; i4++) { + if constexpr (std::is_same<typename Policy::work_tag, + void>::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } + } + } + } + } +#else + const ptrdiff_t begin_0 = offset[0]; + ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; + end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; + + const ptrdiff_t begin_1 = offset[1]; + ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; + end_1 = end_1 < policy.m_upper[1] ? 
end_1 : policy.m_upper[1]; + + const ptrdiff_t begin_2 = offset[2]; + ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; + end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; + + const ptrdiff_t begin_3 = offset[3]; + ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; + end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; + + const ptrdiff_t begin_4 = offset[4]; + ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; + end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; + +#pragma omp for collapse(5) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) + for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) + for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) + for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) + for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) { + if constexpr (std::is_same<typename Policy::work_tag, + void>::value) + functor(i0, i1, i2, i3, i4); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + } +#endif + } + + template <int Rank> + inline typename std::enable_if<Rank == 6>::type execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy) const { +#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES + (void)offset; + const auto begin_0 = policy.m_lower[0]; + const auto begin_1 = policy.m_lower[1]; + const auto begin_2 = policy.m_lower[2]; + const auto begin_3 = policy.m_lower[3]; + const auto begin_4 = policy.m_lower[4]; + const auto begin_5 = policy.m_lower[5]; + + const auto end_0 = policy.m_upper[0]; + const auto end_1 = policy.m_upper[1]; + const auto end_2 = policy.m_upper[2]; + const auto end_3 = policy.m_upper[3]; + const auto end_4 = policy.m_upper[4]; + const auto end_5 = policy.m_upper[5]; + +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) + for (auto i0 = begin_0; i0 < end_0; i0++) { + for (auto i1 = begin_1; i1 < end_1; i1++) { + for (auto i2 = begin_2; i2 < end_2; i2++) { + for (auto i3 = begin_3; i3 < end_3; i3++) { + for (auto i4 = begin_4; i4 < end_4; i4++) 
{ + for (auto i5 = begin_5; i5 < end_5; i5++) { + { + if constexpr (std::is_same<typename Policy::work_tag, + void>::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5); + } + } + } + } + } + } + } +#else + const ptrdiff_t begin_0 = offset[0]; + ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; + end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; + + const ptrdiff_t begin_1 = offset[1]; + ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; + end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; + + const ptrdiff_t begin_2 = offset[2]; + ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; + end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; + + const ptrdiff_t begin_3 = offset[3]; + ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; + end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; + + const ptrdiff_t begin_4 = offset[4]; + ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; + end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; + + const ptrdiff_t begin_5 = offset[5]; + ptrdiff_t end_5 = begin_5 + policy.m_tile[5]; + end_5 = end_5 < policy.m_upper[5] ? 
end_5 : policy.m_upper[5]; + +#pragma omp for collapse(6) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) + for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) + for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) + for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) + for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) + for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) { + if constexpr (std::is_same<typename Policy::work_tag, + void>::value) + functor(i0, i1, i2, i3, i4, i5); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5); + } +#endif + } + + template <int Rank> + inline typename std::enable_if<Rank == 7>::type execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy) const { +#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES + (void)offset; + const int begin_0 = policy.m_lower[0]; + const int begin_1 = policy.m_lower[1]; + const int begin_2 = policy.m_lower[2]; + const int begin_3 = policy.m_lower[3]; + const int begin_4 = policy.m_lower[4]; + const int begin_5 = policy.m_lower[5]; + const int begin_6 = policy.m_lower[6]; + + const int end_0 = policy.m_upper[0]; + const int end_1 = policy.m_upper[1]; + const int end_2 = policy.m_upper[2]; + const int end_3 = policy.m_upper[3]; + const int end_4 = policy.m_upper[4]; + const int end_5 = policy.m_upper[5]; + const int end_6 = policy.m_upper[6]; + +#pragma omp target teams distribute parallel for collapse(7) map(to : functor) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) { + for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) { + for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) { + for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) { + for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) { + for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) { + for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) { + if constexpr (std::is_same<typename Policy::work_tag, + void>::value) + functor(i0, i1, i2, i3, i4, i5, i6); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + i6); + } + } + } + } + } 
+ } + } +#else + const ptrdiff_t begin_0 = offset[0]; + ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; + end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; + + const ptrdiff_t begin_1 = offset[1]; + ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; + end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; + + const ptrdiff_t begin_2 = offset[2]; + ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; + end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; + + const ptrdiff_t begin_3 = offset[3]; + ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; + end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; + + const ptrdiff_t begin_4 = offset[4]; + ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; + end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; + + const ptrdiff_t begin_5 = offset[5]; + ptrdiff_t end_5 = begin_5 + policy.m_tile[5]; + end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5]; + + const ptrdiff_t begin_6 = offset[6]; + ptrdiff_t end_6 = begin_6 + policy.m_tile[6]; + end_6 = end_6 < policy.m_upper[6] ? 
end_6 : policy.m_upper[6]; + +#pragma omp for collapse(7) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) + for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) + for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) + for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) + for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) + for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) + for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) { + if constexpr (std::is_same<typename Policy::work_tag, + void>::value) + functor(i0, i1, i2, i3, i4, i5, i6); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, + i6); + } +#endif + } + + template <int Rank> + inline typename std::enable_if<Rank == 8>::type execute_tile( + typename Policy::point_type offset, const FunctorType& functor, + const Policy& policy) const { +#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES + (void)offset; + const int begin_0 = policy.m_lower[0]; + const int begin_1 = policy.m_lower[1]; + const int begin_2 = policy.m_lower[2]; + const int begin_3 = policy.m_lower[3]; + const int begin_4 = policy.m_lower[4]; + const int begin_5 = policy.m_lower[5]; + const int begin_6 = policy.m_lower[6]; + const int begin_7 = policy.m_lower[7]; + + const int end_0 = policy.m_upper[0]; + const int end_1 = policy.m_upper[1]; + const int end_2 = policy.m_upper[2]; + const int end_3 = policy.m_upper[3]; + const int end_4 = policy.m_upper[4]; + const int end_5 = policy.m_upper[5]; + const int end_6 = policy.m_upper[6]; + const int end_7 = policy.m_upper[7]; + +#pragma omp target teams distribute parallel for collapse(8) map(to : functor) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) { + for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) { + for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) { + for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) { + for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) { + for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) { + for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) { + for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) { + if constexpr 
(std::is_same<typename Policy::work_tag, + void>::value) + functor(i0, i1, i2, i3, i4, i5, i6, i7); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5, i6, i7); + } + } + } + } + } + } + } + } +#else + const ptrdiff_t begin_0 = offset[0]; + ptrdiff_t end_0 = begin_0 + policy.m_tile[0]; + end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0]; + + const ptrdiff_t begin_1 = offset[1]; + ptrdiff_t end_1 = begin_1 + policy.m_tile[1]; + end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1]; + + const ptrdiff_t begin_2 = offset[2]; + ptrdiff_t end_2 = begin_2 + policy.m_tile[2]; + end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2]; + + const ptrdiff_t begin_3 = offset[3]; + ptrdiff_t end_3 = begin_3 + policy.m_tile[3]; + end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3]; + + const ptrdiff_t begin_4 = offset[4]; + ptrdiff_t end_4 = begin_4 + policy.m_tile[4]; + end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4]; + + const ptrdiff_t begin_5 = offset[5]; + ptrdiff_t end_5 = begin_5 + policy.m_tile[5]; + end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5]; + + const ptrdiff_t begin_6 = offset[6]; + ptrdiff_t end_6 = begin_6 + policy.m_tile[6]; + end_6 = end_6 < policy.m_upper[6] ? end_6 : policy.m_upper[6]; + + const ptrdiff_t begin_7 = offset[7]; + ptrdiff_t end_7 = begin_7 + policy.m_tile[7]; + end_7 = end_7 < policy.m_upper[7] ? 
end_7 : policy.m_upper[7]; + +#pragma omp for collapse(8) + for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) + for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) + for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) + for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) + for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) + for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) + for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) + for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) { + if constexpr (std::is_same<typename Policy::work_tag, + void>::value) + functor(i0, i1, i2, i3, i4, i5, i6, i7); + else + functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, + i5, i6, i7); + } +#endif + } + + inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + // TODO DZP: based on a conversation with Christian, we're using 256 as a + // heuristic here. We need something better once we can query these kinds of + // properties + template <typename Policy, typename Functor> + static int max_tile_size_product(const Policy&, const Functor&) { + return 256; + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class ReducerType, class PointerType, + class ValueType, class... 
PolicyArgs> +struct ParallelReduceSpecialize<FunctorType, + Kokkos::MDRangePolicy<PolicyArgs...>, + ReducerType, PointerType, ValueType, 0, 0> { + using PolicyType = Kokkos::RangePolicy<PolicyArgs...>; + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + execute_impl(const FunctorType& f, const PolicyType& p, + PointerType result_ptr) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const typename PolicyType::member_type begin = p.begin(); + const typename PolicyType::member_type end = p.end(); + + ValueType result = ValueType(); +#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(+: result) + for (int i = begin; i < end; i++) f(i, result); + + *result_ptr = result; + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + execute_impl(const FunctorType& f, const PolicyType& p, + PointerType result_ptr) { + OpenMPTargetExec::verify_is_process( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + OpenMPTargetExec::verify_initialized( + "Kokkos::Experimental::OpenMPTarget parallel_for"); + const typename PolicyType::member_type begin = p.begin(); + const typename PolicyType::member_type end = p.end(); + + ValueType result = ValueType(); +#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(+: result) + for (int i = begin; i < end; i++) f(TagType(), i, result); + + *result_ptr = result; + } + + inline static void execute(const FunctorType& f, const PolicyType& p, + PointerType ptr) { + execute_impl<typename PolicyType::work_tag>(f, p, ptr); + } +}; +/* +template<class FunctorType, class PolicyType, class ReducerType, class +PointerType, class ValueType> struct ParallelReduceSpecialize<FunctorType, 
+PolicyType, ReducerType, PointerType, ValueType, 0,1> { + + #pragma omp declare reduction(custom: ValueType : ReducerType::join(omp_out, +omp_in)) initializer ( ReducerType::init(omp_priv) ) + + template< class TagType > + inline static + typename std::enable_if< std::is_same< TagType , void >::value >::type + execute_impl(const FunctorType& f, const PolicyType& p, PointerType +result_ptr) + { + OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget +parallel_for"); + OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget +parallel_for"); const typename PolicyType::member_type begin = p.begin(); const +typename PolicyType::member_type end = p.end(); + + ValueType result = ValueType(); + #pragma omp target teams distribute parallel for num_teams(512) map(to:f) +map(tofrom:result) reduction(custom: result) for(int i=begin; i<end; i++) + f(i,result); + + *result_ptr=result; + } + + + template< class TagType > + inline static + typename std::enable_if< ! std::is_same< TagType , void >::value >::type + execute_impl(const FunctorType& f, const PolicyType& p, PointerType +result_ptr) + { + OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget +parallel_for"); + OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget +parallel_for"); const typename PolicyType::member_type begin = p.begin(); const +typename PolicyType::member_type end = p.end(); + + ValueType result = ValueType(); + #pragma omp target teams distribute parallel for num_teams(512) map(to:f) +map(tofrom: result) reduction(custom: result) for(int i=begin; i<end; i++) + f(TagType(),i,result); + + *result_ptr=result; + } + + + inline static + void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) { + execute_impl<typename PolicyType::work_tag>(f,p,ptr); + } +}; + + +template <class FunctorType, class ReducerType, class... 
Traits> +class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType, + Kokkos::Experimental::OpenMPTarget> { + private: + using Policy = Kokkos::MDRangePolicy<Traits...>; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>::type; + + // Static Assert WorkTag void if ReducerType not InvalidType + + using ValueTraits = + Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + + enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value }; + enum { UseReducer = is_reducer_type<ReducerType>::value }; + + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + + using ParForSpecialize = ParallelReduceSpecialize< + FunctorType, Policy, ReducerType, pointer_type, + typename ValueTraits::value_type, HasJoin, UseReducer>; + + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + + public: + inline void execute() const { + ParForSpecialize::execute(m_functor, m_policy, m_result_ptr); + } + + template <class ViewType> + inline ParallelReduce( + const FunctorType& arg_functor, Policy arg_policy, + const ViewType& arg_result_view, + typename std::enable_if<Kokkos::is_view<ViewType>::value && + !Kokkos::is_reducer_type<ReducerType>::value, + void*>::type = NULL) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + 
m_result_ptr(arg_result_view.data()) { + //static_assert( std::is_same< typename ViewType::memory_space + // , Kokkos::HostSpace >::value + // , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a + // Kokkos::View in HostSpace" ); + } + + inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy, + const ReducerType& reducer) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()) { + //static_assert( std::is_same< typename ViewType::memory_space + // , Kokkos::HostSpace >::value + // , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a + // Kokkos::View in HostSpace" ); + } + // TODO DZP: based on a conversation with Christian, we're using 256 as a +heuristic + // here. We need something better once we can query these kinds of properties + template<typename Policy, typename Functor> +static int max_tile_size_product(const Policy&, const Functor&) { + return 256; + } +};*/ + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +#undef KOKKOS_IMPL_MDRANGE_USE_NO_TILES +#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp new file mode 100644 index 0000000000000000000000000000000000000000..be924ffa61c1f8cf696b3b84cb44765536fde4f9 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp @@ -0,0 +1,314 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ENABLE_TASKPOLICY) + +#include <impl/Kokkos_TaskQueue_impl.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template class TaskQueue<Kokkos::Experimental::OpenMPTarget>; + +//---------------------------------------------------------------------------- + +TaskExec<Kokkos::Experimental::OpenMPTarget>::TaskExec() + : m_self_exec(0), + m_team_exec(0), + m_sync_mask(0), + m_sync_value(0), + m_sync_step(0), + m_group_rank(0), + m_team_rank(0), + m_team_size(1) {} + +TaskExec<Kokkos::Experimental::OpenMPTarget>::TaskExec( + Kokkos::Impl::OpenMPTargetExec &arg_exec, int const arg_team_size) + : m_self_exec(&arg_exec), + m_team_exec(arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size)), + m_sync_mask(0), + m_sync_value(0), + m_sync_step(0), + m_group_rank(arg_exec.pool_rank_rev() / arg_team_size), + m_team_rank(arg_exec.pool_rank_rev() % arg_team_size), + m_team_size(arg_team_size) { + // This team spans + // m_self_exec->pool_rev( team_size * group_rank ) + // m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 ) + + int64_t volatile *const sync = (int64_t *)m_self_exec->scratch_reduce(); + + sync[0] = int64_t(0); + sync[1] = int64_t(0); + + for (int i = 0; i < m_team_size; ++i) { + m_sync_value |= int64_t(1) << (8 * i); + m_sync_mask |= int64_t(3) << (8 * i); + } + + Kokkos::memory_fence(); +} + +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + +void TaskExec<Kokkos::Experimental::OpenMPTarget>::team_barrier_impl() const { + if (m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t))) { + Kokkos::abort("TaskQueue<OpenMPTarget> scratch_reduce memory too small"); + } + + // Use 
team shared memory to synchronize. + // Alternate memory locations between barriers to avoid a sequence + // of barriers overtaking one another. + + int64_t volatile *const sync = + ((int64_t *)m_team_exec->scratch_reduce()) + (m_sync_step & 0x01); + + // This team member sets one byte within the sync variable + int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank; + +#if 0 +fprintf( stdout + , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n" + , m_group_rank + , m_team_rank + , m_sync_step + , m_sync_value + , *sync + ); +fflush(stdout); +#endif + + *sync_self = int8_t(m_sync_value & 0x03); // signal arrival + + while (m_sync_value != *sync) + ; // wait for team to arrive + +#if 0 +fprintf( stdout + , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n" + , m_group_rank + , m_team_rank + , m_sync_step + , m_sync_value + , *sync + ); +fflush(stdout); +#endif + + ++m_sync_step; + + if (0 == (0x01 & m_sync_step)) { // Every other step + m_sync_value ^= m_sync_mask; + if (1000 < m_sync_step) m_sync_step = 0; + } +} + +#endif + +//---------------------------------------------------------------------------- + +void TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget>::execute( + TaskQueue<Kokkos::Experimental::OpenMPTarget> *const queue) { + using execution_space = Kokkos::Experimental::OpenMPTarget; + using queue_type = TaskQueue<execution_space>; + using task_root_type = TaskBase<execution_space, void, void>; + using PoolExec = Kokkos::Impl::OpenMPTargetExec; + using Member = TaskExec<execution_space>; + + task_root_type *const end = (task_root_type *)task_root_type::EndTag; + + // Required: team_size <= 8 + + const int team_size = PoolExec::pool_size(2); // Threads per core + // const int team_size = PoolExec::pool_size(1); // Threads per NUMA + + if (8 < team_size) { + Kokkos::abort("TaskQueue<OpenMPTarget> unsupported team size"); + } + +#pragma omp parallel + { + PoolExec &self = *PoolExec::get_thread_omp(); + + 
Member single_exec; + Member team_exec(self, team_size); + + // Team shared memory + task_root_type *volatile *const task_shared = + (task_root_type **)team_exec.m_team_exec->scratch_thread(); + +// Barrier across entire OpenMPTarget thread pool to insure initialization +#pragma omp barrier + + // Loop until all queues are empty and no tasks in flight + + do { + task_root_type *task = 0; + + // Each team lead attempts to acquire either a thread team task + // or a single thread task for the team. + + if (0 == team_exec.team_rank()) { + task = 0 < *((volatile int *)&queue->m_ready_count) ? end : 0; + + // Loop by priority and then type + for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { + for (int j = 0; j < 2 && end == task; ++j) { + task = queue_type::pop_task(&queue->m_ready[i][j]); + } + } + } + + // Team lead broadcast acquired task to team members: + + if (1 < team_exec.team_size()) { + if (0 == team_exec.team_rank()) *task_shared = task; + + // Fence to be sure task_shared is stored before the barrier + Kokkos::memory_fence(); + + // Whole team waits for every team member to reach this statement + team_exec.team_barrier(); + + // Fence to be sure task_shared is stored + Kokkos::memory_fence(); + + task = *task_shared; + } + +#if 0 +fprintf( stdout + , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n" + , team_exec.m_group_rank + , team_exec.m_team_rank + , uintptr_t(task_shared) + , uintptr_t(task) + ); +fflush(stdout); +#endif + + if (0 == task) break; // 0 == m_ready_count + + if (end == task) { + // All team members wait for whole team to reach this statement. + // Is necessary to prevent task_shared from being updated + // before it is read by all threads. 
+ team_exec.team_barrier(); + } else if (task_root_type::TaskTeam == task->m_task_type) { + // Thread Team Task + (*task->m_apply)(task, &team_exec); + + // The m_apply function performs a barrier + + if (0 == team_exec.team_rank()) { + // team member #0 completes the task, which may delete the task + queue->complete(task); + } + } else { + // Single Thread Task + + if (0 == team_exec.team_rank()) { + (*task->m_apply)(task, &single_exec); + + queue->complete(task); + } + + // All team members wait for whole team to reach this statement. + // Not necessary to complete the task. + // Is necessary to prevent task_shared from being updated + // before it is read by all threads. + team_exec.team_barrier(); + } + } while (1); + } + // END #pragma omp parallel +} + +void TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget>:: + iff_single_thread_recursive_execute( + TaskQueue<Kokkos::Experimental::OpenMPTarget> *const queue) { + using execution_space = Kokkos::Experimental::OpenMPTarget; + using queue_type = TaskQueue<execution_space>; + using task_root_type = TaskBase<execution_space, void, void>; + using Member = TaskExec<execution_space>; + + if (1 == omp_get_num_threads()) { + task_root_type *const end = (task_root_type *)task_root_type::EndTag; + + Member single_exec; + + task_root_type *task = end; + + do { + task = end; + + // Loop by priority and then type + for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { + for (int j = 0; j < 2 && end == task; ++j) { + task = queue_type::pop_task(&queue->m_ready[i][j]); + } + } + + if (end == task) break; + + (*task->m_apply)(task, &single_exec); + + queue->complete(task); + + } while (1); + } +} + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( \ + KOKKOS_ENABLE_TASKPOLICY ) */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp 
b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c5959a0ad7f50a71074ef03f2def9c4334510eca --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp @@ -0,0 +1,347 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP +#define KOKKOS_IMPL_OPENMP_TASK_HPP + +#if defined(KOKKOS_ENABLE_TASKPOLICY) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <> +class TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget> { + public: + using execution_space = Kokkos::Experimental::OpenMPTarget; + using queue_type = Kokkos::Impl::TaskQueue<execution_space>; + using task_base_type = Kokkos::Impl::TaskBase<execution_space, void, void>; + + // Must specify memory space + using memory_space = Kokkos::HostSpace; + + static void iff_single_thread_recursive_execute(queue_type* const); + + // Must provide task queue execution function + static void execute(queue_type* const); + + // Must provide mechanism to set function pointer in + // execution space from the host process. 
+ template <typename FunctorType> + static void proc_set_apply(task_base_type::function_type* ptr) { + using TaskType = TaskBase<Kokkos::Experimental::OpenMPTarget, + typename FunctorType::value_type, FunctorType>; + *ptr = TaskType::apply; + } +}; + +extern template class TaskQueue<Kokkos::Experimental::OpenMPTarget>; + +//---------------------------------------------------------------------------- + +template <> +class TaskExec<Kokkos::Experimental::OpenMPTarget> { + private: + TaskExec(TaskExec&&) = delete; + TaskExec(TaskExec const&) = delete; + TaskExec& operator=(TaskExec&&) = delete; + TaskExec& operator=(TaskExec const&) = delete; + + using PoolExec = Kokkos::Impl::OpenMPTargetExec; + + friend class Kokkos::Impl::TaskQueue<Kokkos::Experimental::OpenMPTarget>; + friend class Kokkos::Impl::TaskQueueSpecialization< + Kokkos::Experimental::OpenMPTarget>; + + PoolExec* const m_self_exec; ///< This thread's thread pool data structure + PoolExec* const m_team_exec; ///< Team thread's thread pool data structure + int64_t m_sync_mask; + int64_t mutable m_sync_value; + int mutable m_sync_step; + int m_group_rank; ///< Which "team" subset of thread pool + int m_team_rank; ///< Which thread within a team + int m_team_size; + + TaskExec(); + TaskExec(PoolExec& arg_exec, int arg_team_size); + + void team_barrier_impl() const; + + public: +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + void* team_shared() const { + return m_team_exec ? m_team_exec->scratch_thread() : nullptr; + } + + int team_shared_size() const { + return m_team_exec ? m_team_exec->scratch_thread_size() : 0; + } + + /**\brief Whole team enters this function call + * before any teeam member returns from + * this function call. 
+ */ + void team_barrier() const { + if (1 < m_team_size) team_barrier_impl(); + } +#else + KOKKOS_INLINE_FUNCTION void team_barrier() const {} + KOKKOS_INLINE_FUNCTION void* team_shared() const { return 0; } + KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0; } +#endif + + KOKKOS_INLINE_FUNCTION + int team_rank() const { return m_team_rank; } + + KOKKOS_INLINE_FUNCTION + int team_size() const { return m_team_size; } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template <typename iType> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> > +TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>& thread, + const iType& count) { + return Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >(thread, + count); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> > +TeamThreadRange(Impl::TaskExec<Kokkos::Experimental::OpenMPTarget>& thread, + const iType& start, const iType& end) { + return Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >(thread, start, + end); +} + +/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each + * i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team. 
+ */ +template <typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION void parallel_for( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& + loop_boundaries, + const Lambda& lambda) { + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i); + } +} + +template <typename iType, class Lambda, typename ValueType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& + loop_boundaries, + const Lambda& lambda, ValueType& initialized_result) { + int team_rank = + loop_boundaries.thread.team_rank(); // member num within the team + ValueType result = initialized_result; + + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, result); + } + + if (1 < loop_boundaries.thread.team_size()) { + ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); + + loop_boundaries.thread.team_barrier(); + shared[team_rank] = result; + + loop_boundaries.thread.team_barrier(); + + // reduce across threads to thread 0 + if (team_rank == 0) { + for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { + shared[0] += shared[i]; + } + } + + loop_boundaries.thread.team_barrier(); + + // broadcast result + initialized_result = shared[0]; + } else { + initialized_result = result; + } +} + +template <typename iType, class Lambda, typename ValueType, class JoinType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& + loop_boundaries, + const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { + int team_rank = + loop_boundaries.thread.team_rank(); // member num within the team + ValueType result = initialized_result; + + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i 
+= loop_boundaries.increment) { + lambda(i, result); + } + + if (1 < loop_boundaries.thread.team_size()) { + ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); + + loop_boundaries.thread.team_barrier(); + shared[team_rank] = result; + + loop_boundaries.thread.team_barrier(); + + // reduce across threads to thread 0 + if (team_rank == 0) { + for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { + join(shared[0], shared[i]); + } + } + + loop_boundaries.thread.team_barrier(); + + // broadcast result + initialized_result = shared[0]; + } else { + initialized_result = result; + } +} + +// placeholder for future function +template <typename iType, class Lambda, typename ValueType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& + loop_boundaries, + const Lambda& lambda, ValueType& initialized_result) {} + +// placeholder for future function +template <typename iType, class Lambda, typename ValueType, class JoinType> +KOKKOS_INLINE_FUNCTION void parallel_reduce( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& + loop_boundaries, + const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { +} + +template <typename ValueType, typename iType, class Lambda> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& + loop_boundaries, + const Lambda& lambda) { + ValueType accum = 0; + ValueType val, local_total; + ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); + int team_size = loop_boundaries.thread.team_size(); + int team_rank = + loop_boundaries.thread.team_rank(); // member num within the team + + // Intra-member scan + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + local_total = 0; + 
lambda(i, local_total, false); + val = accum; + lambda(i, val, true); + accum += local_total; + } + + shared[team_rank] = accum; + loop_boundaries.thread.team_barrier(); + + // Member 0 do scan on accumulated totals + if (team_rank == 0) { + for (iType i = 1; i < team_size; i += 1) { + shared[i] += shared[i - 1]; + } + accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan + } + + loop_boundaries.thread.team_barrier(); + + // Inter-member scan adding in accumulated totals + if (team_rank != 0) { + accum = shared[team_rank - 1]; + } + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + local_total = 0; + lambda(i, local_total, false); + val = accum; + lambda(i, val, true); + accum += local_total; + } +} + +// placeholder for future function +template <typename iType, class Lambda, typename ValueType> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::TaskExec<Kokkos::Experimental::OpenMPTarget> >& + loop_boundaries, + const Lambda& lambda) {} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ +#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */ diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fa348611b953aa62704cb760521a275a04729985 --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp @@ -0,0 +1,135 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGET_UNIQUE_TOKEN_HPP +#define KOKKOS_OPENMPTARGET_UNIQUE_TOKEN_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_OPENMPTARGET + +#include <Kokkos_OpenMPTargetSpace.hpp> +#include <Kokkos_UniqueToken.hpp> +#include <impl/Kokkos_SharedAlloc.hpp> +#include <impl/Kokkos_ConcurrentBitset.hpp> + +namespace Kokkos { +namespace Experimental { + +// both global and instance Unique Tokens are implemented in the same way +template <> +class UniqueToken<OpenMPTarget, UniqueTokenScope::Global> { + protected: + uint32_t volatile* m_buffer; + uint32_t m_count; + + public: + using execution_space = OpenMPTarget; + using size_type = int32_t; + + explicit UniqueToken(execution_space const& = execution_space()); + + KOKKOS_DEFAULTED_FUNCTION + UniqueToken(const UniqueToken&) = default; + + KOKKOS_DEFAULTED_FUNCTION + UniqueToken(UniqueToken&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + UniqueToken& operator=(const UniqueToken&) = default; + + KOKKOS_DEFAULTED_FUNCTION + UniqueToken& operator=(UniqueToken&&) = default; + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type size() const noexcept { return m_count; } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type acquire() const { + const Kokkos::pair<int, int> result = + Kokkos::Impl::concurrent_bitset::acquire_bounded( + m_buffer, m_count, Kokkos::Impl::clock_tic() % m_count); + + if (result.first < 0) { + Kokkos::abort( + "UniqueToken<OpenMPTarget> failure to acquire tokens, no tokens " + "available"); + } + + return result.first; + } + + /// \brief release an acquired value + KOKKOS_INLINE_FUNCTION + void release(size_type i) const noexcept { + Kokkos::Impl::concurrent_bitset::release(m_buffer, i); + } +}; + +template <> +class UniqueToken<OpenMPTarget, UniqueTokenScope::Instance> + : public UniqueToken<OpenMPTarget, UniqueTokenScope::Global> { + private: + Kokkos::View<uint32_t*, ::Kokkos::Experimental::OpenMPTargetSpace> + m_buffer_view; + + public: + explicit UniqueToken(execution_space const& arg = execution_space()) + : UniqueToken<OpenMPTarget, UniqueTokenScope::Global>(arg) {} + + UniqueToken(size_type max_size, execution_space const& = execution_space()) + : m_buffer_view( + "Kokkos::UniqueToken::m_buffer_view", + ::Kokkos::Impl::concurrent_bitset::buffer_bound(max_size)) { + m_buffer = m_buffer_view.data(); + m_count = max_size; + } +}; + +} // namespace Experimental +} // namespace Kokkos + +#endif // KOKKOS_ENABLE_OPENMPTARGET +#endif // KOKKOS_OPENMPTARGET_UNIQUE_TOKEN_HPP diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9c29eb190d17b64c0340751a3459785c070d7c47 --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -0,0 +1,309 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Concepts.hpp> +#include <SYCL/Kokkos_SYCL_Instance.hpp> +#include <Kokkos_SYCL.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_Serial.hpp> +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Error.hpp> + +namespace { +template <typename C> +struct Container { + explicit Container(const C& c) : container(c) {} + + friend std::ostream& operator<<(std::ostream& os, const Container& that) { + os << that.container.size(); + for (const auto& v : that.container) { + os << "\n\t" << v; + } + return os; + } + + private: + const C& container; +}; +} // namespace + +namespace Kokkos { + +namespace Impl { +// forward-declaration +int get_gpu(const InitArguments& args); +} // namespace Impl + +namespace Experimental { +SYCL::SYCL() + : m_space_instance(&Impl::SYCLInternal::singleton(), + [](Impl::SYCLInternal*) {}) { + Impl::SYCLInternal::singleton().verify_is_initialized( + "SYCL instance constructor"); +} + +SYCL::SYCL(const sycl::queue& stream) + : m_space_instance(new Impl::SYCLInternal, [](Impl::SYCLInternal* ptr) { + ptr->finalize(); + delete ptr; + }) { + Impl::SYCLInternal::singleton().verify_is_initialized( + "SYCL instance constructor"); + m_space_instance->initialize(stream); +} + +int SYCL::concurrency() { + return Impl::SYCLInternal::singleton().m_maxConcurrency; +} + +const char* SYCL::name() { return "SYCL"; } + +bool SYCL::impl_is_initialized() { + return Impl::SYCLInternal::singleton().is_initialized(); +} + +void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); } + +void SYCL::fence() const { + Impl::SYCLInternal::fence(*m_space_instance->m_queue); +} + +void SYCL::impl_static_fence() { + // guard accessing all_queues + std::lock_guard<std::mutex> lock(Impl::SYCLInternal::mutex); + for (auto& queue : Impl::SYCLInternal::all_queues) + Impl::SYCLInternal::fence(**queue); +} + +int 
SYCL::sycl_device() const { + return impl_internal_space_instance()->m_syclDev; +} + +SYCL::SYCLDevice::SYCLDevice(sycl::device d) : m_device(std::move(d)) {} + +SYCL::SYCLDevice::SYCLDevice(const sycl::device_selector& selector) + : m_device(selector.select_device()) {} + +SYCL::SYCLDevice::SYCLDevice(size_t id) { + std::vector<sycl::device> gpu_devices = + sycl::device::get_devices(sycl::info::device_type::gpu); + if (id >= gpu_devices.size()) { + std::stringstream error_message; + error_message << "Requested GPU with id " << id << " but only " + << gpu_devices.size() << " GPU(s) available!\n"; + Kokkos::Impl::throw_runtime_exception(error_message.str()); + } + m_device = gpu_devices[id]; +} + +sycl::device SYCL::SYCLDevice::get_device() const { return m_device; } + +void SYCL::impl_initialize(SYCL::SYCLDevice d) { + Impl::SYCLInternal::singleton().initialize(d.get_device()); +} + +std::ostream& SYCL::SYCLDevice::info(std::ostream& os) const { + using namespace sycl::info; + return os << "Name: " << m_device.get_info<device::name>() + << "\nDriver Version: " + << m_device.get_info<device::driver_version>() + << "\nIs Host: " << m_device.is_host() + << "\nIs CPU: " << m_device.is_cpu() + << "\nIs GPU: " << m_device.is_gpu() + << "\nIs Accelerator: " << m_device.is_accelerator() + << "\nVendor Id: " << m_device.get_info<device::vendor_id>() + << "\nMax Compute Units: " + << m_device.get_info<device::max_compute_units>() + << "\nMax Work Item Dimensions: " + << m_device.get_info<device::max_work_item_dimensions>() + << "\nMax Work Group Size: " + << m_device.get_info<device::max_work_group_size>() + << "\nPreferred Vector Width Char: " + << m_device.get_info<device::preferred_vector_width_char>() + << "\nPreferred Vector Width Short: " + << m_device.get_info<device::preferred_vector_width_short>() + << "\nPreferred Vector Width Int: " + << m_device.get_info<device::preferred_vector_width_int>() + << "\nPreferred Vector Width Long: " + << 
m_device.get_info<device::preferred_vector_width_long>() + << "\nPreferred Vector Width Float: " + << m_device.get_info<device::preferred_vector_width_float>() + << "\nPreferred Vector Width Double: " + << m_device.get_info<device::preferred_vector_width_double>() + << "\nPreferred Vector Width Half: " + << m_device.get_info<device::preferred_vector_width_half>() + << "\nNative Vector Width Char: " + << m_device.get_info<device::native_vector_width_char>() + << "\nNative Vector Width Short: " + << m_device.get_info<device::native_vector_width_short>() + << "\nNative Vector Width Int: " + << m_device.get_info<device::native_vector_width_int>() + << "\nNative Vector Width Long: " + << m_device.get_info<device::native_vector_width_long>() + << "\nNative Vector Width Float: " + << m_device.get_info<device::native_vector_width_float>() + << "\nNative Vector Width Double: " + << m_device.get_info<device::native_vector_width_double>() + << "\nNative Vector Width Half: " + << m_device.get_info<device::native_vector_width_half>() + << "\nAddress Bits: " << m_device.get_info<device::address_bits>() + << "\nImage Support: " << m_device.get_info<device::image_support>() + << "\nMax Mem Alloc Size: " + << m_device.get_info<device::max_mem_alloc_size>() + << "\nMax Read Image Args: " + << m_device.get_info<device::max_read_image_args>() + << "\nImage2d Max Width: " + << m_device.get_info<device::image2d_max_width>() + << "\nImage2d Max Height: " + << m_device.get_info<device::image2d_max_height>() + << "\nImage3d Max Width: " + << m_device.get_info<device::image3d_max_width>() + << "\nImage3d Max Height: " + << m_device.get_info<device::image3d_max_height>() + << "\nImage3d Max Depth: " + << m_device.get_info<device::image3d_max_depth>() + << "\nImage Max Buffer Size: " + << m_device.get_info<device::image_max_buffer_size>() + << "\nImage Max Array Size: " + << m_device.get_info<device::image_max_array_size>() + << "\nMax Samplers: " << m_device.get_info<device::max_samplers>() 
+ << "\nMax Parameter Size: " + << m_device.get_info<device::max_parameter_size>() + << "\nMem Base Addr Align: " + << m_device.get_info<device::mem_base_addr_align>() + << "\nGlobal Cache Mem Line Size: " + << m_device.get_info<device::global_mem_cache_line_size>() + << "\nGlobal Mem Cache Size: " + << m_device.get_info<device::global_mem_cache_size>() + << "\nGlobal Mem Size: " + << m_device.get_info<device::global_mem_size>() + << "\nMax Constant Buffer Size: " + << m_device.get_info<device::max_constant_buffer_size>() + << "\nMax Constant Args: " + << m_device.get_info<device::max_constant_args>() + << "\nLocal Mem Size: " + << m_device.get_info<device::local_mem_size>() + << "\nError Correction Support: " + << m_device.get_info<device::error_correction_support>() + << "\nHost Unified Memory: " + << m_device.get_info<device::host_unified_memory>() + << "\nProfiling Timer Resolution: " + << m_device.get_info<device::profiling_timer_resolution>() + << "\nIs Endian Little: " + << m_device.get_info<device::is_endian_little>() + << "\nIs Available: " << m_device.get_info<device::is_available>() + << "\nIs Compiler Available: " + << m_device.get_info<device::is_compiler_available>() + << "\nIs Linker Available: " + << m_device.get_info<device::is_linker_available>() + << "\nQueue Profiling: " + << m_device.get_info<device::queue_profiling>() + << "\nBuilt In Kernels: " + << Container<std::vector<std::string>>( + m_device.get_info<device::built_in_kernels>()) + << "\nVendor: " << m_device.get_info<device::vendor>() + << "\nProfile: " << m_device.get_info<device::profile>() + << "\nVersion: " << m_device.get_info<device::version>() + << "\nExtensions: " + << Container<std::vector<std::string>>( + m_device.get_info<device::extensions>()) + << "\nPrintf Buffer Size: " + << m_device.get_info<device::printf_buffer_size>() + << "\nPreferred Interop User Sync: " + << m_device.get_info<device::preferred_interop_user_sync>() + << "\nPartition Max Sub Devices: " + << 
m_device.get_info<device::partition_max_sub_devices>() + << "\nReference Count: " + << m_device.get_info<device::reference_count>() << '\n'; +} + +namespace Impl { + +int g_sycl_space_factory_initialized = + Kokkos::Impl::initialize_space_factory<SYCLSpaceInitializer>("170_SYCL"); + +void SYCLSpaceInitializer::initialize(const InitArguments& args) { + int use_gpu = Kokkos::Impl::get_gpu(args); + + if (std::is_same<Kokkos::Experimental::SYCL, + Kokkos::DefaultExecutionSpace>::value || + 0 < use_gpu) { + if (use_gpu > -1) { + Kokkos::Experimental::SYCL::impl_initialize( + Kokkos::Experimental::SYCL::SYCLDevice(use_gpu)); + } else { + Kokkos::Experimental::SYCL::impl_initialize( + Kokkos::Experimental::SYCL::SYCLDevice(sycl::default_selector())); + } + } +} + +void SYCLSpaceInitializer::finalize(const bool all_spaces) { + if (std::is_same<Kokkos::Experimental::SYCL, + Kokkos::DefaultExecutionSpace>::value || + all_spaces) { + if (Kokkos::Experimental::SYCL::impl_is_initialized()) + Kokkos::Experimental::SYCL::impl_finalize(); + } +} + +void SYCLSpaceInitializer::fence() { + Kokkos::Experimental::SYCL::impl_static_fence(); +} + +void SYCLSpaceInitializer::print_configuration(std::ostream& msg, + const bool /*detail*/) { + msg << "Devices:" << std::endl; + msg << " KOKKOS_ENABLE_SYCL: "; + msg << "yes" << std::endl; + + msg << "\nRuntime Configuration:" << std::endl; + // FIXME_SYCL not implemented + std::abort(); + // Experimental::SYCL::print_configuration(msg, detail); +} + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp new file mode 100644 index 0000000000000000000000000000000000000000..13d6dc1a4a705421a05ce3f86e28f376de0ac41b --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SYCL_ABORT_HPP +#define KOKKOS_SYCL_ABORT_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_SYCL) + +namespace Kokkos { +namespace Impl { + +inline void sycl_abort(char const *msg) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Aborting with message %s.\n", msg); +} + +} // namespace Impl +} // namespace Kokkos + +#endif +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..aef65ee7ecbbf3c39432b42a42b595dbfe00b239 --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp @@ -0,0 +1,231 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SYCLDEEPCOPY_HPP +#define KOKKOS_SYCLDEEPCOPY_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_SYCL.hpp> + +#ifdef KOKKOS_ENABLE_SYCL + +namespace Kokkos { +namespace Impl { + +template <> +struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src, + size_t); +}; + +template <> +struct DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src, + size_t); +}; + +template <> +struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace, + Kokkos::Experimental::SYCL> { + DeepCopy(void* dst, const void* src, size_t); + DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src, + size_t); +}; + +template <class ExecutionSpace> +struct 
DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace, ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL>(dst, src, n); + } + + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence(); + DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src, + n); + Kokkos::Experimental::SYCL().fence(); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace, + ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL>(dst, src, n); + } + + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence(); + DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src, + n); + Kokkos::Experimental::SYCL().fence(); + } +}; + +template <class ExecutionSpace> +struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace, + ExecutionSpace> { + DeepCopy(void* dst, const void* src, size_t n) { + (void)DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace, + Kokkos::Experimental::SYCL>(dst, src, n); + } + + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence(); + DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace, + Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src, + n); + Kokkos::Experimental::SYCL().fence(); + } +}; + +template <> +struct DeepCopy<Experimental::SYCLSharedUSMSpace, + Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL> + : public 
DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL> { + using DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL>::DeepCopy; +}; + +template <> +struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace, + Kokkos::Experimental::SYCL> + : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace, + Kokkos::Experimental::SYCL> { + using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace, + Kokkos::Experimental::SYCL>::DeepCopy; +}; + +template <> +struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCL> + : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL> { + using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL>::DeepCopy; +}; + +template <> +struct DeepCopy<Experimental::SYCLSharedUSMSpace, + Experimental::SYCLDeviceUSMSpace, Kokkos::Experimental::SYCL> + : public DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL> { + using DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL>::DeepCopy; +}; + +template <> +struct DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL> + : public DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL> { + using DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCL>::DeepCopy; +}; + +template <class ExecutionSpace> +struct DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLSharedUSMSpace, ExecutionSpace> + : public DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, ExecutionSpace> { + using DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy; +}; + 
+template <class ExecutionSpace> +struct DeepCopy<Experimental::SYCLSharedUSMSpace, + Experimental::SYCLDeviceUSMSpace, ExecutionSpace> + : public DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, ExecutionSpace> { + using DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy; +}; + +template <class ExecutionSpace> +struct DeepCopy<Experimental::SYCLSharedUSMSpace, + Experimental::SYCLSharedUSMSpace, ExecutionSpace> + : public DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, ExecutionSpace> { + using DeepCopy<Experimental::SYCLDeviceUSMSpace, + Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy; +}; + +template <class ExecutionSpace> +struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace, ExecutionSpace> + : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace, + ExecutionSpace> { + using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace, + ExecutionSpace>::DeepCopy; +}; + +template <class ExecutionSpace> +struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace, ExecutionSpace> + : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace, + ExecutionSpace> { + using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace, + ExecutionSpace>::DeepCopy; +}; + +} // namespace Impl +} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a702b5027277cc7137cba9bba72e7367e9ae97b --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -0,0 +1,291 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Concepts.hpp> +#include <SYCL/Kokkos_SYCL_Instance.hpp> +#include <KokkosCore_Config_DeclareBackend.hpp> +#include <Kokkos_SYCL.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_Serial.hpp> +#include <impl/Kokkos_ConcurrentBitset.hpp> +#include <impl/Kokkos_Error.hpp> + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +std::vector<std::optional<sycl::queue>*> SYCLInternal::all_queues; +std::mutex SYCLInternal::mutex; + +SYCLInternal::~SYCLInternal() { + if (!was_finalized || m_scratchSpace || m_scratchFlags || + m_scratchConcurrentBitset) { + std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " + "Kokkos::Experimental::SYCL::finalize()" + << std::endl; + std::cerr.flush(); + } +} + +int SYCLInternal::verify_is_initialized(const char* const label) const { + if (!is_initialized()) { + std::cerr << "Kokkos::Experimental::SYCL::" << label + << " : ERROR device not initialized" << std::endl; + } + return is_initialized(); +} +SYCLInternal& SYCLInternal::singleton() { + static SYCLInternal self; + return self; +} + +void SYCLInternal::initialize(const sycl::device& d) { + auto exception_handler = [](sycl::exception_list exceptions) { + bool asynchronous_error = false; + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const& e) { + std::cerr << e.what() << '\n'; + asynchronous_error = true; + } + } + if (asynchronous_error) + Kokkos::Impl::throw_runtime_exception( + "There was an asynchronous SYCL error!\n"); + }; + initialize(sycl::queue{d, exception_handler}); +} + +// FIXME_SYCL +void SYCLInternal::initialize(const sycl::queue& q) { + if (was_finalized) + Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n"); + + if (is_initialized()) return; + + if (!HostSpace::execution_space::impl_is_initialized()) { + 
const std::string msg( + "SYCL::initialize ERROR : HostSpace::execution_space is not " + "initialized"); + Kokkos::Impl::throw_runtime_exception(msg); + } + + const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags; + const bool ok_dev = true; + if (ok_init && ok_dev) { + m_queue = q; + // guard pushing to all_queues + { + std::lock_guard<std::mutex> lock(mutex); + all_queues.push_back(&m_queue); + } + const sycl::device& d = m_queue->get_device(); + std::cout << SYCL::SYCLDevice(d) << '\n'; + + m_maxWorkgroupSize = + d.template get_info<sycl::info::device::max_work_group_size>(); + // FIXME_SYCL this should give the correct value for NVIDIA GPUs + m_maxConcurrency = + m_maxWorkgroupSize * 2 * + d.template get_info<sycl::info::device::max_compute_units>(); + + // Setup concurent bitset for obtaining unique tokens from within an + // executing kernel. + { + const int32_t buffer_bound = + Kokkos::Impl::concurrent_bitset::buffer_bound(m_maxConcurrency); + using Record = Kokkos::Impl::SharedAllocationRecord< + Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + Record* const r = + Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), + "Kokkos::SYCL::InternalScratchBitset", + sizeof(uint32_t) * buffer_bound); + Record::increment(r); + m_scratchConcurrentBitset = reinterpret_cast<uint32_t*>(r->data()); + auto event = m_queue->memset(m_scratchConcurrentBitset, 0, + sizeof(uint32_t) * buffer_bound); + fence(event); + } + + m_maxShmemPerBlock = + d.template get_info<sycl::info::device::local_mem_size>(); + m_indirectKernelMem.reset(*m_queue); + m_indirectReducerMem.reset(*m_queue); + } else { + std::ostringstream msg; + msg << "Kokkos::Experimental::SYCL::initialize(...) 
FAILED"; + + if (!ok_init) { + msg << " : Already initialized"; + } + Kokkos::Impl::throw_runtime_exception(msg.str()); + } +} + +void SYCLInternal::finalize() { + SYCL().fence(); + was_finalized = true; + + using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>; + if (nullptr != m_scratchSpace) + RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace)); + if (nullptr != m_scratchFlags) + RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags)); + m_syclDev = -1; + m_scratchSpaceCount = 0; + m_scratchSpace = nullptr; + m_scratchFlagsCount = 0; + m_scratchFlags = nullptr; + + RecordSYCL::decrement(RecordSYCL::get_record(m_scratchConcurrentBitset)); + m_scratchConcurrentBitset = nullptr; + + m_indirectKernelMem.reset(); + m_indirectReducerMem.reset(); + // guard erasing from all_queues + { + std::lock_guard<std::mutex> lock(mutex); + all_queues.erase(std::find(all_queues.begin(), all_queues.end(), &m_queue)); + } + m_queue.reset(); +} + +void* SYCLInternal::scratch_space( + const Kokkos::Experimental::SYCL::size_type size) { + const size_type sizeScratchGrain = + sizeof(Kokkos::Experimental::SYCL::size_type); + if (verify_is_initialized("scratch_space") && + m_scratchSpaceCount * sizeScratchGrain < size) { + m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain; + + using Record = Kokkos::Impl::SharedAllocationRecord< + Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + + if (nullptr != m_scratchSpace) + Record::decrement(Record::get_record(m_scratchSpace)); + + Record* const r = + Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), + "Kokkos::SYCL::InternalScratchSpace", + (sizeScratchGrain * m_scratchSpaceCount)); + + Record::increment(r); + + m_scratchSpace = reinterpret_cast<size_type*>(r->data()); + } + + return m_scratchSpace; +} + +void* SYCLInternal::scratch_flags( + const Kokkos::Experimental::SYCL::size_type size) { + const size_type sizeScratchGrain = + 
sizeof(Kokkos::Experimental::SYCL::size_type); + if (verify_is_initialized("scratch_flags") && + m_scratchFlagsCount * sizeScratchGrain < size) { + m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain; + + using Record = Kokkos::Impl::SharedAllocationRecord< + Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + + if (nullptr != m_scratchFlags) + Record::decrement(Record::get_record(m_scratchFlags)); + + Record* const r = + Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), + "Kokkos::SYCL::InternalScratchFlags", + (sizeScratchGrain * m_scratchFlagsCount)); + + Record::increment(r); + + m_scratchFlags = reinterpret_cast<size_type*>(r->data()); + } + m_queue->memset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain); + fence(*m_queue); + + return m_scratchFlags; +} + +template <sycl::usm::alloc Kind> +size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) { + assert(m_size == 0); + assert(m_q); + + if (m_capacity < n) { + using Record = Kokkos::Impl::SharedAllocationRecord<AllocationSpace, void>; + // First free what we have (in case malloc can reuse it) + if (m_data) Record::decrement(Record::get_record(m_data)); + + Record* const r = Record::allocate(AllocationSpace(*m_q), + "Kokkos::SYCL::USMObjectMem", n); + Record::increment(r); + + m_data = r->data(); + m_capacity = n; + } + + return m_capacity; +} + +template <sycl::usm::alloc Kind> +void SYCLInternal::USMObjectMem<Kind>::reset() { + assert(m_size == 0); + + if (m_data) { + using Record = Kokkos::Impl::SharedAllocationRecord<AllocationSpace, void>; + Record::decrement(Record::get_record(m_data)); + + m_capacity = 0; + m_data = nullptr; + } + m_q.reset(); +} + +template class SYCLInternal::USMObjectMem<sycl::usm::alloc::shared>; +template class SYCLInternal::USMObjectMem<sycl::usm::alloc::device>; + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp 
b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e797411cd40bdd734c04d2a9b0e51151fa269ebd --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -0,0 +1,339 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SYCL_INSTANCE_HPP_ +#define KOKKOS_SYCL_INSTANCE_HPP_ + +#include <optional> +#include <CL/sycl.hpp> + +#include <impl/Kokkos_Error.hpp> + +namespace Kokkos { +namespace Experimental { +namespace Impl { + +class SYCLInternal { + public: + using size_type = int; + + SYCLInternal() = default; + ~SYCLInternal(); + + SYCLInternal(const SYCLInternal&) = delete; + SYCLInternal& operator=(const SYCLInternal&) = delete; + SYCLInternal& operator=(SYCLInternal&&) = delete; + SYCLInternal(SYCLInternal&&) = delete; + + void* scratch_space(const size_type size); + void* scratch_flags(const size_type size); + + int m_syclDev = -1; + + size_t m_maxWorkgroupSize = 0; + uint32_t m_maxConcurrency = 0; + uint64_t m_maxShmemPerBlock = 0; + + uint32_t* m_scratchConcurrentBitset = nullptr; + size_type m_scratchSpaceCount = 0; + size_type* m_scratchSpace = nullptr; + size_type m_scratchFlagsCount = 0; + size_type* m_scratchFlags = nullptr; + + std::optional<sycl::queue> m_queue; + + // Using std::vector<std::optional<sycl::queue>> reveals a compiler bug when + // compiling for the CUDA backend. Storing pointers instead works around this. + static std::vector<std::optional<sycl::queue>*> all_queues; + // We need a mutex for thread safety when modifying all_queues. 
+ static std::mutex mutex; + + // USMObjectMem is a reusable buffer for a single object + // in USM memory + template <sycl::usm::alloc Kind> + class USMObjectMem { + public: + class Deleter { + public: + Deleter() = default; + explicit Deleter(USMObjectMem* mem) : m_mem(mem) {} + + template <typename T> + void operator()(T* p) const noexcept { + assert(m_mem); + assert(sizeof(T) == m_mem->size()); + + if constexpr (sycl::usm::alloc::device == kind) + // Only skipping the dtor on trivially copyable types + static_assert(std::is_trivially_copyable_v<T>); + else + p->~T(); + + m_mem->m_size = 0; + } + + private: + USMObjectMem* m_mem = nullptr; + }; + + static constexpr sycl::usm::alloc kind = Kind; + + void reset(); + + void reset(sycl::queue q) { + reset(); + m_q.emplace(std::move(q)); + } + + USMObjectMem() = default; + explicit USMObjectMem(sycl::queue q) noexcept : m_q(std::move(q)) {} + + USMObjectMem(USMObjectMem const&) = delete; + USMObjectMem(USMObjectMem&&) = delete; + USMObjectMem& operator=(USMObjectMem&&) = delete; + USMObjectMem& operator=(USMObjectMem const&) = delete; + + ~USMObjectMem() { reset(); }; + + void* data() noexcept { return m_data; } + const void* data() const noexcept { return m_data; } + + size_t size() const noexcept { return m_size; } + size_t capacity() const noexcept { return m_capacity; } + + // reserve() allocates space for at least n bytes + // returns the new capacity + size_t reserve(size_t n); + + private: + using AllocationSpace = + std::conditional_t<Kind == sycl::usm::alloc::device, + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>; + + // This will memcpy an object T into memory held by this object + // returns: a T* to that object + // + // Note: it is UB to dereference this pointer with an object that is + // not an implicit-lifetime nor trivially-copyable type, but presumably much + // faster because we can use USM device memory + template <typename T> + std::unique_ptr<T, Deleter> 
memcpy_from(const T& t) { + reserve(sizeof(T)); + sycl::event memcopied = m_q->memcpy(m_data, std::addressof(t), sizeof(T)); + fence(memcopied); + + std::unique_ptr<T, Deleter> ptr(reinterpret_cast<T*>(m_data), + Deleter(this)); + m_size = sizeof(T); + return ptr; + } + + // This will copy-constuct an object T into memory held by this object + // returns: a unique_ptr<T, destruct_delete> that will call the + // destructor on the type when it goes out of scope. + // + // Note: This will not work with USM device memory + template <typename T> + std::unique_ptr<T, Deleter> copy_construct_from(const T& t) { + static_assert(kind != sycl::usm::alloc::device, + "Cannot copy construct into USM device memory"); + + reserve(sizeof(T)); + + std::unique_ptr<T, Deleter> ptr(new (m_data) T(t), Deleter(this)); + m_size = sizeof(T); + return ptr; + } + + public: + // Performs either memcpy (for USM device memory) and returns a T* + // (but is technically UB when dereferenced on an object that is not + // an implicit-lifetime nor trivially-copyable type + // + // or + // + // performs copy construction (for other USM memory types) and returns a + // unique_ptr<T, ...> + template <typename T> + std::unique_ptr<T, Deleter> copy_from(const T& t) { + if constexpr (sycl::usm::alloc::device == kind) + return memcpy_from(t); + else + return copy_construct_from(t); + } + + private: + // Returns a reference to t (helpful when debugging) + template <typename T> + T& memcpy_to(T& t) { + assert(sizeof(T) == m_size); + + sycl::event memcopied = m_q->memcpy(std::addressof(t), m_data, sizeof(T)); + fence(memcopied); + + return t; + } + + // Returns a reference to t (helpful when debugging) + template <typename T> + T& move_assign_to(T& t) { + static_assert(kind != sycl::usm::alloc::device, + "Cannot move_assign_to from USM device memory"); + + assert(sizeof(T) == m_size); + + t = std::move(*static_cast<T*>(m_data)); + + return t; + } + + public: + // Returns a reference to t (helpful when 
debugging) + template <typename T> + T& transfer_to(T& t) { + if constexpr (sycl::usm::alloc::device == kind) + return memcpy_to(t); + else + return move_assign_to(t); + } + + private: + // USMObjectMem class invariants + // All four expressions below must evaluate to true: + // + // !m_data == !m_capacity + // m_q || !m_data + // m_data || !m_size + // m_size <= m_capacity + // + // The above invariants mean that: + // if m_size != 0 then m_data != 0 + // if m_data != 0 then m_capacity != 0 && m_q != nullopt + // if m_data == 0 then m_capacity == 0 + + std::optional<sycl::queue> m_q; + void* m_data = nullptr; + size_t m_size = 0; // sizeof(T) iff m_data points to live T + size_t m_capacity = 0; + }; + + // An indirect kernel is one where the functor to be executed is explicitly + // copied to USM device memory before being executed, to get around the + // trivially copyable limitation of SYCL. + using IndirectKernelMem = USMObjectMem<sycl::usm::alloc::shared>; + IndirectKernelMem m_indirectKernelMem; + + using IndirectReducerMem = USMObjectMem<sycl::usm::alloc::shared>; + IndirectReducerMem m_indirectReducerMem; + + bool was_finalized = false; + + static SYCLInternal& singleton(); + + int verify_is_initialized(const char* const label) const; + + void initialize(const sycl::device& d); + + void initialize(const sycl::queue& q); + + int is_initialized() const { return m_queue.has_value(); } + + void finalize(); + + private: + // fence(...) 
takes any type with a .wait_and_throw() method + // (sycl::event and sycl::queue) + template <typename WAT> + static void fence_helper(WAT& wat) { + try { + wat.wait_and_throw(); + } catch (sycl::exception const& e) { + Kokkos::Impl::throw_runtime_exception( + std::string("There was a synchronous SYCL error:\n") += e.what()); + } + } + + public: + static void fence(sycl::queue& q) { fence_helper(q); } + static void fence(sycl::event& e) { fence_helper(e); } +}; + +template <typename Functor, typename Storage, + bool is_memcpyable = std::is_trivially_copyable_v<Functor>> +class SYCLFunctionWrapper; + +template <typename Functor, typename Storage> +class SYCLFunctionWrapper<Functor, Storage, true> { + const Functor& m_functor; + + public: + SYCLFunctionWrapper(const Functor& functor, Storage&) : m_functor(functor) {} + + const Functor& get_functor() const { return m_functor; } +}; + +template <typename Functor, typename Storage> +class SYCLFunctionWrapper<Functor, Storage, false> { + std::unique_ptr<Functor, + Experimental::Impl::SYCLInternal::IndirectKernelMem::Deleter> + m_kernelFunctorPtr; + + public: + SYCLFunctionWrapper(const Functor& functor, Storage& storage) + : m_kernelFunctorPtr(storage.copy_from(functor)) {} + + std::reference_wrapper<const Functor> get_functor() const { + return {*m_kernelFunctorPtr}; + } +}; + +template <typename Functor, typename Storage> +auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) { + return SYCLFunctionWrapper<Functor, Storage>(functor, storage); +} +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3e90ec1fb50b21e92f4f2ce589f98e2e755967ea --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp @@ -0,0 +1,37 @@ +#ifndef KOKKOS_SYCL_MDRANGEPOLICY_HPP_ 
+#define KOKKOS_SYCL_MDRANGEPOLICY_HPP_ + +#include <KokkosExp_MDRangePolicy.hpp> + +namespace Kokkos { + +template <> +struct default_outer_direction<Kokkos::Experimental::SYCL> { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +template <> +struct default_inner_direction<Kokkos::Experimental::SYCL> { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +namespace Impl { + +// Settings for MDRangePolicy +template <> +inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::SYCL>( + const Kokkos::Experimental::SYCL& space) { + TileSizeProperties properties; + properties.max_threads = + space.impl_internal_space_instance()->m_maxWorkgroupSize; + properties.default_largest_tile_size = 16; + properties.default_tile_size = 2; + properties.max_total_tile_size = properties.max_threads; + return properties; +} + +} // Namespace Impl +} // Namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a286169c45988339dce1b14c6d6a4ffde25dcea5 --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp @@ -0,0 +1,271 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SYCL_PARALLEL_RANGE_HPP_ +#define KOKKOS_SYCL_PARALLEL_RANGE_HPP_ + +#include <impl/KokkosExp_IterateTileGPU.hpp> + +template <class FunctorType, class ExecPolicy> +class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy, + Kokkos::Experimental::SYCL> { + public: + using Policy = ExecPolicy; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + template <typename Functor> + static void sycl_direct_launch(const Policy& policy, const Functor& functor) { + // Convenience references + const Kokkos::Experimental::SYCL& space = policy.space(); + Kokkos::Experimental::Impl::SYCLInternal& instance = + *space.impl_internal_space_instance(); + sycl::queue& q = *instance.m_queue; + + space.fence(); + + q.submit([functor, policy](sycl::handler& cgh) { + sycl::range<1> range(policy.end() - policy.begin()); + const auto begin = policy.begin(); + + cgh.parallel_for(range, [=](sycl::item<1> item) { + const typename Policy::index_type id = item.get_linear_id() + begin; + if constexpr (std::is_same<WorkTag, void>::value) + functor(id); + else + functor(WorkTag(), id); + }); + }); + + space.fence(); + } + + public: + using functor_type = FunctorType; + + void execute() const { + if (m_policy.begin() == m_policy.end()) return; + + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& + indirectKernelMem = m_policy.space() + .impl_internal_space_instance() + ->m_indirectKernelMem; + + const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( + m_functor, indirectKernelMem); + sycl_direct_launch(m_policy, functor_wrapper.get_functor()); + } + + ParallelFor(const ParallelFor&) = delete; + ParallelFor(ParallelFor&&) = delete; + ParallelFor& operator=(const 
ParallelFor&) = delete; + ParallelFor& operator=(ParallelFor&&) = delete; + ~ParallelFor() = default; + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +// ParallelFor +template <class FunctorType, class... Traits> +class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, + Kokkos::Experimental::SYCL> { + public: + using Policy = Kokkos::MDRangePolicy<Traits...>; + + private: + using array_index_type = typename Policy::array_index_type; + using index_type = typename Policy::index_type; + using LaunchBounds = typename Policy::launch_bounds; + using WorkTag = typename Policy::work_tag; + + const FunctorType m_functor; + // MDRangePolicy is not trivially copyable. Hence, replicate the data we + // really need in DeviceIterateTile in a trivially copyable struct. + const struct BarePolicy { + using index_type = typename Policy::index_type; + + BarePolicy(const Policy& policy) + : m_lower(policy.m_lower), + m_upper(policy.m_upper), + m_tile(policy.m_tile), + m_tile_end(policy.m_tile_end), + m_num_tiles(policy.m_num_tiles) {} + + const typename Policy::point_type m_lower; + const typename Policy::point_type m_upper; + const typename Policy::tile_type m_tile; + const typename Policy::point_type m_tile_end; + const typename Policy::index_type m_num_tiles; + static constexpr Iterate inner_direction = Policy::inner_direction; + } m_policy; + const Kokkos::Experimental::SYCL& m_space; + + sycl::nd_range<3> compute_ranges() const { + const auto& m_tile = m_policy.m_tile; + const auto& m_tile_end = m_policy.m_tile_end; + + if constexpr (Policy::rank == 2) { + sycl::range<3> local_sizes(m_tile[0], m_tile[1], 1); + sycl::range<3> global_sizes(m_tile_end[0] * m_tile[0], + m_tile_end[1] * m_tile[1], 1); + return {global_sizes, local_sizes}; + } + if constexpr (Policy::rank == 3) { + sycl::range<3> local_sizes(m_tile[0], m_tile[1], m_tile[2]); + sycl::range<3> 
global_sizes(m_tile_end[0] * m_tile[0], + m_tile_end[1] * m_tile[1], + m_tile_end[2] * m_tile[2]); + return {global_sizes, local_sizes}; + } + if constexpr (Policy::rank == 4) { + // id0,id1 encoded within first index; id2 to second index; id3 to third + // index + sycl::range<3> local_sizes(m_tile[0] * m_tile[1], m_tile[2], m_tile[3]); + sycl::range<3> global_sizes( + m_tile_end[0] * m_tile[0] * m_tile_end[1] * m_tile[1], + m_tile_end[2] * m_tile[2], m_tile_end[3] * m_tile[3]); + return {global_sizes, local_sizes}; + } + if constexpr (Policy::rank == 5) { + // id0,id1 encoded within first index; id2,id3 to second index; id4 to + // third index + sycl::range<3> local_sizes(m_tile[0] * m_tile[1], m_tile[2] * m_tile[3], + m_tile[4]); + sycl::range<3> global_sizes( + m_tile_end[0] * m_tile[0] * m_tile_end[1] * m_tile[1], + m_tile_end[2] * m_tile[2] * m_tile_end[3] * m_tile[3], + m_tile_end[4] * m_tile[4]); + return {global_sizes, local_sizes}; + } + if constexpr (Policy::rank == 6) { + // id0,id1 encoded within first index; id2,id3 to second index; id4,id5 to + // third index + sycl::range<3> local_sizes(m_tile[0] * m_tile[1], m_tile[2] * m_tile[3], + m_tile[4] * m_tile[5]); + sycl::range<3> global_sizes( + m_tile_end[0] * m_tile[0] * m_tile_end[1] * m_tile[1], + m_tile_end[2] * m_tile[2] * m_tile_end[3] * m_tile[3], + m_tile_end[4] * m_tile[4] * m_tile_end[5] * m_tile[5]); + return {global_sizes, local_sizes}; + } + static_assert(Policy::rank > 1 && Policy::rank < 7, + "Kokkos::MDRange Error: Exceeded rank bounds with SYCL\n"); + } + + template <typename Functor> + void sycl_direct_launch(const Functor& functor) const { + // Convenience references + Kokkos::Experimental::Impl::SYCLInternal& instance = + *m_space.impl_internal_space_instance(); + sycl::queue& q = *instance.m_queue; + + m_space.fence(); + + if (m_policy.m_num_tiles == 0) return; + + const BarePolicy bare_policy(m_policy); + + q.submit([functor, this, bare_policy](sycl::handler& cgh) { + const auto 
range = compute_ranges(); + + cgh.parallel_for(range, [functor, bare_policy](sycl::nd_item<3> item) { + const index_type local_x = item.get_local_id(0); + const index_type local_y = item.get_local_id(1); + const index_type local_z = item.get_local_id(2); + const index_type global_x = item.get_group(0); + const index_type global_y = item.get_group(1); + const index_type global_z = item.get_group(2); + const index_type n_global_x = item.get_group_range(0); + const index_type n_global_y = item.get_group_range(1); + const index_type n_global_z = item.get_group_range(2); + + Kokkos::Impl::DeviceIterateTile<Policy::rank, BarePolicy, Functor, + typename Policy::work_tag>( + bare_policy, functor, {n_global_x, n_global_y, n_global_z}, + {global_x, global_y, global_z}, {local_x, local_y, local_z}) + .exec_range(); + }); + }); + + m_space.fence(); + } + + public: + using functor_type = FunctorType; + + template <typename Policy, typename Functor> + static int max_tile_size_product(const Policy& policy, const Functor&) { + return policy.space().impl_internal_space_instance()->m_maxWorkgroupSize; + } + + void execute() const { + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& + indirectKernelMem = + m_space.impl_internal_space_instance()->m_indirectKernelMem; + + const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( + m_functor, indirectKernelMem); + sycl_direct_launch(functor_wrapper.get_functor()); + } + + ParallelFor(const ParallelFor&) = delete; + ParallelFor(ParallelFor&&) = delete; + ParallelFor& operator=(const ParallelFor&) = delete; + ParallelFor& operator=(ParallelFor&&) = delete; + ~ParallelFor() = default; + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_space(arg_policy.space()) {} +}; + +#endif // KOKKOS_SYCL_PARALLEL_RANGE_HPP_ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp 
b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp new file mode 100644 index 0000000000000000000000000000000000000000..03b7753f8e81ef5045b16cedd4206d85174c0033 --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp @@ -0,0 +1,597 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SYCL_PARALLEL_REDUCE_HPP +#define KOKKOS_SYCL_PARALLEL_REDUCE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_SYCL) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class ReducerType, class... 
Traits> +class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType, + Kokkos::Experimental::SYCL> { + public: + using Policy = Kokkos::RangePolicy<Traits...>; + + private: + using Analysis = + FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>; + using execution_space = typename Analysis::execution_space; + using value_type = typename Analysis::value_type; + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + using WorkTag = typename Policy::work_tag; + + public: + // V - View + template <typename V> + ParallelReduce( + const FunctorType& f, const Policy& p, const V& v, + typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr) + : m_functor(f), m_policy(p), m_result_ptr(v.data()) {} + + ParallelReduce(const FunctorType& f, const Policy& p, + const ReducerType& reducer) + : m_functor(f), + m_policy(p), + m_reducer(reducer), + m_result_ptr(reducer.view().data()) {} + + private: + template <typename PolicyType, typename Functor, typename Reducer> + void sycl_direct_launch(const PolicyType& policy, const Functor& functor, + const Reducer& reducer) const { + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + std::conditional_t<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>; + using ValueInit = + Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = + Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + auto selected_reducer = ReducerConditional::select(functor, reducer); + + // Convenience references + const Kokkos::Experimental::SYCL& space = policy.space(); + Kokkos::Experimental::Impl::SYCLInternal& instance = + *space.impl_internal_space_instance(); + sycl::queue& q = 
*instance.m_queue; + + // FIXME_SYCL optimize + constexpr size_t wgroup_size = 128; + constexpr size_t values_per_thread = 2; + std::size_t size = policy.end() - policy.begin(); + const auto init_size = std::max<std::size_t>( + ((size + values_per_thread - 1) / values_per_thread + wgroup_size - 1) / + wgroup_size, + 1); + const unsigned int value_count = + FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count( + selected_reducer); + // FIXME_SYCL only use the first half + const auto results_ptr = static_cast<pointer_type>(instance.scratch_space( + sizeof(value_type) * std::max(value_count, 1u) * init_size * 2)); + // FIXME_SYCL without this we are running into a race condition + const auto results_ptr2 = + results_ptr + std::max(value_count, 1u) * init_size; + + // If size<=1 we only call init(), the functor and possibly final once + // working with the global scratch memory but don't copy back to + // m_result_ptr yet. + if (size <= 1) { + q.submit([&](sycl::handler& cgh) { + const auto begin = policy.begin(); + cgh.single_task([=]() { + const auto& selected_reducer = ReducerConditional::select( + static_cast<const FunctorType&>(functor), + static_cast<const ReducerType&>(reducer)); + reference_type update = + ValueInit::init(selected_reducer, results_ptr); + if (size == 1) { + if constexpr (std::is_same<WorkTag, void>::value) + functor(begin, update); + else + functor(WorkTag(), begin, update); + } + if constexpr (ReduceFunctorHasFinal<FunctorType>::value) + FunctorFinal<FunctorType, WorkTag>::final( + static_cast<const FunctorType&>(functor), results_ptr); + }); + }); + space.fence(); + } + + // Otherwise, we perform a reduction on the values in all workgroups + // separately, write the workgroup results back to global memory and recurse + // until only one workgroup does the reduction and thus gets the final + // value. 
+ bool first_run = true; + while (size > 1) { + auto n_wgroups = ((size + values_per_thread - 1) / values_per_thread + + wgroup_size - 1) / + wgroup_size; + q.submit([&](sycl::handler& cgh) { + sycl::accessor<value_type, 1, sycl::access::mode::read_write, + sycl::access::target::local> + local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u), + cgh); + const auto begin = policy.begin(); + + cgh.parallel_for( + sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), + [=](sycl::nd_item<1> item) { + const auto local_id = item.get_local_linear_id(); + const auto global_id = + wgroup_size * item.get_group_linear_id() * values_per_thread + + local_id; + const auto& selected_reducer = ReducerConditional::select( + static_cast<const FunctorType&>(functor), + static_cast<const ReducerType&>(reducer)); + + // In the first iteration, we call functor to initialize the local + // memory. Otherwise, the local memory is initialized with the + // results from the previous iteration that are stored in global + // memory. Note that we load values_per_thread values per thread + // and immediately combine them to avoid too many threads being + // idle in the actual workgroup reduction. 
+ using index_type = typename Policy::index_type; + const auto upper_bound = std::min<index_type>( + global_id + values_per_thread * wgroup_size, size); + if (first_run) { + reference_type update = ValueInit::init( + selected_reducer, &local_mem[local_id * value_count]); + for (index_type id = global_id; id < upper_bound; + id += wgroup_size) { + if constexpr (std::is_same<WorkTag, void>::value) + functor(id + begin, update); + else + functor(WorkTag(), id + begin, update); + } + } else { + if (global_id >= size) + ValueInit::init(selected_reducer, + &local_mem[local_id * value_count]); + else { + ValueOps::copy(functor, &local_mem[local_id * value_count], + &results_ptr[global_id * value_count]); + for (index_type id = global_id + wgroup_size; + id < upper_bound; id += wgroup_size) { + ValueJoin::join(selected_reducer, + &local_mem[local_id * value_count], + &results_ptr[id * value_count]); + } + } + } + item.barrier(sycl::access::fence_space::local_space); + + // Perform the actual workgroup reduction. To achieve a better + // memory access pattern, we use sequential addressing and a + // reversed loop. If the workgroup size is 8, the first element + // contains all the values with index%4==0, after the second one + // the values with index%2==0 and after the third one index%1==0, + // i.e., all values. + for (unsigned int stride = wgroup_size / 2; stride > 0; + stride >>= 1) { + const auto idx = local_id; + if (idx < stride) { + ValueJoin::join(selected_reducer, + &local_mem[idx * value_count], + &local_mem[(idx + stride) * value_count]); + } + item.barrier(sycl::access::fence_space::local_space); + } + + // Finally, we copy the workgroup results back to global memory to + // be used in the next iteration. If this is the last iteration, + // i.e., there is only one workgroup also call final() if + // necessary. 
+ if (local_id == 0) { + ValueOps::copy( + functor, + &results_ptr2[(item.get_group_linear_id()) * value_count], + &local_mem[0]); + if constexpr (ReduceFunctorHasFinal<FunctorType>::value) + if (n_wgroups <= 1) + FunctorFinal<FunctorType, WorkTag>::final( + static_cast<const FunctorType&>(functor), + &results_ptr2[(item.get_group_linear_id()) * + value_count]); + } + }); + }); + space.fence(); + + // FIXME_SYCL this is likely not necessary, see above + Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>( + space, results_ptr, results_ptr2, + sizeof(*m_result_ptr) * value_count * n_wgroups); + space.fence(); + + first_run = false; + size = n_wgroups; + } + + // At this point, the reduced value is written to the entry in results_ptr + // and all that is left is to copy it back to the given result pointer if + // necessary. + if (m_result_ptr) { + Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>( + space, m_result_ptr, results_ptr, + sizeof(*m_result_ptr) * value_count); + space.fence(); + } + } + + public: + void execute() const { + Kokkos::Experimental::Impl::SYCLInternal& instance = + *m_policy.space().impl_internal_space_instance(); + using IndirectKernelMem = + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + IndirectKernelMem& indirectKernelMem = instance.m_indirectKernelMem; + IndirectKernelMem& indirectReducerMem = instance.m_indirectReducerMem; + + const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( + m_functor, indirectKernelMem); + const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper( + m_reducer, indirectReducerMem); + + sycl_direct_launch(m_policy, functor_wrapper.get_functor(), + reducer_wrapper.get_functor()); + } + + private: + FunctorType m_functor; + Policy m_policy; + ReducerType m_reducer; + pointer_type m_result_ptr; +}; + +template <class FunctorType, class ReducerType, 
class... Traits> +class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType, + Kokkos::Experimental::SYCL> { + public: + using Policy = Kokkos::MDRangePolicy<Traits...>; + + private: + using Analysis = + FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>; + using execution_space = typename Analysis::execution_space; + using value_type = typename Analysis::value_type; + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + + using WorkTag = typename Policy::work_tag; + + // MDRangePolicy is not trivially copyable. Hence, replicate the data we + // really need in DeviceIterateTile in a trivially copyable struct. + struct BarePolicy { + using index_type = typename Policy::index_type; + + BarePolicy(const Policy& policy) + : m_lower(policy.m_lower), + m_upper(policy.m_upper), + m_tile(policy.m_tile), + m_tile_end(policy.m_tile_end), + m_num_tiles(policy.m_num_tiles), + m_prod_tile_dims(policy.m_prod_tile_dims) {} + + const typename Policy::point_type m_lower; + const typename Policy::point_type m_upper; + const typename Policy::tile_type m_tile; + const typename Policy::point_type m_tile_end; + const typename Policy::index_type m_num_tiles; + const typename Policy::index_type m_prod_tile_dims; + static constexpr Iterate inner_direction = Policy::inner_direction; + static constexpr int rank = Policy::rank; + }; + + public: + // V - View + template <typename V> + ParallelReduce( + const FunctorType& f, const Policy& p, const V& v, + typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr) + : m_functor(f), m_policy(p), m_space(p.space()), m_result_ptr(v.data()) {} + + ParallelReduce(const FunctorType& f, const Policy& p, + const ReducerType& reducer) + : m_functor(f), + m_policy(p), + m_space(p.space()), + m_reducer(reducer), + m_result_ptr(reducer.view().data()) {} + + private: + template <typename PolicyType, typename Functor, typename Reducer> + 
void sycl_direct_launch(const PolicyType& policy, const Functor& functor, + const Reducer& reducer) const { + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + std::conditional_t<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>; + using ValueInit = + Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = + Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + // Convenience references + Kokkos::Experimental::Impl::SYCLInternal& instance = + *m_space.impl_internal_space_instance(); + sycl::queue& q = *instance.m_queue; + + const int nwork = m_policy.m_num_tiles; + const int block_size = + std::pow(2, std::ceil(std::log2(m_policy.m_prod_tile_dims))); + + const sycl::range<1> local_range(block_size); + // REMEMBER swap local x<->y to be conforming with Cuda/HIP implementation + const sycl::range<1> global_range(nwork * block_size); + const sycl::nd_range<1> range{global_range, local_range}; + + const size_t wgroup_size = range.get_local_range().size(); + size_t size = range.get_global_range().size(); + const auto init_size = + std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1); + const auto& selected_reducer = ReducerConditional::select(functor, reducer); + const unsigned int value_count = + FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count( + selected_reducer); + // FIXME_SYCL only use the first half + const auto results_ptr = static_cast<pointer_type>(instance.scratch_space( + sizeof(value_type) * std::max(value_count, 1u) * init_size * 2)); + // FIXME_SYCL without this we are running into a race condition + const auto results_ptr2 = + results_ptr + std::max(value_count, 1u) * init_size; + + // If size<=1 we only call init(), the functor and possibly final once + // 
working with the global scratch memory but don't copy back to + // m_result_ptr yet. + if (size <= 1) { + q.submit([&](sycl::handler& cgh) { + cgh.single_task([=]() { + const auto& selected_reducer = ReducerConditional::select( + static_cast<const FunctorType&>(functor), + static_cast<const ReducerType&>(reducer)); + reference_type update = + ValueInit::init(selected_reducer, results_ptr); + if (size == 1) { + Kokkos::Impl::Reduce::DeviceIterateTile< + Policy::rank, BarePolicy, Functor, typename Policy::work_tag, + reference_type>(policy, functor, update, {1, 1, 1}, {0, 0, 0}, + {0, 0, 0}) + .exec_range(); + } + if constexpr (ReduceFunctorHasFinal<FunctorType>::value) + FunctorFinal<FunctorType, WorkTag>::final( + static_cast<const FunctorType&>(functor), results_ptr); + }); + }); + m_space.fence(); + } + + // Otherwise, we perform a reduction on the values in all workgroups + // separately, write the workgroup results back to global memory and recurse + // until only one workgroup does the reduction and thus gets the final + // value. + bool first_run = true; + while (size > 1) { + auto n_wgroups = (size + wgroup_size - 1) / wgroup_size; + q.submit([&](sycl::handler& cgh) { + sycl::accessor<value_type, 1, sycl::access::mode::read_write, + sycl::access::target::local> + local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u), + cgh); + + const BarePolicy bare_policy = m_policy; + + cgh.parallel_for(range, [=](sycl::nd_item<1> item) { + const auto local_id = item.get_local_linear_id(); + const auto global_id = + wgroup_size * item.get_group_linear_id() + local_id; + const auto& selected_reducer = ReducerConditional::select( + static_cast<const FunctorType&>(functor), + static_cast<const ReducerType&>(reducer)); + + // In the first iteration, we call functor to initialize the local + // memory. Otherwise, the local memory is initialized with the + // results from the previous iteration that are stored in global + // memory. 
+ using index_type = typename Policy::index_type; + const auto upper_bound = + std::min<index_type>(global_id + wgroup_size, size); + if (first_run) { + reference_type update = ValueInit::init( + selected_reducer, &local_mem[local_id * value_count]); + + // SWAPPED here to be conforming with CUDA implementation + const index_type local_x = 0; + const index_type local_y = item.get_local_id(0); + const index_type local_z = 0; + const index_type global_x = item.get_group(0); + const index_type global_y = 0; + const index_type global_z = 0; + const index_type n_global_x = item.get_group_range(0); + const index_type n_global_y = 1; + const index_type n_global_z = 1; + + Kokkos::Impl::Reduce::DeviceIterateTile< + Policy::rank, BarePolicy, Functor, typename Policy::work_tag, + reference_type>(bare_policy, functor, update, + {n_global_x, n_global_y, n_global_z}, + {global_x, global_y, global_z}, + {local_x, local_y, local_z}) + .exec_range(); + } else { + if (global_id >= size) + ValueInit::init(selected_reducer, + &local_mem[local_id * value_count]); + else { + ValueOps::copy(functor, &local_mem[local_id * value_count], + &results_ptr[global_id * value_count]); + for (index_type id = global_id + wgroup_size; id < upper_bound; + id += wgroup_size) { + ValueJoin::join(selected_reducer, + &local_mem[local_id * value_count], + &results_ptr[id * value_count]); + } + } + } + item.barrier(sycl::access::fence_space::local_space); + + // Perform the actual workgroup reduction. To achieve a better + // memory access pattern, we use sequential addressing and a + // reversed loop. If the workgroup size is 8, the first element + // contains all the values with index%4==0, after the second one + // the values with index%2==0 and after the third one index%1==0, + // i.e., all values. 
+ for (unsigned int stride = wgroup_size / 2; stride > 0; + stride >>= 1) { + const auto idx = local_id; + if (idx < stride) { + ValueJoin::join(selected_reducer, &local_mem[idx * value_count], + &local_mem[(idx + stride) * value_count]); + } + item.barrier(sycl::access::fence_space::local_space); + } + + // Finally, we copy the workgroup results back to global memory to + // be used in the next iteration. If this is the last iteration, + // i.e., there is only one workgroup also call final() if + // necessary. + if (local_id == 0) { + ValueOps::copy( + functor, + &results_ptr2[(item.get_group_linear_id()) * value_count], + &local_mem[0]); + if constexpr (ReduceFunctorHasFinal<FunctorType>::value) + if (n_wgroups <= 1) + FunctorFinal<FunctorType, WorkTag>::final( + static_cast<const FunctorType&>(functor), + &results_ptr2[(item.get_group_linear_id()) * value_count]); + } + }); + }); + m_space.fence(); + + // FIXME_SYCL this is likely not necessary, see above + Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>( + m_space, results_ptr, results_ptr2, + sizeof(*m_result_ptr) * value_count * n_wgroups); + m_space.fence(); + + first_run = false; + size = n_wgroups; + } + + // At this point, the reduced value is written to the entry in results_ptr + // and all that is left is to copy it back to the given result pointer if + // necessary. 
+ if (m_result_ptr) { + Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>( + m_space, m_result_ptr, results_ptr, + sizeof(*m_result_ptr) * value_count); + m_space.fence(); + } + } + + public: + template <typename Policy, typename Functor> + static int max_tile_size_product(const Policy& policy, const Functor&) { + return policy.space().impl_internal_space_instance()->m_maxThreadsPerSM; + } + + void execute() const { + Kokkos::Experimental::Impl::SYCLInternal& instance = + *m_space.impl_internal_space_instance(); + using IndirectKernelMem = + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + IndirectKernelMem& indirectKernelMem = instance.m_indirectKernelMem; + IndirectKernelMem& indirectReducerMem = instance.m_indirectReducerMem; + + const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( + m_functor, indirectKernelMem); + const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper( + m_reducer, indirectReducerMem); + + sycl_direct_launch(m_policy, functor_wrapper.get_functor(), + reducer_wrapper.get_functor()); + } + + private: + FunctorType m_functor; + BarePolicy m_policy; + const Kokkos::Experimental::SYCL& m_space; + ReducerType m_reducer; + pointer_type m_result_ptr; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif +#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_HPP */ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5eac6bf9da62b29b9d15697bc5061c00db504e0c --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp @@ -0,0 +1,308 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKO_SYCL_PARALLEL_SCAN_HPP +#define KOKKO_SYCL_PARALLEL_SCAN_HPP + +#include <Kokkos_Macros.hpp> +#include <memory> +#if defined(KOKKOS_ENABLE_SYCL) + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... Traits> +class ParallelScanSYCLBase { + public: + using Policy = Kokkos::RangePolicy<Traits...>; + + protected: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using LaunchBounds = typename Policy::launch_bounds; + + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + public: + using pointer_type = typename ValueTraits::pointer_type; + using value_type = typename ValueTraits::value_type; + using reference_type = typename ValueTraits::reference_type; + using functor_type = FunctorType; + using size_type = Kokkos::Experimental::SYCL::size_type; + using index_type = typename Policy::index_type; + + protected: + const FunctorType m_functor; + const Policy m_policy; + pointer_type m_scratch_space = nullptr; + + private: + template <typename Functor> + void scan_internal(sycl::queue& q, const Functor& functor, + pointer_type global_mem, std::size_t size) const { + // FIXME_SYCL optimize + constexpr size_t wgroup_size = 32; + auto n_wgroups = (size + wgroup_size - 1) / wgroup_size; + + // FIXME_SYCL The allocation should be handled by the execution space + auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); }; + std::unique_ptr<value_type[], decltype(deleter)> group_results_memory( + static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * n_wgroups, + q, 
sycl::usm::alloc::shared)), + deleter); + auto group_results = group_results_memory.get(); + + q.submit([&](sycl::handler& cgh) { + sycl::accessor<value_type, 1, sycl::access::mode::read_write, + sycl::access::target::local> + local_mem(sycl::range<1>(wgroup_size), cgh); + + // FIXME_SYCL we get wrong results without this, not sure why + sycl::stream out(1, 1, cgh); + cgh.parallel_for( + sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), + [=](sycl::nd_item<1> item) { + const auto local_id = item.get_local_linear_id(); + const auto global_id = item.get_global_linear_id(); + + // Initialize local memory + if (global_id < size) + ValueOps::copy(functor, &local_mem[local_id], + &global_mem[global_id]); + else + ValueInit::init(functor, &local_mem[local_id]); + item.barrier(sycl::access::fence_space::local_space); + + // Perform workgroup reduction + for (size_t stride = 1; 2 * stride < wgroup_size + 1; stride *= 2) { + auto idx = 2 * stride * (local_id + 1) - 1; + if (idx < wgroup_size) + ValueJoin::join(functor, &local_mem[idx], + &local_mem[idx - stride]); + item.barrier(sycl::access::fence_space::local_space); + } + + if (local_id == 0) { + if (n_wgroups > 1) + ValueOps::copy(functor, + &group_results[item.get_group_linear_id()], + &local_mem[wgroup_size - 1]); + else + ValueInit::init(functor, + &group_results[item.get_group_linear_id()]); + ValueInit::init(functor, &local_mem[wgroup_size - 1]); + } + + // Add results to all items + for (size_t stride = wgroup_size / 2; stride > 0; stride /= 2) { + auto idx = 2 * stride * (local_id + 1) - 1; + if (idx < wgroup_size) { + value_type dummy; + ValueOps::copy(functor, &dummy, &local_mem[idx - stride]); + ValueOps::copy(functor, &local_mem[idx - stride], + &local_mem[idx]); + ValueJoin::join(functor, &local_mem[idx], &dummy); + } + item.barrier(sycl::access::fence_space::local_space); + } + + // Write results to global memory + if (global_id < size) + ValueOps::copy(functor, &global_mem[global_id], + 
&local_mem[local_id]);
+              });
+        });
+
+    if (n_wgroups > 1) scan_internal(q, functor, group_results, n_wgroups);
+    m_policy.space().fence();
+
+    q.submit([&](sycl::handler& cgh) {
+      cgh.parallel_for(sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
+                       [=](sycl::nd_item<1> item) {
+                         const auto global_id = item.get_global_linear_id();
+                         if (global_id < size)
+                           ValueJoin::join(
+                               functor, &global_mem[global_id],
+                               &group_results[item.get_group_linear_id()]);
+                       });
+    });
+    m_policy.space().fence();
+  }
+
+  template <typename Functor>
+  void sycl_direct_launch(const Functor& functor) const {
+    // Convenience references
+    const Kokkos::Experimental::SYCL& space = m_policy.space();
+    Kokkos::Experimental::Impl::SYCLInternal& instance =
+        *space.impl_internal_space_instance();
+    sycl::queue& q = *instance.m_queue;
+
+    const std::size_t len = m_policy.end() - m_policy.begin();
+
+    // Initialize global memory
+    q.submit([&](sycl::handler& cgh) {
+      auto global_mem = m_scratch_space;
+      auto begin = m_policy.begin();
+      cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
+        const typename Policy::index_type id =
+            static_cast<typename Policy::index_type>(item.get_id()) + begin;
+        value_type update{};
+        ValueInit::init(functor, &update);
+        if constexpr (std::is_same<WorkTag, void>::value)
+          functor(id, update, false);
+        else
+          functor(WorkTag(), id, update, false);
+        ValueOps::copy(functor, &global_mem[id], &update);
+      });
+    });
+    space.fence();
+
+    // Perform the actual exclusive scan
+    scan_internal(q, functor, m_scratch_space, len);
+
+    // Write results to global memory
+    q.submit([&](sycl::handler& cgh) {
+      auto global_mem = m_scratch_space;
+      cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
+        auto global_id = item.get_id();
+
+        value_type update = global_mem[global_id];
+        if constexpr (std::is_same<WorkTag, void>::value)
+          functor(global_id, update, true);
+        else
+          functor(WorkTag(), global_id, update, true);
+        ValueOps::copy(functor,
&global_mem[global_id], &update); + }); + }); + space.fence(); + } + + public: + template <typename PostFunctor> + void impl_execute(const PostFunctor& post_functor) { + if (m_policy.begin() == m_policy.end()) return; + + const auto& q = *m_policy.space().impl_internal_space_instance()->m_queue; + const std::size_t len = m_policy.end() - m_policy.begin(); + + // FIXME_SYCL The allocation should be handled by the execution space + // consider only storing one value per block and recreate initial results in + // the end before doing the final pass + auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); }; + std::unique_ptr<value_type[], decltype(deleter)> result_memory( + static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * len, q, + sycl::usm::alloc::shared)), + deleter); + m_scratch_space = result_memory.get(); + + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& + indirectKernelMem = m_policy.space() + .impl_internal_space_instance() + ->m_indirectKernelMem; + + const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( + m_functor, indirectKernelMem); + + sycl_direct_launch(functor_wrapper.get_functor()); + post_functor(); + } + + ParallelScanSYCLBase(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +template <class FunctorType, class... Traits> +class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Experimental::SYCL> + : private ParallelScanSYCLBase<FunctorType, Traits...> { + public: + using Base = ParallelScanSYCLBase<FunctorType, Traits...>; + + inline void execute() { + Base::impl_execute([]() {}); + } + + ParallelScan(const FunctorType& arg_functor, + const typename Base::Policy& arg_policy) + : Base(arg_functor, arg_policy) {} +}; + +//---------------------------------------------------------------------------- + +template <class FunctorType, class ReturnType, class... 
Traits> +class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, + ReturnType, Kokkos::Experimental::SYCL> + : private ParallelScanSYCLBase<FunctorType, Traits...> { + public: + using Base = ParallelScanSYCLBase<FunctorType, Traits...>; + + ReturnType& m_returnvalue; + + inline void execute() { + Base::impl_execute([&]() { + const long long nwork = Base::m_policy.end() - Base::m_policy.begin(); + if (nwork > 0) { + const int size = Base::ValueTraits::value_size(Base::m_functor); + DeepCopy<HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace>( + &m_returnvalue, Base::m_scratch_space + nwork - 1, size); + } + }); + } + + ParallelScanWithTotal(const FunctorType& arg_functor, + const typename Base::Policy& arg_policy, + ReturnType& arg_returnvalue) + : Base(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif + +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp new file mode 100644 index 0000000000000000000000000000000000000000..738620926b5496b9710ce001b77c6fb625325320 --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp @@ -0,0 +1,835 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_SYCL_PARALLEL_TEAM_HPP
#define KOKKOS_SYCL_PARALLEL_TEAM_HPP

#include <Kokkos_Parallel.hpp>

#include <SYCL/Kokkos_SYCL_Team.hpp>

namespace Kokkos {
namespace Impl {
// TeamPolicy specialization for the SYCL execution space. Stores the league
// size, team size, vector length, per-team/per-thread scratch requests, and
// the chunk size, and derives team-size limits from the device's workgroup
// and shared-memory limits.
template <typename... Properties>
class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
    : public PolicyTraits<Properties...> {
 public:
  using execution_policy = TeamPolicyInternal;

  using traits = PolicyTraits<Properties...>;

  // Allow conversion between TeamPolicyInternal instantiations with
  // different property packs (used by the converting constructor below).
  template <typename ExecSpace, typename... OtherProperties>
  friend class TeamPolicyInternal;

 private:
  static int constexpr MAX_WARP = 8;

  typename traits::execution_space m_space;
  int m_league_size;
  int m_team_size;
  int m_vector_length;
  int m_team_scratch_size[2];
  int m_thread_scratch_size[2];
  int m_chunk_size;
  bool m_tune_team_size;
  bool m_tune_vector_length;

 public:
  using execution_space = Kokkos::Experimental::SYCL;

  // Converting copy constructor: copies every field from a policy with a
  // different property pack (fields are accessible via the friend
  // declaration above).
  template <class... OtherProperties>
  TeamPolicyInternal(TeamPolicyInternal<OtherProperties...> const& p) {
    m_league_size            = p.m_league_size;
    m_team_size              = p.m_team_size;
    m_vector_length          = p.m_vector_length;
    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
    m_chunk_size             = p.m_chunk_size;
    m_space                  = p.m_space;
    m_tune_team_size         = p.m_tune_team_size;
    m_tune_vector_length     = p.m_tune_vector_length;
  }

  // Maximum team size for a parallel_for with the given functor.
  template <typename FunctorType>
  int team_size_max(FunctorType const& f, ParallelForTag const&) const {
    return internal_team_size_max_for(f);
  }

  // Maximum team size for a parallel_reduce (accounts for reducer memory).
  template <class FunctorType>
  inline int team_size_max(const FunctorType& f,
                           const ParallelReduceTag&) const {
    return internal_team_size_max_reduce(f);
  }

  template <class FunctorType, class ReducerType>
  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
                           const ParallelReduceTag&) const {
    return internal_team_size_max_reduce(f);
  }

  template <typename FunctorType>
  int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
    return internal_team_size_max_for(f);
  }

  template <typename FunctorType>
  inline int team_size_recommended(FunctorType const& f,
                                   ParallelReduceTag const&) const {
    return internal_team_size_recommended_reduce(f);
  }

  template <class FunctorType, class ReducerType>
  int team_size_recommended(FunctorType const& f, ReducerType const&,
                            ParallelReduceTag const&) const {
    return internal_team_size_recommended_reduce(f);
  }
  // True when the corresponding AUTO request was given at construction.
  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
  inline bool impl_auto_team_size() const { return m_tune_team_size; }
  static int vector_length_max() {
    // FIXME_SYCL provide a reasonable value
    return 1;
  }

  // Clamp the request to vector_length_max() and round down to a power of
  // two (the loop probes powers of two up to 32 and keeps the largest one
  // not exceeding the clamped request).
  static int verify_requested_vector_length(int requested_vector_length) {
    int test_vector_length =
        std::min(requested_vector_length, vector_length_max());

    // Allow only power-of-two vector_length
    if (!(is_integral_power_of_two(test_vector_length))) {
      int test_pow2 = 1;
      for (int i = 0; i < 5; i++) {
        test_pow2 = test_pow2 << 1;
        if (test_pow2 > test_vector_length) {
          break;
        }
      }
      test_vector_length = test_pow2 >> 1;
    }

    return test_vector_length;
  }

  static int scratch_size_max(int level) {
    return level == 0 ? 1024 * 32
                      :  // FIXME_SYCL arbitrarily setting this to 32kB
               20 * 1024 * 1024;  // FIXME_SYCL arbitrarily setting this to 20MB
  }
  inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
  inline void impl_set_team_size(size_t size) { m_team_size = size; }
  int impl_vector_length() const { return m_vector_length; }
  KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }

  int team_size() const { return m_team_size; }

  int league_size() const { return m_league_size; }

  // Total scratch bytes at the given level for a team of team_size_
  // threads; defaults to the policy's own team size when team_size_ < 0.
  int scratch_size(int level, int team_size_ = -1) const {
    if (team_size_ < 0) team_size_ = m_team_size;
    return m_team_scratch_size[level] +
           team_size_ * m_thread_scratch_size[level];
  }

  int team_scratch_size(int level) const { return m_team_scratch_size[level]; }

  int thread_scratch_size(int level) const {
    return m_thread_scratch_size[level];
  }

  typename traits::execution_space space() const { return m_space; }

  TeamPolicyInternal()
      : m_space(typename traits::execution_space()),
        m_league_size(0),
        m_team_size(-1),
        m_vector_length(0),
        m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_chunk_size(0),
        m_tune_team_size(false),
        m_tune_vector_length(false) {}

  /** \brief Specify league size, request team size */
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     int team_size_request, int vector_length_request = 1)
      : m_space(space_),
        m_league_size(league_size_),
        m_team_size(team_size_request),
        m_vector_length(
            (vector_length_request > 0)
                ? verify_requested_vector_length(vector_length_request)
                : (verify_requested_vector_length(1))),
        m_team_scratch_size{0, 0},
        m_thread_scratch_size{0, 0},
        m_chunk_size(0),
        // Non-positive requests mean "auto": record that tuning is allowed.
        m_tune_team_size(bool(team_size_request <= 0)),
        m_tune_vector_length(bool(vector_length_request <= 0)) {
    // FIXME_SYCL check paramters
  }

  /** \brief Specify league size, request team size */
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */,
                     int vector_length_request = 1)
      : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {}
  // FLAG
  /** \brief Specify league size and team size, request vector length*/
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     int team_size_request,
                     const Kokkos::AUTO_t& /* vector_length_request */
                     )
      : TeamPolicyInternal(space_, league_size_, team_size_request, -1)

  {}

  /** \brief Specify league size, request team size and vector length*/
  TeamPolicyInternal(const execution_space space_, int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */,
                     const Kokkos::AUTO_t& /* vector_length_request */

                     )
      : TeamPolicyInternal(space_, league_size_, -1, -1)

  {}

  TeamPolicyInternal(int league_size_, int team_size_request,
                     int vector_length_request = 1)
      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
                           team_size_request, vector_length_request) {}

  TeamPolicyInternal(int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */,
                     int vector_length_request = 1)
      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
                           vector_length_request) {}

  /** \brief Specify league size and team size, request vector length*/
  TeamPolicyInternal(int league_size_, int team_size_request,
                     const Kokkos::AUTO_t& /* vector_length_request */

                     )
      : TeamPolicyInternal(typename traits::execution_space(), league_size_,
                           team_size_request, -1)

  {}

  /** \brief Specify league size, request team size and vector length*/
  TeamPolicyInternal(int league_size_,
                     const Kokkos::AUTO_t& /* team_size_request */,
                     const Kokkos::AUTO_t& /* vector_length_request */

                     )
      : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1,
                           -1) {}

  int chunk_size() const { return m_chunk_size; }

  TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) {
    m_chunk_size = chunk_size_;
    return *this;
  }

  /** \brief set per team scratch size for a specific level of the scratch
   * hierarchy */
  TeamPolicyInternal& set_scratch_size(int level,
                                       PerTeamValue const& per_team) {
    m_team_scratch_size[level] = per_team.value;
    return *this;
  }

  /** \brief set per thread scratch size for a specific level of the scratch
   * hierarchy */
  TeamPolicyInternal& set_scratch_size(int level,
                                       PerThreadValue const& per_thread) {
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }

  /** \brief set per thread and per team scratch size for a specific level of
   * the scratch hierarchy */
  TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team,
                                       PerThreadValue const& per_thread) {
    m_team_scratch_size[level]   = per_team.value;
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }

  using member_type = Kokkos::Impl::SYCLTeamMember;

 protected:
  // Largest team size whose per-team memory (nested-reducer doubles plus
  // level-0 scratch) fits into the device's shared-memory-per-block limit,
  // additionally capped by the maximum workgroup size.
  // NOTE(review): the subtraction is performed in size_t (sizeof promotes);
  // if m_team_scratch_size[0] ever exceeded the available shared memory the
  // difference would wrap to a huge unsigned value — presumably callers
  // guarantee it does not; confirm.
  template <class FunctorType>
  int internal_team_size_max_for(const FunctorType& /*f*/) const {
    // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
    // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
    // total:
    // 2*sizeof(double)+m_team_scratch_size[0]
    // + m_team_size(sizeof(double)+m_thread_scratch_size[0])
    const int max_threads_for_memory =
        (space().impl_internal_space_instance()->m_maxShmemPerBlock -
         2 * sizeof(double) - m_team_scratch_size[0]) /
        (sizeof(double) + m_thread_scratch_size[0]);
    return std::min<int>(
        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
        max_threads_for_memory);
  }

  // Same as above but also budgets sizeof(value_type)*value_count bytes of
  // reduction storage per thread.
  template <class FunctorType>
  int internal_team_size_max_reduce(const FunctorType& f) const {
    using Analysis   = FunctorAnalysis<FunctorPatternInterface::REDUCE,
                                     TeamPolicyInternal, FunctorType>;
    using value_type = typename Analysis::value_type;
    const int value_count = Analysis::value_count(f);

    // nested_reducer_memsize = (sizeof(double) * (m_team_size + 2)
    // reducer_memsize = sizeof(value_type) * m_team_size * value_count
    // custom: m_team_scratch_size[0] + m_thread_scratch_size[0] * m_team_size
    // total:
    // 2*sizeof(double)+m_team_scratch_size[0]
    // + m_team_size(sizeof(double)+sizeof(value_type)*value_count
    // +m_thread_scratch_size[0])
    const int max_threads_for_memory =
        (space().impl_internal_space_instance()->m_maxShmemPerBlock -
         2 * sizeof(double) - m_team_scratch_size[0]) /
        (sizeof(double) + sizeof(value_type) * value_count +
         m_thread_scratch_size[0]);
    return std::min<int>(
        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
        max_threads_for_memory);
  }

  template <class FunctorType>
  int internal_team_size_recommended_for(const FunctorType& f) const {
    // FIXME_SYCL improve
    return internal_team_size_max_for(f);
  }

  template <class FunctorType>
  int internal_team_size_recommended_reduce(const FunctorType& f) const {
    // FIXME_SYCL improve
    return internal_team_size_max_reduce(f);
  }
};

template <typename FunctorType, typename...
Properties>
class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                  Kokkos::Experimental::SYCL> {
 public:
  using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>;
  using functor_type = FunctorType;
  using size_type    = ::Kokkos::Experimental::SYCL::size_type;

 private:
  using member_type   = typename Policy::member_type;
  using work_tag      = typename Policy::work_tag;
  using launch_bounds = typename Policy::launch_bounds;

  FunctorType const m_functor;
  Policy const m_policy;
  size_type const m_league_size;
  int m_team_size;
  size_type const m_vector_size;
  int m_shmem_begin;
  int m_shmem_size;
  void* m_scratch_ptr[2];
  int m_scratch_size[2];

  // Submit the team parallel_for as a 2D nd_range kernel
  // (league*team x vector, workgroup = team x vector) and fence the space.
  template <typename Functor>
  void sycl_direct_launch(const Policy& policy, const Functor& functor) const {
    // Convenience references
    const Kokkos::Experimental::SYCL& space = policy.space();
    Kokkos::Experimental::Impl::SYCLInternal& instance =
        *space.impl_internal_space_instance();
    sycl::queue& q = *instance.m_queue;

    q.submit([&](sycl::handler& cgh) {
      // FIXME_SYCL accessors seem to need a size greater than zero at least for
      // host queues
      sycl::accessor<char, 1, sycl::access::mode::read_write,
                     sycl::access::target::local>
          team_scratch_memory_L0(
              sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
              cgh);

      // Avoid capturing *this since it might not be trivially copyable
      const auto shmem_begin     = m_shmem_begin;
      const int scratch_size[2]  = {m_scratch_size[0], m_scratch_size[1]};
      void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};

      cgh.parallel_for(
          sycl::nd_range<2>(
              sycl::range<2>(m_league_size * m_team_size, m_vector_size),
              sycl::range<2>(m_team_size, m_vector_size)),
          [=](sycl::nd_item<2> item) {
            // Each workgroup (team) gets its own slice of the level-1
            // scratch allocation, offset by its group index.
            const member_type team_member(
                team_scratch_memory_L0.get_pointer(), shmem_begin,
                scratch_size[0],
                static_cast<char*>(scratch_ptr[1]) +
                    item.get_group(0) * scratch_size[1],
                scratch_size[1], item);
            if constexpr (std::is_same<work_tag, void>::value)
              functor(team_member);
            else
              functor(work_tag(), team_member);
          });
    });
    space.fence();
  }

 public:
  inline void execute() const {
    if (m_league_size == 0) return;

    // Wrap the functor in USM-accessible memory so the device can invoke it.
    Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
        indirectKernelMem = m_policy.space()
                                .impl_internal_space_instance()
                                ->m_indirectKernelMem;

    const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
        m_functor, indirectKernelMem);

    sycl_direct_launch(m_policy, functor_wrapper.get_functor());
  }

  ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_league_size(arg_policy.league_size()),
        m_team_size(arg_policy.team_size()),
        m_vector_size(arg_policy.impl_vector_length()) {
    // FIXME_SYCL optimize
    if (m_team_size < 0) m_team_size = 32;

    // Reserve the nested-reducer region at the front of level-0 scratch.
    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
    m_shmem_size =
        (m_policy.scratch_size(0, m_team_size) +
         FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
    m_scratch_size[0] = m_shmem_size;
    m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);

    // FIXME_SYCL so far accessors used instead of these pointers
    // Functor's reduce memory, team scan memory, and team shared memory depend
    // upon team size.
    const auto& space    = *m_policy.space().impl_internal_space_instance();
    const sycl::queue& q = *space.m_queue;
    m_scratch_ptr[0]     = nullptr;
    // Level-1 scratch lives in device USM, one slice per team; freed in the
    // destructor below.
    m_scratch_ptr[1]     = sycl::malloc_device(
        sizeof(char) * m_scratch_size[1] * m_league_size, q);

    if (static_cast<int>(space.m_maxShmemPerBlock) <
        m_shmem_size - m_shmem_begin) {
      std::stringstream out;
      out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
             "Requested "
          << m_shmem_size - m_shmem_begin << " bytes but maximum is "
          << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
          << '\n';
      Kokkos::Impl::throw_runtime_exception(out.str());
    }

    if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{}))
      Kokkos::Impl::throw_runtime_exception(
          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
  }

  // FIXME_SYCL remove when managing m_scratch_ptr[1] in the execution space
  // instance
  ParallelFor(const ParallelFor&) = delete;
  ParallelFor& operator=(const ParallelFor&) = delete;

  // Release the device-USM level-1 scratch allocated in the constructor.
  ~ParallelFor() {
    const Kokkos::Experimental::SYCL& space = m_policy.space();
    Kokkos::Experimental::Impl::SYCLInternal& instance =
        *space.impl_internal_space_instance();
    sycl::queue& q = *instance.m_queue;
    sycl::free(m_scratch_ptr[1], q);
  }
};

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

template <class FunctorType, class ReducerType, class...
Properties> +class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>, + ReducerType, Kokkos::Experimental::SYCL> { + public: + using Policy = TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>; + + private: + using Analysis = + FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>; + using member_type = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + using pointer_type = typename Analysis::pointer_type; + using reference_type = typename Analysis::reference_type; + using value_type = typename Analysis::value_type; + + public: + using functor_type = FunctorType; + using size_type = Kokkos::Experimental::SYCL::size_type; + + private: + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + // FIXME_SYCL avoid reallocating memory for reductions + /* size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type m_team_begin;*/ + size_type m_shmem_begin; + size_type m_shmem_size; + void* m_scratch_ptr[2]; + int m_scratch_size[2]; + const size_type m_league_size; + int m_team_size; + const size_type m_vector_size; + + template <typename PolicyType, typename Functor, typename Reducer> + void sycl_direct_launch(const PolicyType& policy, const Functor& functor, + const Reducer& reducer) const { + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + std::conditional_t<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>; + using ValueInit = + Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueJoin = + Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>; + using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>; + + auto selected_reducer = ReducerConditional::select(functor, 
reducer); + + // Convenience references + const Kokkos::Experimental::SYCL& space = policy.space(); + Kokkos::Experimental::Impl::SYCLInternal& instance = + *space.impl_internal_space_instance(); + sycl::queue& q = *instance.m_queue; + + // FIXME_SYCL optimize + const size_t wgroup_size = m_team_size; + std::size_t size = m_league_size * m_team_size; + const auto init_size = + std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1); + const unsigned int value_count = + FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count( + selected_reducer); + // FIXME_SYCL only use the first half + const auto results_ptr = static_cast<pointer_type>(instance.scratch_space( + sizeof(value_type) * std::max(value_count, 1u) * init_size * 2)); + // FIXME_SYCL without this we are running into a race condition + const auto results_ptr2 = + results_ptr + std::max(value_count, 1u) * init_size; + + // If size<=1 we only call init(), the functor and possibly final once + // working with the global scratch memory but don't copy back to + // m_result_ptr yet. 
+ if (size <= 1) { + q.submit([&](sycl::handler& cgh) { + // FIXME_SYCL accessors seem to need a size greater than zero at least + // for host queues + sycl::accessor<char, 1, sycl::access::mode::read_write, + sycl::access::target::local> + team_scratch_memory_L0( + sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)), + cgh); + + // Avoid capturing *this since it might not be trivially copyable + const auto shmem_begin = m_shmem_begin; + const int scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; + void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]}; + + cgh.parallel_for( + sycl::nd_range<2>(sycl::range<2>(1, 1), sycl::range<2>(1, 1)), + [=](sycl::nd_item<2> item) { + const auto& selected_reducer = ReducerConditional::select( + static_cast<const FunctorType&>(functor), + static_cast<const ReducerType&>(reducer)); + reference_type update = + ValueInit::init(selected_reducer, results_ptr); + if (size == 1) { + const member_type team_member( + team_scratch_memory_L0.get_pointer(), shmem_begin, + scratch_size[0], static_cast<char*>(scratch_ptr[1]), + scratch_size[1], item); + if constexpr (std::is_same<WorkTag, void>::value) + functor(team_member, update); + else + functor(WorkTag(), team_member, update); + } + if constexpr (ReduceFunctorHasFinal<FunctorType>::value) + FunctorFinal<FunctorType, WorkTag>::final( + static_cast<const FunctorType&>(functor), results_ptr); + }); + }); + space.fence(); + } + + // Otherwise, we perform a reduction on the values in all workgroups + // separately, write the workgroup results back to global memory and recurse + // until only one workgroup does the reduction and thus gets the final + // value. 
+ bool first_run = true; + while (size > 1) { + auto n_wgroups = (size + wgroup_size - 1) / wgroup_size; + q.submit([&](sycl::handler& cgh) { + sycl::accessor<value_type, 1, sycl::access::mode::read_write, + sycl::access::target::local> + local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u), + cgh); + // FIXME_SYCL accessors seem to need a size greater than zero at least + // for host queues + sycl::accessor<char, 1, sycl::access::mode::read_write, + sycl::access::target::local> + team_scratch_memory_L0( + sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)), + cgh); + + // Avoid capturing *this since it might not be trivially copyable + const auto shmem_begin = m_shmem_begin; + const int scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]}; + void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]}; + + cgh.parallel_for( + sycl::nd_range<2>( + sycl::range<2>(m_league_size * m_team_size, m_vector_size), + sycl::range<2>(m_team_size, m_vector_size)), + [=](sycl::nd_item<2> item) { + const auto local_id = item.get_local_linear_id(); + const auto global_id = + wgroup_size * item.get_group_linear_id() + local_id; + const auto& selected_reducer = ReducerConditional::select( + static_cast<const FunctorType&>(functor), + static_cast<const ReducerType&>(reducer)); + + // In the first iteration, we call functor to initialize the local + // memory. Otherwise, the local memory is initialized with the + // results from the previous iteration that are stored in global + // memory. Note that we load values_per_thread values per thread + // and immediately combine them to avoid too many threads being + // idle in the actual workgroup reduction. 
+ if (first_run) { + reference_type update = ValueInit::init( + selected_reducer, &local_mem[local_id * value_count]); + const member_type team_member( + team_scratch_memory_L0.get_pointer(), shmem_begin, + scratch_size[0], + static_cast<char*>(scratch_ptr[1]) + + item.get_group(0) * scratch_size[1], + scratch_size[1], item); + if constexpr (std::is_same<WorkTag, void>::value) + functor(team_member, update); + else + functor(WorkTag(), team_member, update); + } else { + if (global_id >= size) + ValueInit::init(selected_reducer, + &local_mem[local_id * value_count]); + else { + ValueOps::copy(functor, &local_mem[local_id * value_count], + &results_ptr[global_id * value_count]); + } + } + item.barrier(sycl::access::fence_space::local_space); + + // Perform the actual workgroup reduction. To achieve a better + // memory access pattern, we use sequential addressing and a + // reversed loop. If the workgroup size is 8, the first element + // contains all the values with index%4==0, after the second one + // the values with index%2==0 and after the third one index%1==0, + // i.e., all values. + for (unsigned int stride = wgroup_size / 2; stride > 0; + stride >>= 1) { + const auto idx = local_id; + if (idx < stride) { + ValueJoin::join(selected_reducer, + &local_mem[idx * value_count], + &local_mem[(idx + stride) * value_count]); + } + item.barrier(sycl::access::fence_space::local_space); + } + + // Finally, we copy the workgroup results back to global memory to + // be used in the next iteration. If this is the last iteration, + // i.e., there is only one workgroup also call final() if + // necessary. 
+ if (local_id == 0) { + ValueOps::copy( + functor, + &results_ptr2[(item.get_group_linear_id()) * value_count], + &local_mem[0]); + if constexpr (ReduceFunctorHasFinal<FunctorType>::value) + if (n_wgroups <= 1 && item.get_group_linear_id() == 0) { + FunctorFinal<FunctorType, WorkTag>::final( + static_cast<const FunctorType&>(functor), + &results_ptr2[(item.get_group_linear_id()) * + value_count]); + } + } + }); + }); + space.fence(); + + // FIXME_SYCL this is likely not necessary, see above + Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>( + space, results_ptr, results_ptr2, + sizeof(*m_result_ptr) * value_count * n_wgroups); + space.fence(); + + first_run = false; + size = n_wgroups; + } + + // At this point, the reduced value is written to the entry in results_ptr + // and all that is left is to copy it back to the given result pointer if + // necessary. + if (m_result_ptr) { + Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>( + space, m_result_ptr, results_ptr, + sizeof(*m_result_ptr) * value_count); + space.fence(); + } + } + + public: + inline void execute() { + Kokkos::Experimental::Impl::SYCLInternal& instance = + *m_policy.space().impl_internal_space_instance(); + using IndirectKernelMem = + Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + IndirectKernelMem& indirectKernelMem = instance.m_indirectKernelMem; + IndirectKernelMem& indirectReducerMem = instance.m_indirectReducerMem; + + const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( + m_functor, indirectKernelMem); + const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper( + m_reducer, indirectReducerMem); + + sycl_direct_launch(m_policy, functor_wrapper.get_functor(), + reducer_wrapper.get_functor()); + } + + private: + void initialize() { + // FIXME_SYCL optimize + if (m_team_size < 0) m_team_size = 32; + // Must be a power 
of two greater than two, get the one not bigger than the + // requested one. + if ((m_team_size & m_team_size - 1) || m_team_size < 2) { + int temp_team_size = 2; + while ((temp_team_size << 1) < m_team_size) temp_team_size <<= 1; + m_team_size = temp_team_size; + } + + m_shmem_begin = (sizeof(double) * (m_team_size + 2)); + m_shmem_size = + (m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size)); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + + // FIXME_SYCL so far accessors used instead of these pointers + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + const auto& space = *m_policy.space().impl_internal_space_instance(); + const sycl::queue& q = *space.m_queue; + m_scratch_ptr[0] = nullptr; + m_scratch_ptr[1] = sycl::malloc_device( + sizeof(char) * m_scratch_size[1] * m_league_size, q); + + if (static_cast<int>(space.m_maxShmemPerBlock) < + m_shmem_size - m_shmem_begin) { + std::stringstream out; + out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! 
" + "Requested " + << m_shmem_size - m_shmem_begin << " bytes but maximum is " + << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock + << '\n'; + Kokkos::Impl::throw_runtime_exception(out.str()); + } + + if (m_team_size > m_policy.team_size_max(m_functor, ParallelForTag{})) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor<SYCL> requested too large team size."); + } + + public: + template <class ViewType> + ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy, + ViewType const& arg_result, + typename std::enable_if<Kokkos::is_view<ViewType>::value, + void*>::type = nullptr) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result.data()), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + initialize(); + } + + ParallelReduce(FunctorType const& arg_functor, Policy const& arg_policy, + ReducerType const& reducer) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + initialize(); + } +}; +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp new file mode 100644 index 0000000000000000000000000000000000000000..75741438e295c543db2737e6943ea52e244d69db --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -0,0 +1,347 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_SYCL.hpp>
+#include <Kokkos_SYCL_Space.hpp>
+#include <SYCL/Kokkos_SYCL_DeepCopy.hpp>
+#include <SYCL/Kokkos_SYCL_Instance.hpp>
+#include <impl/Kokkos_MemorySpace.hpp>
+#include <impl/Kokkos_Profiling.hpp>
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+namespace Kokkos {
+namespace Impl {
+namespace {
+// Asynchronous USM copy on the given queue; returns the SYCL event.
+auto USM_memcpy(sycl::queue& q, void* dst, const void* src, size_t n) {
+  return q.memcpy(dst, src, n);
+}
+
+// Instance-scoped copy: enqueue on the instance's queue, event discarded
+// (callers synchronize via the execution space).
+void USM_memcpy(Kokkos::Experimental::Impl::SYCLInternal& space, void* dst,
+                const void* src, size_t n) {
+  (void)USM_memcpy(*space.m_queue, dst, src, n);
+}
+
+// Blocking copy on the singleton queue: fences the default execution space
+// first, then waits on the memcpy event before returning.
+void USM_memcpy(void* dst, const void* src, size_t n) {
+  Experimental::SYCL().fence();
+  auto event = USM_memcpy(
+      *Experimental::Impl::SYCLInternal::singleton().m_queue, dst, src, n);
+  Experimental::Impl::SYCLInternal::fence(event);
+}
+}  // namespace
+
+// DeepCopy specializations: the instance overloads enqueue asynchronously on
+// the given execution space; the instance-less overloads are fully blocking.
+DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+         Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::Experimental::SYCL>::
+    DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+             const void* src, size_t n) {
+  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
+}
+
+DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+         Kokkos::Experimental::SYCLDeviceUSMSpace,
+         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
+                                               size_t n) {
+  USM_memcpy(dst, src, n);
+}
+
+DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
+         Kokkos::Experimental::SYCL>::DeepCopy(const Kokkos::Experimental::SYCL&
+                                                   instance,
+                                               void* dst, const void* src,
+                                               size_t n) {
+  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
+}
+
+DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
+         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
+                                               size_t n) {
+  USM_memcpy(dst, src, n);
+}
+
+DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
+         Kokkos::Experimental::SYCL>::DeepCopy(const Kokkos::Experimental::SYCL&
+                                                   instance,
+                                               void* dst, const void* src,
+                                               size_t n) {
+  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
+}
+
+DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
+         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
+                                               size_t n) {
+  USM_memcpy(dst, src, n);
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+
+// Default-constructed spaces bind to the singleton instance's queue.
+SYCLDeviceUSMSpace::SYCLDeviceUSMSpace()
+    : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {}
+SYCLDeviceUSMSpace::SYCLDeviceUSMSpace(sycl::queue queue)
+    : m_queue(std::move(queue)) {}
+
+SYCLSharedUSMSpace::SYCLSharedUSMSpace()
+    : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {}
+SYCLSharedUSMSpace::SYCLSharedUSMSpace(sycl::queue queue)
+    : m_queue(std::move(queue)) {}
+
+// Shared implementation for device/shared USM allocation: performs the USM
+// malloc, throws RawMemoryAllocationFailure on failure, and reports the
+// allocation to the profiling tools when loaded.
+void* allocate_sycl(
+    const char* arg_label, const size_t arg_alloc_size,
+    const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle,
+    const RawMemoryAllocationFailure::AllocationMechanism failure_tag,
+    const sycl::usm::alloc allocation_kind, const sycl::queue& queue) {
+  void* const hostPtr = sycl::malloc(arg_alloc_size, queue, allocation_kind);
+
+  if (hostPtr == nullptr)
+    throw RawMemoryAllocationFailure(
+        arg_alloc_size, 1, RawMemoryAllocationFailure::FailureMode::Unknown,
+        failure_tag);
+
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    // Report the logical (user-requested) size when available.
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::allocateData(arg_handle, arg_label, hostPtr,
+                                    reported_size);
+  }
+
+  return hostPtr;
+}
+
+void* SYCLDeviceUSMSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+
+void* SYCLDeviceUSMSpace::allocate(const char* arg_label,
+                                   const size_t arg_alloc_size,
+                                   const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocDevice,
+      sycl::usm::alloc::device, m_queue);
+}
+
+void* SYCLSharedUSMSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+void* SYCLSharedUSMSpace::allocate(const char* arg_label,
+                                   const size_t arg_alloc_size,
+                                   const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocShared,
+      sycl::usm::alloc::shared, m_queue);
+}
+
+// Shared implementation for USM deallocation: reports to profiling tools
+// when loaded, then frees via sycl::free on the owning queue.
+void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                     const size_t arg_alloc_size, const size_t arg_logical_size,
+                     const Kokkos::Tools::SpaceHandle arg_handle,
+                     const sycl::queue& queue) {
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
+                                      reported_size);
+  }
+
+  sycl::free(arg_alloc_ptr, queue);
+}
+
+void SYCLDeviceUSMSpace::deallocate(void* const arg_alloc_ptr,
+                                    const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+void SYCLDeviceUSMSpace::deallocate(const char* arg_label,
+                                    void* const arg_alloc_ptr,
+                                    const size_t arg_alloc_size,
+                                    const size_t arg_logical_size) const {
+  sycl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size,
+                  Kokkos::Tools::make_space_handle(name()), m_queue);
+}
+
+void SYCLSharedUSMSpace::deallocate(void* const arg_alloc_ptr,
+                                    const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void SYCLSharedUSMSpace::deallocate(const char* arg_label,
+                                    void* const arg_alloc_ptr,
+                                    const size_t arg_alloc_size,
+                                    const size_t arg_logical_size) const {
+  sycl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size,
+                  Kokkos::Tools::make_space_handle(name()), m_queue);
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+#ifdef KOKKOS_ENABLE_DEBUG
+SharedAllocationRecord<void, void> SharedAllocationRecord<
+    Kokkos::Experimental::SYCLDeviceUSMSpace, void>::s_root_record;
+
+SharedAllocationRecord<void, void> SharedAllocationRecord<
+    Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record;
+#endif
+
+// Device-USM record: the header is not host-accessible, so it is filled on
+// the host and deep-copied into the allocation.
+SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::SYCLDeviceUSMSpace& space,
+        const std::string& label, const size_t size,
+        const SharedAllocationRecord<void, void>::function_type dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                                  void>::s_root_record,
+#endif
+          Kokkos::Impl::checked_allocation_with_header(space, label, size),
+          sizeof(SharedAllocationHeader) + size, dealloc),
+      m_space(space) {
+  SharedAllocationHeader header;
+
+  this->base_t::_fill_host_accessible_header_info(header, label);
+
+  // Copy to device memory
+  Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, HostSpace>(
+      RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader));
+}
+
+// Shared-USM record: the header is host-accessible and can be written
+// in place, no deep copy needed.
+SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace,
+                                  void>::s_root_record,
+#endif
+          Impl::checked_allocation_with_header(arg_space, arg_label,
+                                               arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
+      m_space(arg_space) {
+
+  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                  arg_label);
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// NOTE(review): this destructor definition is truncated at the end of the
+// visible chunk; reproduced verbatim up to the cut.
+SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                       void>::~SharedAllocationRecord() {
+  const char* label = nullptr;
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    SharedAllocationHeader header;
+    Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                           Kokkos::HostSpace>(&header, RecordBase::m_alloc_ptr,
+                                              sizeof(SharedAllocationHeader));
+    label = header.label();
+  }
+  const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
+
m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr, + alloc_size, alloc_size - sizeof(SharedAllocationHeader)); +} + +SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, + void>::~SharedAllocationRecord() { + const char* label = nullptr; + if (Kokkos::Profiling::profileLibraryLoaded()) { + label = RecordBase::m_alloc_ptr->m_label; + } + const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size; + m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr, + alloc_size, alloc_size - sizeof(SharedAllocationHeader)); +} + +//---------------------------------------------------------------------------- + +} // namespace Impl +} // namespace Kokkos + +//============================================================================== +// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1 + +#include <impl/Kokkos_SharedAlloc_timpl.hpp> + +namespace Kokkos { +namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) not +// performance sensitive, we explicity instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. 
// Explicit instantiations: device-USM records need the host-inaccessible
// variant (header lives in device memory); shared-USM records are
// host-accessible and use the plain common base.
template class HostInaccessibleSharedAllocationRecordCommon<
    Kokkos::Experimental::SYCLDeviceUSMSpace>;
template class SharedAllocationRecordCommon<
    Kokkos::Experimental::SYCLDeviceUSMSpace>;
template class SharedAllocationRecordCommon<
    Kokkos::Experimental::SYCLSharedUSMSpace>;

}  // namespace Impl
}  // namespace Kokkos

// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1
//==============================================================================
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a30cf2109a60ccc5934bfc6ee834a831c539d485
--- /dev/null
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -0,0 +1,816 @@
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SYCL_TEAM_HPP +#define KOKKOS_SYCL_TEAM_HPP + +#include <Kokkos_Macros.hpp> + +#ifdef KOKKOS_ENABLE_SYCL + +#include <utility> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/**\brief Team member_type passed to TeamPolicy or TeamTask closures. 
 */
class SYCLTeamMember {
 public:
  using execution_space      = Kokkos::Experimental::SYCL;
  using scratch_memory_space = execution_space::scratch_memory_space;

 private:
  // Buffer used by team_broadcast/team_reduce/team_scan; it is the first
  // m_team_reduce_size bytes of the team-shared allocation (see constructor).
  mutable void* m_team_reduce;
  scratch_memory_space m_team_shared;
  int m_team_reduce_size;  // bytes available at m_team_reduce
  sycl::nd_item<2> m_item;

 public:
  // Team-level (level 0) scratch accessor.
  KOKKOS_INLINE_FUNCTION
  const execution_space::scratch_memory_space& team_shmem() const {
    return m_team_shared.set_team_thread_mode(0, 1, 0);
  }

  // Team-level scratch accessor for the given scratch level.
  KOKKOS_INLINE_FUNCTION
  const execution_space::scratch_memory_space& team_scratch(
      const int level) const {
    return m_team_shared.set_team_thread_mode(level, 1, 0);
  }

  // Per-thread slice of the scratch at the given level.
  KOKKOS_INLINE_FUNCTION
  const execution_space::scratch_memory_space& thread_scratch(
      const int level) const {
    return m_team_shared.set_team_thread_mode(level, team_size(), team_rank());
  }

  KOKKOS_INLINE_FUNCTION int league_rank() const {
    return m_item.get_group_linear_id();
  }
  KOKKOS_INLINE_FUNCTION int league_size() const {
    // FIXME_SYCL needs to be revised for vector_length>1.
    return m_item.get_group_range(0);
  }
  KOKKOS_INLINE_FUNCTION int team_rank() const {
    return m_item.get_local_linear_id();
  }
  KOKKOS_INLINE_FUNCTION int team_size() const {
    // FIXME_SYCL needs to be revised for vector_length>1.
    return m_item.get_local_range(0);
  }
  KOKKOS_INLINE_FUNCTION void team_barrier() const { m_item.barrier(); }

  // Access to the underlying SYCL work-item (used by the nested patterns).
  KOKKOS_INLINE_FUNCTION const sycl::nd_item<2>& item() const { return m_item; }

  //--------------------------------------------------------------------------

  // Broadcast val from the thread with rank thread_id to the whole team,
  // staging it through the team-reduce buffer.
  template <class ValueType>
  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& val,
                                             const int thread_id) const {
    // Wait for shared data write until all threads arrive here
    m_item.barrier(sycl::access::fence_space::local_space);
    if (m_item.get_local_id(1) == 0 &&
        static_cast<int>(m_item.get_local_id(0)) == thread_id) {
      *static_cast<ValueType*>(m_team_reduce) = val;
    }
    // Wait for shared data read until root thread writes
    m_item.barrier(sycl::access::fence_space::local_space);
    val = *static_cast<ValueType*>(m_team_reduce);
  }

  // Apply f to val first, then broadcast the result from thread_id.
  template <class Closure, class ValueType>
  KOKKOS_INLINE_FUNCTION void team_broadcast(Closure const& f, ValueType& val,
                                             const int thread_id) const {
    f(val);
    team_broadcast(val, thread_id);
  }

  //--------------------------------------------------------------------------
  /**\brief Reduction across a team
   */
  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION
      typename std::enable_if<is_reducer<ReducerType>::value>::type
      team_reduce(ReducerType const& reducer) const noexcept {
    team_reduce(reducer, reducer.reference());
  }

  // Team-wide reduction of value; the result is written to reducer.reference()
  // on every thread.  Works in chunks of maximum_work_range because the
  // team-reduce buffer may be smaller than the team.
  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION
      typename std::enable_if<is_reducer<ReducerType>::value>::type
      team_reduce(ReducerType const& reducer,
                  typename ReducerType::value_type& value) const noexcept {
    using value_type = typename ReducerType::value_type;

    // We need to chunk up the whole reduction because we might not have
    // allocated enough memory.
    const int maximum_work_range =
        std::min<int>(m_team_reduce_size / sizeof(value_type), team_size());

    int smaller_power_of_two = 1;
    while ((smaller_power_of_two << 1) < maximum_work_range)
      smaller_power_of_two <<= 1;

    const int idx        = team_rank();
    auto reduction_array = static_cast<value_type*>(m_team_reduce);

    // Load values into the first maximum_work_range values of the reduction
    // array in chunks. This means that only threads with an id in the
    // corresponding chunk load values and the reduction is always done by the
    // first smaller_power_of_two threads.
    if (idx < maximum_work_range) reduction_array[idx] = value;
    m_item.barrier(sycl::access::fence_space::local_space);

    for (int start = maximum_work_range; start < team_size();
         start += maximum_work_range) {
      if (idx >= start &&
          idx < std::min(start + maximum_work_range, team_size()))
        reducer.join(reduction_array[idx - start], value);
      m_item.barrier(sycl::access::fence_space::local_space);
    }

    // Tree reduction over the first maximum_work_range slots.
    for (int stride = smaller_power_of_two; stride > 0; stride >>= 1) {
      if (idx < stride && idx + stride < maximum_work_range)
        reducer.join(reduction_array[idx], reduction_array[idx + stride]);
      m_item.barrier(sycl::access::fence_space::local_space);
    }
    reducer.reference() = reduction_array[0];
    // Protect the buffer until every thread has read the result.
    m_item.barrier(sycl::access::fence_space::local_space);
  }

  // FIXME_SYCL move somewhere else and combine with other places that do
  // parallel_scan
  // Exclusive scan returning the total sum.
  // n is required to be a power of two and
  // temp must point to an array containing the data to be processed
  // The accumulated value is returned.
  template <typename Type>
  static Type prescan(sycl::nd_item<2> m_item, Type* temp, int n) {
    int thid = m_item.get_local_id(0);

    // First do a reduction saving intermediate results
    for (int stride = 1; stride < n; stride <<= 1) {
      auto idx = 2 * stride * (thid + 1) - 1;
      if (idx < n) temp[idx] += temp[idx - stride];
      m_item.barrier(sycl::access::fence_space::local_space);
    }

    Type total_sum = temp[n - 1];
    m_item.barrier(sycl::access::fence_space::local_space);

    // clear the last element so we get an exclusive scan
    if (thid == 0) temp[n - 1] = Type{};
    m_item.barrier(sycl::access::fence_space::local_space);

    // Now add the intermediate results to the remaining items again
    // (down-sweep phase of the work-efficient scan).
    for (int stride = n / 2; stride > 0; stride >>= 1) {
      auto idx = 2 * stride * (thid + 1) - 1;
      if (idx < n) {
        Type dummy         = temp[idx - stride];
        temp[idx - stride] = temp[idx];
        temp[idx] += dummy;
      }
      m_item.barrier(sycl::access::fence_space::local_space);
    }

    return total_sum;
  }

  //--------------------------------------------------------------------------
  /** \brief Intra-team exclusive prefix sum with team_rank() ordering
   *  with intra-team non-deterministic ordering accumulation.
   *
   *  The global inter-team accumulation value will, at the end of the
   *  league's parallel execution, be the scan's total.
   *  Parallel execution ordering of the league's teams is non-deterministic.
   *  As such the base value for each team's scan operation is similarly
   *  non-deterministic.
   */
  template <typename Type>
  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value,
                                        Type* const global_accum) const {
    // We need to chunk up the whole reduction because we might not have
    // allocated enough memory.
    const int maximum_work_range =
        std::min<int>(m_team_reduce_size / sizeof(Type), team_size());

    int not_greater_power_of_two = 1;
    while ((not_greater_power_of_two << 1) < maximum_work_range + 1)
      not_greater_power_of_two <<= 1;

    Type intermediate;
    Type total{};

    const int idx        = team_rank();
    const auto base_data = static_cast<Type*>(m_team_reduce);

    // Load values into the first not_greater_power_of_two values of the
    // reduction array in chunks. This means that only threads with an id in
    // the corresponding chunk load values and the reduction is always done by
    // the first not_greater_power_of_two threads.
    for (int start = 0; start < team_size();
         start += not_greater_power_of_two) {
      m_item.barrier(sycl::access::fence_space::local_space);
      if (idx >= start && idx < start + not_greater_power_of_two) {
        base_data[idx - start] = value;
      }
      m_item.barrier(sycl::access::fence_space::local_space);

      const Type partial_total =
          prescan(m_item, base_data, not_greater_power_of_two);
      if (idx >= start && idx < start + not_greater_power_of_two)
        intermediate = base_data[idx - start] + total;
      if (start == 0)
        total = partial_total;
      else
        total += partial_total;
    }

    if (global_accum) {
      // NOTE(review): element base_data[team_size()] is used as staging for
      // the previous global total; assumes the reduce buffer holds at least
      // team_size()+1 Type values -- confirm against the scratch sizing.
      if (team_size() == idx + 1) {
        base_data[team_size()] = atomic_fetch_add(global_accum, total);
      }
      m_item.barrier();  // Wait for atomic
      intermediate += base_data[team_size()];
    }

    return intermediate;
  }

  /** \brief Intra-team exclusive prefix sum with team_rank() ordering.
   *
   *  The highest rank thread can compute the reduction total as
   *    reduction_total = dev.team_scan( value ) + value ;
   */
  template <typename Type>
  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const {
    return this->template team_scan<Type>(value, nullptr);
  }

  //----------------------------------------

  // Reduction over the vector lanes of the calling thread.
  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION static
      typename std::enable_if<is_reducer<ReducerType>::value>::type
      vector_reduce(ReducerType const& reducer) {
    vector_reduce(reducer, reducer.reference());
  }

  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION static
      typename std::enable_if<is_reducer<ReducerType>::value>::type
      vector_reduce(ReducerType const& /*reducer*/,
                    typename ReducerType::value_type& /*value*/) {
    // FIXME_SYCL not yet implemented for the SYCL backend
    Kokkos::abort("Not implemented!");
  }

  //--------------------------------------------------------------------------
  /**\brief Global reduction across all blocks
   *
   *  Return !0 if reducer contains the final value
   */
  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION static
      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
      global_reduce(ReducerType const& /*reducer*/,
                    int* const /*global_scratch_flags*/,
                    void* const /*global_scratch_space*/, void* const /*shmem*/,
                    int const /*shmem_size*/) {
    // FIXME_SYCL not yet implemented for the SYCL backend
    Kokkos::abort("Not implemented!");
  }

  //----------------------------------------
  // Private for the driver

  // shared: base of the team-shared allocation; the first shared_begin bytes
  // are reserved for the team-reduce buffer, the remainder is level-0 scratch.
  KOKKOS_INLINE_FUNCTION
  SYCLTeamMember(void* shared, const int shared_begin, const int shared_size,
                 void* scratch_level_1_ptr, const int scratch_level_1_size,
                 const sycl::nd_item<2> item)
      : m_team_reduce(shared),
        m_team_shared(static_cast<char*>(shared) + shared_begin, shared_size,
                      scratch_level_1_ptr, scratch_level_1_size),
        m_team_reduce_size(shared_begin),
        m_item(item) {}

 public:
  // Declare to avoid unused private member warnings which are triggered
  // when SFINAE excludes the member function which uses these variables.
  // Making another class a friend also suppresses these warnings.
  bool impl_avoid_sfinae_warning() const noexcept {
    return m_team_reduce_size > 0 && m_team_reduce != nullptr;
  }
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// Bounds [start, end) for a TeamThreadRange over a SYCL team.
template <typename iType>
struct TeamThreadRangeBoundariesStruct<iType, SYCLTeamMember> {
  using index_type = iType;
  const SYCLTeamMember& member;
  const iType start;
  const iType end;

  KOKKOS_INLINE_FUNCTION
  TeamThreadRangeBoundariesStruct(const SYCLTeamMember& thread_, iType count)
      : member(thread_), start(0), end(count) {}

  KOKKOS_INLINE_FUNCTION
  TeamThreadRangeBoundariesStruct(const SYCLTeamMember& thread_, iType begin_,
                                  iType end_)
      : member(thread_), start(begin_), end(end_) {}
};

// Bounds [start, end) for a TeamVectorRange over a SYCL team.
template <typename iType>
struct TeamVectorRangeBoundariesStruct<iType, SYCLTeamMember> {
  using index_type = iType;
  const SYCLTeamMember& member;
  const iType start;
  const iType end;

  KOKKOS_INLINE_FUNCTION
  TeamVectorRangeBoundariesStruct(const SYCLTeamMember& thread_,
                                  const iType& count)
      : member(thread_), start(0), end(count) {}

  KOKKOS_INLINE_FUNCTION
  TeamVectorRangeBoundariesStruct(const SYCLTeamMember& thread_,
                                  const iType& begin_, const iType& end_)
      : member(thread_), start(begin_), end(end_) {}
};

// Bounds [start, end) for a ThreadVectorRange over the calling thread's lanes.
template <typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, SYCLTeamMember> {
  using index_type = iType;
  const SYCLTeamMember& member;
  const index_type start;
  const index_type end;

  KOKKOS_INLINE_FUNCTION
  ThreadVectorRangeBoundariesStruct(const SYCLTeamMember& thread,
                                    index_type count)
      : member(thread), start(static_cast<index_type>(0)), end(count) {}

  KOKKOS_INLINE_FUNCTION
  ThreadVectorRangeBoundariesStruct(const
                                        SYCLTeamMember& thread,
                                    index_type arg_begin, index_type arg_end)
      : member(thread), start(arg_begin), end(arg_end) {}
};

}  // namespace Impl

// Factory functions building the bounds structs consumed by the nested
// parallel patterns (parallel_for/reduce/scan) defined below.

template <typename iType>
KOKKOS_INLINE_FUNCTION
    Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>
    TeamThreadRange(const Impl::SYCLTeamMember& thread, iType count) {
  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
      thread, count);
}

// begin/end overload; the index type is the common type of the two arguments.
template <typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
    typename std::common_type<iType1, iType2>::type, Impl::SYCLTeamMember>
TeamThreadRange(const Impl::SYCLTeamMember& thread, iType1 begin, iType2 end) {
  using iType = typename std::common_type<iType1, iType2>::type;
  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
      thread, iType(begin), iType(end));
}

template <typename iType>
KOKKOS_INLINE_FUNCTION
    Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>
    TeamVectorRange(const Impl::SYCLTeamMember& thread, const iType& count) {
  return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
      thread, count);
}

template <typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct<
    typename std::common_type<iType1, iType2>::type, Impl::SYCLTeamMember>
TeamVectorRange(const Impl::SYCLTeamMember& thread, const iType1& begin,
                const iType2& end) {
  using iType = typename std::common_type<iType1, iType2>::type;
  return Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
      thread, iType(begin), iType(end));
}

template <typename iType>
KOKKOS_INLINE_FUNCTION
    Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>
    ThreadVectorRange(const Impl::SYCLTeamMember& thread, iType count) {
  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
      thread, count);
}

template <typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
    typename std::common_type<iType1, iType2>::type, Impl::SYCLTeamMember>
ThreadVectorRange(const Impl::SYCLTeamMember& thread, iType1 arg_begin,
                  iType2 arg_end) {
  using iType = typename std::common_type<iType1, iType2>::type;
  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>(
      thread, iType(arg_begin), iType(arg_end));
}

// Tag for a code section executed once per team (see single() below).
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::SYCLTeamMember> PerTeam(
    const Impl::SYCLTeamMember& thread) {
  return Impl::ThreadSingleStruct<Impl::SYCLTeamMember>(thread);
}

// Tag for a code section executed once per thread (see single() below).
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::SYCLTeamMember> PerThread(
    const Impl::SYCLTeamMember& thread) {
  return Impl::VectorSingleStruct<Impl::SYCLTeamMember>(thread);
}

//----------------------------------------------------------------------------

/** \brief Inter-thread parallel_for.
 *
 *  Executes closure(iType i) for each i=[0..N).
 *
 *  The range [0..N) is mapped to all threads of the calling thread team.
 */
template <typename iType, class Closure>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
        loop_boundaries,
    const Closure& closure) {
  // FIXME_SYCL Fix for vector_length>1.
  // Cyclic distribution: thread t handles start+t, start+t+team_size, ...
  for (iType i = loop_boundaries.start +
                 loop_boundaries.member.item().get_local_id(0);
       i < loop_boundaries.end;
       i += loop_boundaries.member.item().get_local_range(0))
    closure(i);
}

//----------------------------------------------------------------------------

/** \brief Inter-thread parallel_reduce with a reducer.
 *
 *  Executes closure(iType i, ValueType & val) for each i=[0..N)
 *
 *  The range [0..N) is mapped to all threads of the
 *  calling thread team and a summation of val is
 *  performed and put into result.
 */
template <typename iType, class Closure, class ReducerType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
                        iType, Impl::SYCLTeamMember>& loop_boundaries,
                    const Closure& closure, const ReducerType& reducer) {
  // Each thread accumulates its partial result, then the team combines them.
  typename ReducerType::value_type value;
  reducer.init(value);

  // FIXME_SYCL Fix for vector_length>1.
  for (iType i = loop_boundaries.start +
                 loop_boundaries.member.item().get_local_id(0);
       i < loop_boundaries.end;
       i += loop_boundaries.member.item().get_local_range(0)) {
    closure(i, value);
  }

  loop_boundaries.member.team_reduce(reducer, value);
}

/** \brief Inter-thread parallel_reduce assuming summation.
 *
 *  Executes closure(iType i, ValueType & val) for each i=[0..N)
 *
 *  The range [0..N) is mapped to all threads of the
 *  calling thread team and a summation of val is
 *  performed and put into result.
 */
template <typename iType, class Closure, typename ValueType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
                        iType, Impl::SYCLTeamMember>& loop_boundaries,
                    const Closure& closure, ValueType& result) {
  // Non-reducer overload: wrap the plain value in a Sum reducer.
  ValueType val;
  Kokkos::Sum<ValueType> reducer(val);

  reducer.init(reducer.reference());

  // FIXME_SYCL Fix for vector_length>1.
  for (iType i = loop_boundaries.start +
                 loop_boundaries.member.item().get_local_id(0);
       i < loop_boundaries.end;
       i += loop_boundaries.member.item().get_local_range(0)) {
    closure(i, val);
  }

  loop_boundaries.member.team_reduce(reducer, val);
  result = reducer.reference();
}

/** \brief Inter-thread parallel exclusive prefix sum.
 *
 *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
 *
 *  The range [0..N) is mapped to each rank in the team (whose global rank is
 *  less than N) and a scan operation is performed.
 *  The last call to closure has final == true.
 */
// This is the same code as in CUDA and largely the same as in OpenMPTarget
template <typename iType, typename FunctorType>
KOKKOS_INLINE_FUNCTION void parallel_scan(
    const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
        loop_bounds,
    const FunctorType& lambda) {
  // Extract value_type from lambda
  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void,
      FunctorType>::value_type;

  const auto start     = loop_bounds.start;
  const auto end       = loop_bounds.end;
  auto& member         = loop_bounds.member;
  const auto team_size = member.team_size();
  const auto team_rank = member.team_rank();
  const auto nchunk    = (end - start + team_size - 1) / team_size;
  value_type accum     = 0;
  // each team has to process one or more chunks of the prefix scan
  for (iType i = 0; i < nchunk; ++i) {
    auto ii = start + i * team_size + team_rank;
    // local accumulation for this chunk
    value_type local_accum = 0;
    // user updates value with prefix value
    if (ii < loop_bounds.end) lambda(ii, local_accum, false);
    // perform team scan
    local_accum = member.team_scan(local_accum);
    // add this blocks accum to total accumulation
    auto val = accum + local_accum;
    // user updates their data with total accumulation
    if (ii < loop_bounds.end) lambda(ii, val, true);
    // the last value needs to be propagated to next chunk
    if (team_rank == team_size - 1) accum = val;
    // broadcast last value to rest of the team
    member.team_broadcast(accum, team_size - 1);
  }
}

// Team-vector parallel_for; with vector_length==1 this matches the
// TeamThreadRange version above.
template <typename iType, class Closure>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
        loop_boundaries,
    const Closure& closure) {
  // FIXME_SYCL adapt for vector_length != 1
  for (iType i = loop_boundaries.start +
                 loop_boundaries.member.item().get_local_id(0);
       i < loop_boundaries.end;
       i += loop_boundaries.member.item().get_local_range(0))
    closure(i);
}

// Team-vector parallel_reduce with a reducer.
template <typename iType, class Closure, class ReducerType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                        iType, Impl::SYCLTeamMember>& loop_boundaries,
                    const Closure& closure, const ReducerType& reducer) {
  // FIXME_SYCL adapt for vector_length != 1
  typename ReducerType::value_type value;
  reducer.init(value);

  for (iType i = loop_boundaries.start +
                 loop_boundaries.member.item().get_local_id(0);
       i < loop_boundaries.end;
       i += loop_boundaries.member.item().get_local_range(0)) {
    closure(i, value);
  }

  loop_boundaries.member.team_reduce(reducer, value);
}

// Team-vector parallel_reduce assuming summation.
template <typename iType, class Closure, typename ValueType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
    parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                        iType, Impl::SYCLTeamMember>& loop_boundaries,
                    const Closure& closure, ValueType& result) {
  // FIXME_SYCL adapt for vector_length != 1
  ValueType val;
  Kokkos::Sum<ValueType> reducer(val);

  reducer.init(reducer.reference());

  for (iType i = loop_boundaries.start +
                 loop_boundaries.member.item().get_local_id(0);
       i < loop_boundaries.end;
       i += loop_boundaries.member.item().get_local_range(0)) {
    closure(i, val);
  }

  loop_boundaries.member.team_reduce(reducer, val);
  result = reducer.reference();
}

//----------------------------------------------------------------------------

/** \brief Intra-thread vector parallel_for.
 *
 *  Executes closure(iType i) for each i=[0..N)
 *
 *  The range [0..N) is mapped to all vector lanes of the calling thread.
 */
template <typename iType, class Closure>
KOKKOS_INLINE_FUNCTION void parallel_for(
    const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
        loop_boundaries,
    const Closure& closure) {
  // FIXME_SYCL: adapt for vector_length!=1 -- with one lane this is a
  // plain serial loop on the calling thread.
  for (auto i = loop_boundaries.start; i != loop_boundaries.end; ++i)
    closure(i);
}

//----------------------------------------------------------------------------

/** \brief Intra-thread vector parallel_reduce.
 *
 *  Calls closure(iType i, ValueType & val) for each i=[0..N).
 *
 *  The range [0..N) is mapped to all vector lanes of
 *  the calling thread and a reduction of val is performed using +=
 *  and output into result.
 *
 *  The identity value for the += operator is assumed to be the default
 *  constructed value.
 */
template <typename iType, class Closure, class ReducerType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<is_reducer<ReducerType>::value>::type
    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                        iType, Impl::SYCLTeamMember> const& loop_boundaries,
                    Closure const& closure, ReducerType const& reducer) {
  // FIXME_SYCL adapt for vector_length != 1 -- serial accumulation suffices
  // for a single lane, so no cross-lane combine is done here.
  reducer.init(reducer.reference());

  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
    closure(i, reducer.reference());
  }
}

/** \brief Intra-thread vector parallel_reduce.
 *
 *  Calls closure(iType i, ValueType & val) for each i=[0..N).
 *
 *  The range [0..N) is mapped to all vector lanes of
 *  the calling thread and a reduction of val is performed using +=
 *  and output into result.
 *
 *  The identity value for the += operator is assumed to be the default
 *  constructed value.
 */
template <typename iType, class Closure, typename ValueType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<!is_reducer<ValueType>::value>::type
    parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                        iType, Impl::SYCLTeamMember> const& loop_boundaries,
                    Closure const& closure, ValueType& result) {
  // FIXME_SYCL adapt for vector_length != 1
  // Value-initialize the accumulator (identity of += per the contract above).
  result = ValueType();

  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
    closure(i, result);
  }
}

//----------------------------------------------------------------------------

/** \brief Intra-thread vector parallel exclusive prefix sum with reducer.
 *
 *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
 *
 *  The range [0..N) is mapped to all vector lanes in the
 *  thread and a scan operation is performed.
 *  The last call to closure has final == true.
 */
template <typename iType, class Closure, typename ReducerType>
KOKKOS_INLINE_FUNCTION
    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
    parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
                      iType, Impl::SYCLTeamMember>& loop_boundaries,
                  const Closure& closure, const ReducerType& reducer) {
  // FIXME_SYCL modify for vector_length!=1
  using value_type = typename Kokkos::Impl::FunctorAnalysis<
      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;

  value_type accum;
  reducer.init(accum);

  // Single lane: a serial pass is already the final scan, so every call
  // is made with final == true.
  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
    closure(i, accum, true);
  }
}

/** \brief Intra-thread vector parallel exclusive prefix sum.
 *
 *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
 *
 *  The range [0..N) is mapped to all vector lanes in the
 *  thread and a scan operation is performed.
 *  The last call to closure has final == true.
+ */ +template <typename iType, class Closure> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>& + loop_boundaries, + const Closure& closure) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + value_type dummy; + parallel_scan(loop_boundaries, closure, Kokkos::Sum<value_type>{dummy}); +} + +} // namespace Kokkos + +namespace Kokkos { + +template <class FunctorType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct<Impl::SYCLTeamMember>& single_struct, + const FunctorType& lambda) { + if (single_struct.team_member.item().get_local_id(1) == 0) lambda(); +} + +template <class FunctorType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct, + const FunctorType& lambda) { + if (single_struct.team_member.team_rank() == 0) lambda(); +} + +template <class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct<Impl::SYCLTeamMember>& single_struct, + const FunctorType& lambda, ValueType& val) { + if (single_struct.team_member.item().get_local_id(1) == 0) lambda(val); +} + +template <class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct, + const FunctorType& lambda, ValueType& val) { + if (single_struct.team_member.team_rank() == 0) lambda(val); +} + +} // namespace Kokkos + +#endif + +#endif /* #ifndef KOKKOS_SYCL_TEAM_HPP */ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp new file mode 100644 index 0000000000000000000000000000000000000000..141a692f6090555cf129997a64bc9e99941f830d --- /dev/null +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -0,0 +1,134 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SYCL_UNIQUE_TOKEN_HPP +#define KOKKOS_SYCL_UNIQUE_TOKEN_HPP + +#include <impl/Kokkos_ConcurrentBitset.hpp> +#include <Kokkos_SYCL_Space.hpp> +#include <Kokkos_UniqueToken.hpp> + +namespace Kokkos { +namespace Experimental { + +// both global and instance Unique Tokens are implemented in the same way +template <> +class UniqueToken<SYCL, UniqueTokenScope::Global> { + protected: + uint32_t volatile* m_buffer; + uint32_t m_count; + + public: + using execution_space = SYCL; + using size_type = int32_t; + + explicit UniqueToken(execution_space const& = execution_space()) + : m_buffer(Impl::SYCLInternal::singleton().m_scratchConcurrentBitset), + m_count(SYCL::concurrency()) {} + + KOKKOS_DEFAULTED_FUNCTION + UniqueToken(const UniqueToken&) = default; + + KOKKOS_DEFAULTED_FUNCTION + UniqueToken(UniqueToken&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + UniqueToken& operator=(const UniqueToken&) = default; + + KOKKOS_DEFAULTED_FUNCTION + UniqueToken& operator=(UniqueToken&&) = default; + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type size() const noexcept { return m_count; } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type acquire() const { + const Kokkos::pair<int, int> result = + Kokkos::Impl::concurrent_bitset::acquire_bounded( + m_buffer, m_count +#if defined(KOKKOS_ARCH_INTEL_GEN) + , + Kokkos::Impl::clock_tic() % m_count +#endif + ); + + if (result.first < 0) { + Kokkos::abort( + "UniqueToken<SYCL> failure to acquire tokens, no tokens available"); + } + + return result.first; + } + + /// \brief release an acquired value + KOKKOS_INLINE_FUNCTION + void release(size_type i) const noexcept { + Kokkos::Impl::concurrent_bitset::release(m_buffer, i); + } +}; + +template <> +class UniqueToken<SYCL, UniqueTokenScope::Instance> + : public UniqueToken<SYCL, UniqueTokenScope::Global> { + View<uint32_t*, SYCLDeviceUSMSpace> m_buffer_view; + + public: + explicit UniqueToken(execution_space const& arg = execution_space()) + : UniqueToken<SYCL, UniqueTokenScope::Global>(arg) {} + + UniqueToken(size_type max_size, execution_space const& = execution_space()) + : m_buffer_view( + "UniqueToken::m_buffer_view", + ::Kokkos::Impl::concurrent_bitset::buffer_bound(max_size)) { + m_buffer = m_buffer_view.data(); + m_count = max_size; + } +}; + +} // namespace Experimental +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp new file mode 100644 index 0000000000000000000000000000000000000000..92bd671bd53bf89482aee39cdd34b3391e9a01a2 --- /dev/null +++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -0,0 +1,852 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_THREADS) + +#include <cstdint> +#include <limits> +#include <utility> +#include <iostream> +#include <sstream> + +#include <Kokkos_Core.hpp> + +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_CPUDiscovery.hpp> +#include <impl/Kokkos_Tools.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +ThreadsExec s_threads_process; +ThreadsExec *s_threads_exec[ThreadsExec::MAX_THREAD_COUNT] = {nullptr}; +pthread_t s_threads_pid[ThreadsExec::MAX_THREAD_COUNT] = {0}; +std::pair<unsigned, unsigned> s_threads_coord[ThreadsExec::MAX_THREAD_COUNT]; + +int s_thread_pool_size[3] = {0, 0, 0}; + +unsigned s_current_reduce_size = 0; +unsigned s_current_shared_size = 0; + +void (*volatile s_current_function)(ThreadsExec &, const void *); +const void *volatile s_current_function_arg = nullptr; + +struct Sentinel { + ~Sentinel() { + if (s_thread_pool_size[0] || s_thread_pool_size[1] || + s_thread_pool_size[2] || s_current_reduce_size || + s_current_shared_size || s_current_function || s_current_function_arg || + s_threads_exec[0]) { + std::cerr << "ERROR : Process exiting while Kokkos::Threads is still " + "initialized" + << std::endl; + } + } +}; + +inline unsigned fan_size(const unsigned rank, const unsigned size) { + const unsigned rank_rev = size - (rank + 1); + unsigned count = 0; + for (unsigned n = 1; (rank_rev + n < size) && !(rank_rev & n); n <<= 1) { + ++count; + } + return count; +} + +} // namespace +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { 
+namespace Impl { + +void execute_function_noop(ThreadsExec &, const void *) {} + +void ThreadsExec::driver() { + SharedAllocationRecord<void, void>::tracking_enable(); + + ThreadsExec this_thread; + + while (ThreadsExec::Active == this_thread.m_pool_state) { + (*s_current_function)(this_thread, s_current_function_arg); + + // Deactivate thread and wait for reactivation + this_thread.m_pool_state = ThreadsExec::Inactive; + + wait_yield(this_thread.m_pool_state, ThreadsExec::Inactive); + } +} + +ThreadsExec::ThreadsExec() + : m_pool_base(nullptr), + m_scratch(nullptr), + m_scratch_reduce_end(0), + m_scratch_thread_end(0), + m_numa_rank(0), + m_numa_core_rank(0), + m_pool_rank(0), + m_pool_size(0), + m_pool_fan_size(0), + m_pool_state(ThreadsExec::Terminating) { + if (&s_threads_process != this) { + // A spawned thread + + ThreadsExec *const nil = nullptr; + + // Which entry in 's_threads_exec', possibly determined from hwloc binding + const int entry = + ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0]) + ? ((size_t)s_current_function_arg) + : size_t(Kokkos::hwloc::bind_this_thread(s_thread_pool_size[0], + s_threads_coord)); + + // Given a good entry set this thread in the 's_threads_exec' array + if (entry < s_thread_pool_size[0] && + nil == atomic_compare_exchange(s_threads_exec + entry, nil, this)) { + const std::pair<unsigned, unsigned> coord = + Kokkos::hwloc::get_this_thread_coordinate(); + + m_numa_rank = coord.first; + m_numa_core_rank = coord.second; + m_pool_base = s_threads_exec; + m_pool_rank = s_thread_pool_size[0] - (entry + 1); + m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); + m_pool_size = s_thread_pool_size[0]; + m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); + m_pool_state = ThreadsExec::Active; + + s_threads_pid[m_pool_rank] = pthread_self(); + + // Inform spawning process that the threads_exec entry has been set. 
+ s_threads_process.m_pool_state = ThreadsExec::Active; + } else { + // Inform spawning process that the threads_exec entry could not be set. + s_threads_process.m_pool_state = ThreadsExec::Terminating; + } + } else { + // Enables 'parallel_for' to execute on unitialized Threads device + m_pool_rank = 0; + m_pool_size = 1; + m_pool_state = ThreadsExec::Inactive; + + s_threads_pid[m_pool_rank] = pthread_self(); + } +} + +ThreadsExec::~ThreadsExec() { + const unsigned entry = m_pool_size - (m_pool_rank + 1); + + using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>; + + if (m_scratch) { + Record *const r = Record::get_record(m_scratch); + + m_scratch = nullptr; + + Record::decrement(r); + } + + m_pool_base = nullptr; + m_scratch_reduce_end = 0; + m_scratch_thread_end = 0; + m_numa_rank = 0; + m_numa_core_rank = 0; + m_pool_rank = 0; + m_pool_size = 0; + m_pool_fan_size = 0; + + m_pool_state = ThreadsExec::Terminating; + + if (&s_threads_process != this && entry < MAX_THREAD_COUNT) { + ThreadsExec *const nil = nullptr; + + atomic_compare_exchange(s_threads_exec + entry, this, nil); + + s_threads_process.m_pool_state = ThreadsExec::Terminating; + } +} + +int ThreadsExec::get_thread_count() { return s_thread_pool_size[0]; } + +ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { + ThreadsExec *const th = + init_thread_rank < s_thread_pool_size[0] + ? 
s_threads_exec[s_thread_pool_size[0] - (init_thread_rank + 1)] + : nullptr; + + if (nullptr == th || th->m_pool_rank != init_thread_rank) { + std::ostringstream msg; + msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " + << "thread " << init_thread_rank << " of " << s_thread_pool_size[0]; + if (nullptr == th) { + msg << " does not exist"; + } else { + msg << " has wrong thread_rank " << th->m_pool_rank; + } + Kokkos::Impl::throw_runtime_exception(msg.str()); + } + + return th; +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { + ThreadsExec::global_lock(); + ThreadsExec::global_unlock(); + + const int n = exec.m_pool_fan_size; + const int rank_rev = exec.m_pool_size - (exec.m_pool_rank + 1); + + for (int i = 0; i < n; ++i) { + Impl::spinwait_while_equal<int>( + exec.m_pool_base[rank_rev + (1 << i)]->m_pool_state, + ThreadsExec::Active); + } + + exec.m_pool_state = ThreadsExec::Inactive; +} + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void ThreadsExec::verify_is_process(const std::string &name, + const bool initialized) { + if (!is_process()) { + std::string msg(name); + msg.append( + " FAILED : Called by a worker thread, can only be called by the master " + "process."); + Kokkos::Impl::throw_runtime_exception(msg); + } + + if (initialized && 0 == s_thread_pool_size[0]) { + std::string msg(name); + msg.append(" FAILED : Threads not initialized."); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +int ThreadsExec::in_parallel() { + // A thread function is in execution and + // the function argument is not the special threads process argument and + // the master process is a worker or is not the master process. 
+ return s_current_function && (&s_threads_process != s_current_function_arg) && + (s_threads_process.m_pool_base || !is_process()); +} + +// Wait for root thread to become inactive +void ThreadsExec::fence() { + if (s_thread_pool_size[0]) { + // Wait for the root thread to complete: + Impl::spinwait_while_equal<int>(s_threads_exec[0]->m_pool_state, + ThreadsExec::Active); + } + + s_current_function = nullptr; + s_current_function_arg = nullptr; + + // Make sure function and arguments are cleared before + // potentially re-activating threads with a subsequent launch. + memory_fence(); +} + +/** \brief Begin execution of the asynchronous functor */ +void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), + const void *arg) { + verify_is_process("ThreadsExec::start", true); + + if (s_current_function || s_current_function_arg) { + Kokkos::Impl::throw_runtime_exception( + std::string("ThreadsExec::start() FAILED : already executing")); + } + + s_current_function = func; + s_current_function_arg = arg; + + // Make sure function and arguments are written before activating threads. 
+ memory_fence(); + + // Activate threads: + for (int i = s_thread_pool_size[0]; 0 < i--;) { + s_threads_exec[i]->m_pool_state = ThreadsExec::Active; + } + + if (s_threads_process.m_pool_size) { + // Master process is the root thread, run it: + (*func)(s_threads_process, arg); + s_threads_process.m_pool_state = ThreadsExec::Inactive; + } +} + +//---------------------------------------------------------------------------- + +bool ThreadsExec::sleep() { + verify_is_process("ThreadsExec::sleep", true); + + if (&execute_sleep == s_current_function) return false; + + fence(); + + ThreadsExec::global_lock(); + + s_current_function = &execute_sleep; + + // Activate threads: + for (unsigned i = s_thread_pool_size[0]; 0 < i;) { + s_threads_exec[--i]->m_pool_state = ThreadsExec::Active; + } + + return true; +} + +bool ThreadsExec::wake() { + verify_is_process("ThreadsExec::wake", true); + + if (&execute_sleep != s_current_function) return false; + + ThreadsExec::global_unlock(); + + if (s_threads_process.m_pool_base) { + execute_sleep(s_threads_process, nullptr); + s_threads_process.m_pool_state = ThreadsExec::Inactive; + } + + fence(); + + return true; +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::execute_serial(void (*func)(ThreadsExec &, const void *)) { + s_current_function = func; + s_current_function_arg = &s_threads_process; + + // Make sure function and arguments are written before activating threads. + memory_fence(); + + const unsigned begin = s_threads_process.m_pool_base ? 
1 : 0; + + for (unsigned i = s_thread_pool_size[0]; begin < i;) { + ThreadsExec &th = *s_threads_exec[--i]; + + th.m_pool_state = ThreadsExec::Active; + + wait_yield(th.m_pool_state, ThreadsExec::Active); + } + + if (s_threads_process.m_pool_base) { + s_threads_process.m_pool_state = ThreadsExec::Active; + (*func)(s_threads_process, nullptr); + s_threads_process.m_pool_state = ThreadsExec::Inactive; + } + + s_current_function_arg = nullptr; + s_current_function = nullptr; + + // Make sure function and arguments are cleared before proceeding. + memory_fence(); +} + +//---------------------------------------------------------------------------- + +void *ThreadsExec::root_reduce_scratch() { + return s_threads_process.reduce_memory(); +} + +void ThreadsExec::execute_resize_scratch(ThreadsExec &exec, const void *) { + using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>; + + if (exec.m_scratch) { + Record *const r = Record::get_record(exec.m_scratch); + + exec.m_scratch = nullptr; + + Record::decrement(r); + } + + exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end; + exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end; + + if (s_threads_process.m_scratch_thread_end) { + // Allocate tracked memory: + { + Record *const r = + Record::allocate(Kokkos::HostSpace(), "Kokkos::thread_scratch", + s_threads_process.m_scratch_thread_end); + + Record::increment(r); + + exec.m_scratch = r->data(); + } + + unsigned *ptr = reinterpret_cast<unsigned *>(exec.m_scratch); + + unsigned *const end = + ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned); + + // touch on this thread + while (ptr < end) *ptr++ = 0; + } +} + +void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { + enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; + + fence(); + + const size_t old_reduce_size = s_threads_process.m_scratch_reduce_end; + const size_t old_thread_size = s_threads_process.m_scratch_thread_end - + 
s_threads_process.m_scratch_reduce_end; + + reduce_size = (reduce_size + ALIGN_MASK) & ~ALIGN_MASK; + thread_size = (thread_size + ALIGN_MASK) & ~ALIGN_MASK; + + // Increase size or deallocate completely. + + if ((old_reduce_size < reduce_size) || (old_thread_size < thread_size) || + ((reduce_size == 0 && thread_size == 0) && + (old_reduce_size != 0 || old_thread_size != 0))) { + verify_is_process("ThreadsExec::resize_scratch", true); + + s_threads_process.m_scratch_reduce_end = reduce_size; + s_threads_process.m_scratch_thread_end = reduce_size + thread_size; + + execute_serial(&execute_resize_scratch); + + s_threads_process.m_scratch = s_threads_exec[0]->m_scratch; + } + + return s_threads_process.m_scratch; +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { + verify_is_process("ThreadsExec::print_configuration", false); + + fence(); + + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = + Kokkos::hwloc::get_available_threads_per_core(); + + // Forestall compiler warnings for unused variables. 
+ (void)numa_count; + (void)cores_per_numa; + (void)threads_per_core; + + s << "Kokkos::Threads"; + +#if defined(KOKKOS_ENABLE_THREADS) + s << " KOKKOS_ENABLE_THREADS"; +#endif +#if defined(KOKKOS_ENABLE_HWLOC) + s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" + << threads_per_core << "]"; +#endif + + if (s_thread_pool_size[0]) { + s << " threads[" << s_thread_pool_size[0] << "]" + << " threads_per_numa[" << s_thread_pool_size[1] << "]" + << " threads_per_core[" << s_thread_pool_size[2] << "]"; + if (nullptr == s_threads_process.m_pool_base) { + s << " Asynchronous"; + } + s << " ReduceScratch[" << s_current_reduce_size << "]" + << " SharedScratch[" << s_current_shared_size << "]"; + s << std::endl; + + if (detail) { + for (int i = 0; i < s_thread_pool_size[0]; ++i) { + ThreadsExec *const th = s_threads_exec[i]; + + if (th) { + const int rank_rev = th->m_pool_size - (th->m_pool_rank + 1); + + s << " Thread[ " << th->m_pool_rank << " : " << th->m_numa_rank << "." + << th->m_numa_core_rank << " ]"; + + s << " Fan{"; + for (int j = 0; j < th->m_pool_fan_size; ++j) { + ThreadsExec *const thfan = th->m_pool_base[rank_rev + (1 << j)]; + s << " [ " << thfan->m_pool_rank << " : " << thfan->m_numa_rank + << "." 
<< thfan->m_numa_core_rank << " ]"; + } + s << " }"; + + if (th == &s_threads_process) { + s << " is_process"; + } + } + s << std::endl; + } + } + } else { + s << " not initialized" << std::endl; + } +} + +//---------------------------------------------------------------------------- + +int ThreadsExec::is_initialized() { return nullptr != s_threads_exec[0]; } + +void ThreadsExec::initialize(unsigned thread_count, unsigned use_numa_count, + unsigned use_cores_per_numa, + bool allow_asynchronous_threadpool) { + // need to provide an initializer for Intel compilers + static const Sentinel sentinel = {}; + + const bool is_initialized = 0 != s_thread_pool_size[0]; + + unsigned thread_spawn_failed = 0; + + for (int i = 0; i < ThreadsExec::MAX_THREAD_COUNT; i++) + s_threads_exec[i] = nullptr; + + if (!is_initialized) { + // If thread_count, use_numa_count, or use_cores_per_numa are zero + // then they will be given default values based upon hwloc detection + // and allowed asynchronous execution. + + const bool hwloc_avail = Kokkos::hwloc::available(); + const bool hwloc_can_bind = + hwloc_avail && Kokkos::hwloc::can_bind_threads(); + + if (thread_count == 0) { + thread_count = hwloc_avail + ? Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core() + : 1; + } + + const unsigned thread_spawn_begin = hwloc::thread_mapping( + "Kokkos::Threads::initialize", allow_asynchronous_threadpool, + thread_count, use_numa_count, use_cores_per_numa, s_threads_coord); + + const std::pair<unsigned, unsigned> proc_coord = s_threads_coord[0]; + + if (thread_spawn_begin) { + // Synchronous with s_threads_coord[0] as the process core + // Claim entry #0 for binding the process core. 
+ s_threads_coord[0] = std::pair<unsigned, unsigned>(~0u, ~0u); + } + + s_thread_pool_size[0] = thread_count; + s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count; + s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa; + s_current_function = + &execute_function_noop; // Initialization work function + + for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { + s_threads_process.m_pool_state = ThreadsExec::Inactive; + + // If hwloc available then spawned thread will + // choose its own entry in 's_threads_coord' + // otherwise specify the entry. + s_current_function_arg = + (void *)static_cast<uintptr_t>(hwloc_can_bind ? ~0u : ith); + + // Make sure all outstanding memory writes are complete + // before spawning the new thread. + memory_fence(); + + // Spawn thread executing the 'driver()' function. + // Wait until spawned thread has attempted to initialize. + // If spawning and initialization is successful then + // an entry in 's_threads_exec' will be assigned. + if (ThreadsExec::spawn()) { + wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); + } + if (s_threads_process.m_pool_state == ThreadsExec::Terminating) break; + } + + // Wait for all spawned threads to deactivate before zeroing the function. + + for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { + // Try to protect against cache coherency failure by casting to volatile. 
+ ThreadsExec *const th = ((ThreadsExec * volatile *)s_threads_exec)[ith]; + if (th) { + wait_yield(th->m_pool_state, ThreadsExec::Active); + } else { + ++thread_spawn_failed; + } + } + + s_current_function = nullptr; + s_current_function_arg = nullptr; + s_threads_process.m_pool_state = ThreadsExec::Inactive; + + memory_fence(); + + if (!thread_spawn_failed) { + // Bind process to the core on which it was located before spawning + // occurred + if (hwloc_can_bind) { + Kokkos::hwloc::bind_this_thread(proc_coord); + } + + if (thread_spawn_begin) { // Include process in pool. + const std::pair<unsigned, unsigned> coord = + Kokkos::hwloc::get_this_thread_coordinate(); + + s_threads_exec[0] = &s_threads_process; + s_threads_process.m_numa_rank = coord.first; + s_threads_process.m_numa_core_rank = coord.second; + s_threads_process.m_pool_base = s_threads_exec; + s_threads_process.m_pool_rank = + thread_count - 1; // Reversed for scan-compatible reductions + s_threads_process.m_pool_size = thread_count; + s_threads_process.m_pool_fan_size = fan_size( + s_threads_process.m_pool_rank, s_threads_process.m_pool_size); + s_threads_pid[s_threads_process.m_pool_rank] = pthread_self(); + } else { + s_threads_process.m_pool_base = nullptr; + s_threads_process.m_pool_rank = 0; + s_threads_process.m_pool_size = 0; + s_threads_process.m_pool_fan_size = 0; + } + + // Initial allocations: + ThreadsExec::resize_scratch(1024, 1024); + } else { + s_thread_pool_size[0] = 0; + s_thread_pool_size[1] = 0; + s_thread_pool_size[2] = 0; + } + } + + if (is_initialized || thread_spawn_failed) { + std::ostringstream msg; + + msg << "Kokkos::Threads::initialize ERROR"; + + if (is_initialized) { + msg << " : already initialized"; + } + if (thread_spawn_failed) { + msg << " : failed to spawn " << thread_spawn_failed << " threads"; + } + + Kokkos::Impl::throw_runtime_exception(msg.str()); + } + + // Check for over-subscription + if (Kokkos::show_warnings() && + (Impl::mpi_ranks_per_node() * 
long(thread_count) > + Impl::processors_per_node())) { + std::cerr << "Kokkos::Threads::initialize WARNING: You are likely " + "oversubscribing your CPU cores." + << std::endl; + std::cerr << " Detected: " + << Impl::processors_per_node() << " cores per node." << std::endl; + std::cerr << " Detected: " + << Impl::mpi_ranks_per_node() << " MPI_ranks per node." + << std::endl; + std::cerr << " Requested: " + << thread_count << " threads per process." << std::endl; + } + + // Init the array for used for arbitrarily sized atomics + Impl::init_lock_array_host_space(); + + Impl::SharedAllocationRecord<void, void>::tracking_enable(); +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::finalize() { + verify_is_process("ThreadsExec::finalize", false); + + fence(); + + resize_scratch(0, 0); + + const unsigned begin = s_threads_process.m_pool_base ? 1 : 0; + + for (unsigned i = s_thread_pool_size[0]; begin < i--;) { + if (s_threads_exec[i]) { + s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating; + + wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); + + s_threads_process.m_pool_state = ThreadsExec::Inactive; + } + + s_threads_pid[i] = 0; + } + + if (s_threads_process.m_pool_base) { + (&s_threads_process)->~ThreadsExec(); + s_threads_exec[0] = nullptr; + } + + if (Kokkos::hwloc::can_bind_threads()) { + Kokkos::hwloc::unbind_this_thread(); + } + + s_thread_pool_size[0] = 0; + s_thread_pool_size[1] = 0; + s_thread_pool_size[2] = 0; + + // Reset master thread to run solo. 
+ s_threads_process.m_numa_rank = 0; + s_threads_process.m_numa_core_rank = 0; + s_threads_process.m_pool_base = nullptr; + s_threads_process.m_pool_rank = 0; + s_threads_process.m_pool_size = 1; + s_threads_process.m_pool_fan_size = 0; + s_threads_process.m_pool_state = ThreadsExec::Inactive; + + Kokkos::Profiling::finalize(); +} + +//---------------------------------------------------------------------------- + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +int Threads::concurrency() { return impl_thread_pool_size(0); } +void Threads::fence() const { Impl::ThreadsExec::fence(); } + +Threads &Threads::impl_instance(int) { + static Threads t; + return t; +} + +int Threads::impl_thread_pool_size(int depth) { + return Impl::s_thread_pool_size[depth]; +} + +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) +int Threads::impl_thread_pool_rank() { + const pthread_t pid = pthread_self(); + int i = 0; + while ((i < Impl::s_thread_pool_size[0]) && (pid != Impl::s_threads_pid[i])) { + ++i; + } + return i; +} +#endif + +const char *Threads::name() { return "Threads"; } + +namespace Impl { + +int g_threads_space_factory_initialized = + initialize_space_factory<ThreadsSpaceInitializer>("050_Threads"); + +void ThreadsSpaceInitializer::initialize(const InitArguments &args) { + const int num_threads = args.num_threads; + const int use_numa = args.num_numa; + if (std::is_same<Kokkos::Threads, Kokkos::DefaultExecutionSpace>::value || + std::is_same<Kokkos::Threads, + Kokkos::HostSpace::execution_space>::value) { + if (num_threads > 0) { + if (use_numa > 0) { + Kokkos::Threads::impl_initialize(num_threads, use_numa); + } else { + Kokkos::Threads::impl_initialize(num_threads); + } + } else { + Kokkos::Threads::impl_initialize(); + } + // std::cout << "Kokkos::initialize() fyi: Pthread enabled 
and initialized" + // << std::endl ; + } else { + // std::cout << "Kokkos::initialize() fyi: Pthread enabled but not + // initialized" << std::endl ; + } +} + +void ThreadsSpaceInitializer::finalize(const bool all_spaces) { + if (std::is_same<Kokkos::Threads, Kokkos::DefaultExecutionSpace>::value || + std::is_same<Kokkos::Threads, + Kokkos::HostSpace::execution_space>::value || + all_spaces) { + if (Kokkos::Threads::impl_is_initialized()) + Kokkos::Threads::impl_finalize(); + } +} + +void ThreadsSpaceInitializer::fence() { Kokkos::Threads::impl_static_fence(); } + +void ThreadsSpaceInitializer::print_configuration(std::ostream &msg, + const bool detail) { + msg << "Host Parallel Execution Space:" << std::endl; + msg << " KOKKOS_ENABLE_THREADS: "; + msg << "yes" << std::endl; + + msg << "\nThreads Runtime Configuration:" << std::endl; + Kokkos::Threads::print_configuration(msg, detail); +} + +} // namespace Impl +} /* namespace Kokkos */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +#else +void KOKKOS_CORE_SRC_THREADS_EXEC_PREVENT_LINK_ERROR() {} +#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */ diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1c8b3ac5f6a7685d2bec7d36b53fc657bf7ba1b9 --- /dev/null +++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -0,0 +1,760 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADSEXEC_HPP +#define KOKKOS_THREADSEXEC_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_THREADS) + +#include <cstdio> + +#include <utility> +#include <impl/Kokkos_Spinwait.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> + +#include <Kokkos_Atomic.hpp> + +#include <Kokkos_UniqueToken.hpp> +#include <impl/Kokkos_ConcurrentBitset.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +class ThreadsExec { + public: + // Fan array has log_2(NT) reduction threads plus 2 scan threads + // Currently limited to 16k threads. + enum { MAX_FAN_COUNT = 16 }; + enum { MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2) }; + enum { VECTOR_LENGTH = 8 }; + + /** \brief States of a worker thread */ + enum { + Terminating ///< Termination in progress + , + Inactive ///< Exists, waiting for work + , + Active ///< Exists, performing work + , + Rendezvous ///< Exists, waiting in a barrier or reduce + + , + ScanCompleted, + ScanAvailable, + ReductionAvailable + }; + + private: + friend class Kokkos::Threads; + + // Fan-in operations' root is the highest ranking thread + // to place the 'scan' reduction intermediate values on + // the threads that need them. + // For a simple reduction the thread location is arbitrary. 
+ + ThreadsExec *const *m_pool_base; ///< Base for pool fan-in + + void *m_scratch; + int m_scratch_reduce_end; + int m_scratch_thread_end; + int m_numa_rank; + int m_numa_core_rank; + int m_pool_rank; + int m_pool_rank_rev; + int m_pool_size; + int m_pool_fan_size; + int volatile m_pool_state; ///< State for global synchronizations + + // Members for dynamic scheduling + // Which thread am I stealing from currently + int m_current_steal_target; + // This thread's owned work_range + Kokkos::pair<long, long> m_work_range __attribute__((aligned(16))); + // Team Offset if one thread determines work_range for others + long m_team_work_index; + + // Is this thread stealing (i.e. its owned work_range is exhausted + bool m_stealing; + + static void global_lock(); + static void global_unlock(); + static bool spawn(); + + static void execute_resize_scratch(ThreadsExec &, const void *); + static void execute_sleep(ThreadsExec &, const void *); + + ThreadsExec(const ThreadsExec &); + ThreadsExec &operator=(const ThreadsExec &); + + static void execute_serial(void (*)(ThreadsExec &, const void *)); + + public: + KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size; } + KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank; } + KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank; } + KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank; } + inline long team_work_index() const { return m_team_work_index; } + + static int get_thread_count(); + static ThreadsExec *get_thread(const int init_thread_rank); + + inline void *reduce_memory() const { return m_scratch; } + KOKKOS_INLINE_FUNCTION void *scratch_memory() const { + return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end; + } + + KOKKOS_INLINE_FUNCTION int volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION ThreadsExec *const *pool_base() const { + return m_pool_base; + } + + static void driver(void); + + ~ThreadsExec(); + 
ThreadsExec(); + + static void *resize_scratch(size_t reduce_size, size_t thread_size); + + static void *root_reduce_scratch(); + + static bool is_process(); + + static void verify_is_process(const std::string &, const bool initialized); + + static int is_initialized(); + + static void initialize(unsigned thread_count, unsigned use_numa_count, + unsigned use_cores_per_numa, + bool allow_asynchronous_threadpool); + + static void finalize(); + + /* Given a requested team size, return valid team size */ + static unsigned team_size_valid(unsigned); + + static void print_configuration(std::ostream &, const bool detail = false); + + //------------------------------------ + + static void wait_yield(volatile int &, const int); + + //------------------------------------ + // All-thread functions: + + inline int all_reduce(const int value) { + // Make sure there is enough scratch space: + const int rev_rank = m_pool_size - (m_pool_rank + 1); + + *((volatile int *)reduce_memory()) = value; + + memory_fence(); + + // Fan-in reduction with highest ranking thread as the root + for (int i = 0; i < m_pool_fan_size; ++i) { + // Wait: Active -> Rendezvous + Impl::spinwait_while_equal<int>( + m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + } + + if (rev_rank) { + m_pool_state = ThreadsExec::Rendezvous; + // Wait: Rendezvous -> Active + Impl::spinwait_while_equal<int>(m_pool_state, ThreadsExec::Rendezvous); + } else { + // Root thread does the reduction and broadcast + + int accum = 0; + + for (int rank = 0; rank < m_pool_size; ++rank) { + accum += *((volatile int *)get_thread(rank)->reduce_memory()); + } + + for (int rank = 0; rank < m_pool_size; ++rank) { + *((volatile int *)get_thread(rank)->reduce_memory()) = accum; + } + + memory_fence(); + + for (int rank = 0; rank < m_pool_size; ++rank) { + get_thread(rank)->m_pool_state = ThreadsExec::Active; + } + } + + return *((volatile int *)reduce_memory()); + } + + inline void barrier() { + // Make sure there is 
enough scratch space: + const int rev_rank = m_pool_size - (m_pool_rank + 1); + + memory_fence(); + + // Fan-in reduction with highest ranking thread as the root + for (int i = 0; i < m_pool_fan_size; ++i) { + // Wait: Active -> Rendezvous + Impl::spinwait_while_equal<int>( + m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + } + + if (rev_rank) { + m_pool_state = ThreadsExec::Rendezvous; + // Wait: Rendezvous -> Active + Impl::spinwait_while_equal<int>(m_pool_state, ThreadsExec::Rendezvous); + } else { + // Root thread does the reduction and broadcast + + memory_fence(); + + for (int rank = 0; rank < m_pool_size; ++rank) { + get_thread(rank)->m_pool_state = ThreadsExec::Active; + } + } + } + + //------------------------------------ + // All-thread functions: + + template <class FunctorType, class ArgTag> + inline void fan_in_reduce(const FunctorType &f) const { + using Join = Kokkos::Impl::FunctorValueJoin<FunctorType, ArgTag>; + using Final = Kokkos::Impl::FunctorFinal<FunctorType, ArgTag>; + + const int rev_rank = m_pool_size - (m_pool_rank + 1); + + for (int i = 0; i < m_pool_fan_size; ++i) { + ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + + Impl::spinwait_while_equal<int>(fan.m_pool_state, ThreadsExec::Active); + + Join::join(f, reduce_memory(), fan.reduce_memory()); + } + + if (!rev_rank) { + Final::final(f, reduce_memory()); + } + + // This thread has updated 'reduce_memory()' and upon returning + // from this function will set 'm_pool_state' to inactive. + // If this is a non-root thread then setting 'm_pool_state' + // to inactive triggers another thread to exit a spinwait + // and read the 'reduce_memory'. + // Must 'memory_fence()' to guarantee that storing the update to + // 'reduce_memory()' will complete before storing the the update to + // 'm_pool_state'. 
+ + memory_fence(); + } + + inline void fan_in() const { + const int rev_rank = m_pool_size - (m_pool_rank + 1); + + for (int i = 0; i < m_pool_fan_size; ++i) { + Impl::spinwait_while_equal<int>( + m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + } + } + + template <class FunctorType, class ArgTag> + inline void scan_large(const FunctorType &f) { + // Sequence of states: + // 0) Active : entry and exit state + // 1) ReductionAvailable : reduction value available + // 2) ScanAvailable : inclusive scan value available + // 3) Rendezvous : All threads inclusive scan value are available + // 4) ScanCompleted : exclusive scan value copied + + using Traits = Kokkos::Impl::FunctorValueTraits<FunctorType, ArgTag>; + using Join = Kokkos::Impl::FunctorValueJoin<FunctorType, ArgTag>; + using Init = Kokkos::Impl::FunctorValueInit<FunctorType, ArgTag>; + + using scalar_type = typename Traits::value_type; + + const int rev_rank = m_pool_size - (m_pool_rank + 1); + const unsigned count = Traits::value_count(f); + + scalar_type *const work_value = (scalar_type *)reduce_memory(); + + //-------------------------------- + // Fan-in reduction with highest ranking thread as the root + for (int i = 0; i < m_pool_fan_size; ++i) { + ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + + // Wait: Active -> ReductionAvailable (or ScanAvailable) + Impl::spinwait_while_equal<int>(fan.m_pool_state, ThreadsExec::Active); + Join::join(f, work_value, fan.reduce_memory()); + } + + // Copy reduction value to scan value before releasing from this phase. + for (unsigned i = 0; i < count; ++i) { + work_value[i + count] = work_value[i]; + } + + if (rev_rank) { + // Set: Active -> ReductionAvailable + m_pool_state = ThreadsExec::ReductionAvailable; + + // Wait for contributing threads' scan value to be available. 
+ if ((1 << m_pool_fan_size) < (m_pool_rank + 1)) { + ThreadsExec &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; + + // Wait: Active -> ReductionAvailable + // Wait: ReductionAvailable -> ScanAvailable + Impl::spinwait_while_equal<int>(th.m_pool_state, ThreadsExec::Active); + Impl::spinwait_while_equal<int>(th.m_pool_state, + ThreadsExec::ReductionAvailable); + + Join::join(f, work_value + count, + ((scalar_type *)th.reduce_memory()) + count); + } + + // This thread has completed inclusive scan + // Set: ReductionAvailable -> ScanAvailable + m_pool_state = ThreadsExec::ScanAvailable; + + // Wait for all threads to complete inclusive scan + // Wait: ScanAvailable -> Rendezvous + Impl::spinwait_while_equal<int>(m_pool_state, ThreadsExec::ScanAvailable); + } + + //-------------------------------- + + for (int i = 0; i < m_pool_fan_size; ++i) { + ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + // Wait: ReductionAvailable -> ScanAvailable + Impl::spinwait_while_equal<int>(fan.m_pool_state, + ThreadsExec::ReductionAvailable); + // Set: ScanAvailable -> Rendezvous + fan.m_pool_state = ThreadsExec::Rendezvous; + } + + // All threads have completed the inclusive scan. + // All non-root threads are in the Rendezvous state. + // Threads are free to overwrite their reduction value. 
+ //-------------------------------- + + if ((rev_rank + 1) < m_pool_size) { + // Exclusive scan: copy the previous thread's inclusive scan value + + ThreadsExec &th = *m_pool_base[rev_rank + 1]; // Not the root thread + + const scalar_type *const src_value = + ((scalar_type *)th.reduce_memory()) + count; + + for (unsigned j = 0; j < count; ++j) { + work_value[j] = src_value[j]; + } + } else { + (void)Init::init(f, work_value); + } + + //-------------------------------- + // Wait for all threads to copy previous thread's inclusive scan value + // Wait for all threads: Rendezvous -> ScanCompleted + for (int i = 0; i < m_pool_fan_size; ++i) { + Impl::spinwait_while_equal<int>( + m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadsExec::Rendezvous); + } + if (rev_rank) { + // Set: ScanAvailable -> ScanCompleted + m_pool_state = ThreadsExec::ScanCompleted; + // Wait: ScanCompleted -> Active + Impl::spinwait_while_equal<int>(m_pool_state, ThreadsExec::ScanCompleted); + } + // Set: ScanCompleted -> Active + for (int i = 0; i < m_pool_fan_size; ++i) { + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + } + } + + template <class FunctorType, class ArgTag> + inline void scan_small(const FunctorType &f) { + using Traits = Kokkos::Impl::FunctorValueTraits<FunctorType, ArgTag>; + using Join = Kokkos::Impl::FunctorValueJoin<FunctorType, ArgTag>; + using Init = Kokkos::Impl::FunctorValueInit<FunctorType, ArgTag>; + + using scalar_type = typename Traits::value_type; + + const int rev_rank = m_pool_size - (m_pool_rank + 1); + const unsigned count = Traits::value_count(f); + + scalar_type *const work_value = (scalar_type *)reduce_memory(); + + //-------------------------------- + // Fan-in reduction with highest ranking thread as the root + for (int i = 0; i < m_pool_fan_size; ++i) { + // Wait: Active -> Rendezvous + Impl::spinwait_while_equal<int>( + m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + } + + for (unsigned i = 0; i < 
count; ++i) { + work_value[i + count] = work_value[i]; + } + + if (rev_rank) { + m_pool_state = ThreadsExec::Rendezvous; + // Wait: Rendezvous -> Active + Impl::spinwait_while_equal<int>(m_pool_state, ThreadsExec::Rendezvous); + } else { + // Root thread does the thread-scan before releasing threads + + scalar_type *ptr_prev = nullptr; + + for (int rank = 0; rank < m_pool_size; ++rank) { + scalar_type *const ptr = + (scalar_type *)get_thread(rank)->reduce_memory(); + if (rank) { + for (unsigned i = 0; i < count; ++i) { + ptr[i] = ptr_prev[i + count]; + } + Join::join(f, ptr + count, ptr); + } else { + (void)Init::init(f, ptr); + } + ptr_prev = ptr; + } + } + + for (int i = 0; i < m_pool_fan_size; ++i) { + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + } + } + + //------------------------------------ + /** \brief Wait for previous asynchronous functor to + * complete and release the Threads device. + * Acquire the Threads device and start this functor. + */ + static void start(void (*)(ThreadsExec &, const void *), const void *); + + static int in_parallel(); + static void fence(); + static bool sleep(); + static bool wake(); + + /* Dynamic Scheduling related functionality */ + // Initialize the work range for this thread + inline void set_work_range(const long &begin, const long &end, + const long &chunk_size) { + m_work_range.first = (begin + chunk_size - 1) / chunk_size; + m_work_range.second = + end > 0 ? 
(end + chunk_size - 1) / chunk_size : m_work_range.first; + } + + // Claim and index from this thread's range from the beginning + inline long get_work_index_begin() { + Kokkos::pair<long, long> work_range_new = m_work_range; + Kokkos::pair<long, long> work_range_old = work_range_new; + if (work_range_old.first >= work_range_old.second) return -1; + + work_range_new.first += 1; + + bool success = false; + while (!success) { + work_range_new = Kokkos::atomic_compare_exchange( + &m_work_range, work_range_old, work_range_new); + success = ((work_range_new == work_range_old) || + (work_range_new.first >= work_range_new.second)); + work_range_old = work_range_new; + work_range_new.first += 1; + } + if (work_range_old.first < work_range_old.second) + return work_range_old.first; + else + return -1; + } + + // Claim and index from this thread's range from the end + inline long get_work_index_end() { + Kokkos::pair<long, long> work_range_new = m_work_range; + Kokkos::pair<long, long> work_range_old = work_range_new; + if (work_range_old.first >= work_range_old.second) return -1; + work_range_new.second -= 1; + bool success = false; + while (!success) { + work_range_new = Kokkos::atomic_compare_exchange( + &m_work_range, work_range_old, work_range_new); + success = ((work_range_new == work_range_old) || + (work_range_new.first >= work_range_new.second)); + work_range_old = work_range_new; + work_range_new.second -= 1; + } + if (work_range_old.first < work_range_old.second) + return work_range_old.second - 1; + else + return -1; + } + + // Reset the steal target + inline void reset_steal_target() { + m_current_steal_target = (m_pool_rank + 1) % pool_size(); + m_stealing = false; + } + + // Reset the steal target + inline void reset_steal_target(int team_size) { + m_current_steal_target = (m_pool_rank_rev + team_size); + if (m_current_steal_target >= pool_size()) + m_current_steal_target = 0; // pool_size()-1; + m_stealing = false; + } + + // Get a steal target; start with 
my-rank + 1 and go round robin, until
+  // arriving at this thread's rank. Returns -1 if no active steal target
+  // is available.
+  inline int get_steal_target() {
+    while ((m_pool_base[m_current_steal_target]->m_work_range.second <=
+            m_pool_base[m_current_steal_target]->m_work_range.first) &&
+           (m_current_steal_target != m_pool_rank)) {
+      m_current_steal_target = (m_current_steal_target + 1) % pool_size();
+    }
+    if (m_current_steal_target == m_pool_rank)
+      return -1;
+    else
+      return m_current_steal_target;
+  }
+
+  inline int get_steal_target(int team_size) {
+    while ((m_pool_base[m_current_steal_target]->m_work_range.second <=
+            m_pool_base[m_current_steal_target]->m_work_range.first) &&
+           (m_current_steal_target != m_pool_rank_rev)) {
+      if (m_current_steal_target + team_size < pool_size())
+        m_current_steal_target = (m_current_steal_target + team_size);
+      else
+        m_current_steal_target = 0;
+    }
+
+    if (m_current_steal_target == m_pool_rank_rev)
+      return -1;
+    else
+      return m_current_steal_target;
+  }
+
+  inline long steal_work_index(int team_size = 0) {
+    long index = -1;
+    int steal_target =
+        team_size > 0 ? get_steal_target(team_size) : get_steal_target();
+    while ((steal_target != -1) && (index == -1)) {
+      index = m_pool_base[steal_target]->get_work_index_end();
+      if (index == -1)
+        steal_target =
+            team_size > 0 ? get_steal_target(team_size) : get_steal_target();
+    }
+    return index;
+  }
+
+  // Get a work index. 
Claim from owned range until its exhausted, then steal + // from other thread + inline long get_work_index(int team_size = 0) { + long work_index = -1; + if (!m_stealing) work_index = get_work_index_begin(); + + if (work_index == -1) { + memory_fence(); + m_stealing = true; + work_index = steal_work_index(team_size); + } + + m_team_work_index = work_index; + memory_fence(); + return work_index; + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +inline int Threads::in_parallel() { return Impl::ThreadsExec::in_parallel(); } + +inline int Threads::impl_is_initialized() { + return Impl::ThreadsExec::is_initialized(); +} + +inline void Threads::impl_initialize(unsigned threads_count, + unsigned use_numa_count, + unsigned use_cores_per_numa, + bool allow_asynchronous_threadpool) { + Impl::ThreadsExec::initialize(threads_count, use_numa_count, + use_cores_per_numa, + allow_asynchronous_threadpool); +} + +inline void Threads::impl_finalize() { Impl::ThreadsExec::finalize(); } + +inline void Threads::print_configuration(std::ostream &s, const bool detail) { + Impl::ThreadsExec::print_configuration(s, detail); +} + +inline void Threads::impl_static_fence() { Impl::ThreadsExec::fence(); } +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Experimental { + +template <> +class UniqueToken<Threads, UniqueTokenScope::Instance> { + private: + using buffer_type = Kokkos::View<uint32_t *, Kokkos::HostSpace>; + int m_count; + buffer_type m_buffer_view; + uint32_t volatile *m_buffer; + + public: + using execution_space = Threads; + using size_type = int; + + /// \brief create object size for concurrency on the given instance 
+ /// + /// This object should not be shared between instances + UniqueToken(execution_space const & = execution_space()) noexcept + : m_count(::Kokkos::Threads::impl_thread_pool_size()), + m_buffer_view(buffer_type()), + m_buffer(nullptr) {} + + UniqueToken(size_type max_size, execution_space const & = execution_space()) + : m_count(max_size > ::Kokkos::Threads::impl_thread_pool_size() + ? ::Kokkos::Threads::impl_thread_pool_size() + : max_size), + m_buffer_view( + max_size > ::Kokkos::Threads::impl_thread_pool_size() + ? buffer_type() + : buffer_type("UniqueToken::m_buffer_view", + ::Kokkos::Impl::concurrent_bitset::buffer_bound( + m_count))), + m_buffer(m_buffer_view.data()) {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept { return m_count; } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + if (m_buffer == nullptr) { + return Threads::impl_thread_pool_rank(); + } else { + const ::Kokkos::pair<int, int> result = + ::Kokkos::Impl::concurrent_bitset::acquire_bounded( + m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count); + + if (result.first < 0) { + ::Kokkos::abort( + "UniqueToken<Threads> failure to acquire tokens, no tokens " + "available"); + } + return result.first; + } +#else + return 0; +#endif + } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release(int i) const noexcept { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + if (m_buffer != nullptr) { + ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i); + } +#else + (void)i; +#endif + } +}; + +template <> +class UniqueToken<Threads, UniqueTokenScope::Global> { + public: + using execution_space = Threads; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared 
between instances + UniqueToken(execution_space const & = execution_space()) noexcept {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + return Threads::impl_thread_pool_size(); +#else + return 0; +#endif + } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + return Threads::impl_thread_pool_rank(); +#else + return 0; +#endif + } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release(int) const noexcept {} +}; + +} // namespace Experimental +} // namespace Kokkos +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +#endif +#endif /* #define KOKKOS_THREADSEXEC_HPP */ diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp new file mode 100644 index 0000000000000000000000000000000000000000..40a09ed22ab1d6d73b62084549049521e0eb3150 --- /dev/null +++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp @@ -0,0 +1,241 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_THREADS) + +#include <Kokkos_Core_fwd.hpp> +/* Standard 'C' Linux libraries */ + +#include <pthread.h> +#include <sched.h> +#include <errno.h> + +/* Standard C++ libraries */ + +#include <cstdlib> +#include <string> +#include <iostream> +#include <stdexcept> + +#include <Kokkos_Threads.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER; + +// Pthreads compatible driver. 
+// Recovery from an exception would require constant intra-thread health +// verification; which would negatively impact runtime. As such simply +// abort the process. + +void* internal_pthread_driver(void*) { + try { + ThreadsExec::driver(); + } catch (const std::exception& x) { + std::cerr << "Exception thrown from worker thread: " << x.what() + << std::endl; + std::cerr.flush(); + std::abort(); + } catch (...) { + std::cerr << "Exception thrown from worker thread" << std::endl; + std::cerr.flush(); + std::abort(); + } + return nullptr; +} + +} // namespace + +//---------------------------------------------------------------------------- +// Spawn a thread + +bool ThreadsExec::spawn() { + bool result = false; + + pthread_attr_t attr; + + if (0 == pthread_attr_init(&attr) || + 0 == pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM) || + 0 == pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) { + pthread_t pt; + + result = 0 == pthread_create(&pt, &attr, internal_pthread_driver, nullptr); + } + + pthread_attr_destroy(&attr); + + return result; +} + +//---------------------------------------------------------------------------- + +bool ThreadsExec::is_process() { + static const pthread_t master_pid = pthread_self(); + + return pthread_equal(master_pid, pthread_self()); +} + +void ThreadsExec::global_lock() { + pthread_mutex_lock(&host_internal_pthread_mutex); +} + +void ThreadsExec::global_unlock() { + pthread_mutex_unlock(&host_internal_pthread_mutex); +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::wait_yield(volatile int& flag, const int value) { + while (value == flag) { + sched_yield(); + } +} + +} // namespace Impl +} // namespace Kokkos + +/* end #if defined( KOKKOS_ENABLE_THREADS ) */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ENABLE_WINTHREAD) + 
+#include <Kokkos_Core_fwd.hpp> + +/* Windows libraries */ +#include <winsock2.h> +#include <windows.h> +#include <process.h> + +/* Standard C++ libraries */ + +#include <cstdlib> +#include <string> +#include <iostream> +#include <stdexcept> + +#include <Kokkos_Threads.hpp> + +//---------------------------------------------------------------------------- +// Driver for each created pthread + +namespace Kokkos { +namespace Impl { +namespace { + +unsigned WINAPI internal_winthread_driver(void* arg) { + ThreadsExec::driver(); + + return 0; +} + +class ThreadLockWindows { + private: + CRITICAL_SECTION m_handle; + + ~ThreadLockWindows() { DeleteCriticalSection(&m_handle); } + + ThreadLockWindows(); + { InitializeCriticalSection(&m_handle); } + + ThreadLockWindows(const ThreadLockWindows&); + ThreadLockWindows& operator=(const ThreadLockWindows&); + + public: + static ThreadLockWindows& singleton(); + + void lock() { EnterCriticalSection(&m_handle); } + + void unlock() { LeaveCriticalSection(&m_handle); } +}; + +ThreadLockWindows& ThreadLockWindows::singleton() { + static ThreadLockWindows self; + return self; +} + +} // namespace +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// Spawn this thread + +bool ThreadsExec::spawn() { + unsigned Win32ThreadID = 0; + + HANDLE handle = + _beginthreadex(0, 0, internal_winthread_driver, 0, 0, &Win32ThreadID); + + return !handle; +} + +bool ThreadsExec::is_process() { return true; } + +void ThreadsExec::global_lock() { ThreadLockWindows::singleton().lock(); } + +void ThreadsExec::global_unlock() { ThreadLockWindows::singleton().unlock(); } + +void ThreadsExec::wait_yield(volatile int& flag, const int value){} { + while (value == flag) { + Sleep(0); + } +} + +} // namespace Impl +} // namespace Kokkos + +#else +void 
KOKKOS_CORE_SRC_THREADS_EXEC_BASE_PREVENT_LINK_ERROR() {} +#endif /* end #elif defined( KOKKOS_ENABLE_WINTHREAD ) */ diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e0ae43dd87ec337d24f659e3da74a662f31dfb84 --- /dev/null +++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp @@ -0,0 +1,1162 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADSTEAM_HPP +#define KOKKOS_THREADSTEAM_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_THREADS) + +#include <cstdio> + +#include <utility> +#include <impl/Kokkos_Spinwait.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> +#include <impl/Kokkos_HostThreadTeam.hpp> + +#include <Kokkos_Atomic.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template <class> +struct ThreadsExecAdapter; + +//---------------------------------------------------------------------------- + +class ThreadsExecTeamMember { + private: + enum { TEAM_REDUCE_SIZE = 512 }; + + public: + using execution_space = Kokkos::Threads; + using scratch_memory_space = execution_space::scratch_memory_space; + + private: + using space = execution_space::scratch_memory_space; + ThreadsExec* const m_exec; + ThreadsExec* const* m_team_base; ///< Base for team fan-in + space m_team_shared; + int m_team_shared_size; + int m_team_size; + int m_team_rank; + int m_team_rank_rev; + int m_league_size; + int m_league_end; + int m_league_rank; + + int m_chunk_size; + int m_league_chunk_end; + + int m_invalid_thread; + int m_team_alloc; + + inline 
void set_team_shared() { + new (&m_team_shared) + space(((char*)(*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE, + m_team_shared_size); + } + + public: + // Fan-in and wait until the matching fan-out is called. + // The root thread which does not wait will return true. + // All other threads will return false during the fan-out. + KOKKOS_INLINE_FUNCTION bool team_fan_in() const { + int n, j; + + // Wait for fan-in threads + for (n = 1; + (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); + n <<= 1) { + Impl::spinwait_while_equal<int>(m_team_base[j]->state(), + ThreadsExec::Active); + } + + // If not root then wait for release + if (m_team_rank_rev) { + m_exec->state() = ThreadsExec::Rendezvous; + Impl::spinwait_while_equal<int>(m_exec->state(), ThreadsExec::Rendezvous); + } + + return !m_team_rank_rev; + } + + KOKKOS_INLINE_FUNCTION void team_fan_out() const { + int n, j; + for (n = 1; + (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); + n <<= 1) { + m_team_base[j]->state() = ThreadsExec::Active; + } + } + + public: + KOKKOS_INLINE_FUNCTION static int team_reduce_size() { + return TEAM_REDUCE_SIZE; + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_shmem() const { + return m_team_shared.set_team_thread_mode(0, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& team_scratch(int) const { + return m_team_shared.set_team_thread_mode(0, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const execution_space::scratch_memory_space& thread_scratch(int) const { + return m_team_shared.set_team_thread_mode(0, team_size(), team_rank()); + } + + KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; } + KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; } + KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; } + KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; } + + KOKKOS_INLINE_FUNCTION void team_barrier() 
const { + team_fan_in(); + team_fan_out(); + } + + template <class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value, + const int& thread_id) const { +#if !defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + { + (void)value; + (void)thread_id; + } +#else + // Make sure there is enough scratch space: + using type = typename if_c<sizeof(ValueType) < TEAM_REDUCE_SIZE, ValueType, + void>::type; + + if (m_team_base) { + type* const local_value = ((type*)m_team_base[0]->scratch_memory()); + memory_fence(); + team_barrier(); + if (team_rank() == thread_id) *local_value = value; + memory_fence(); + team_barrier(); + value = *local_value; + } +#endif + } + + template <class Closure, class ValueType> + KOKKOS_INLINE_FUNCTION void team_broadcast(Closure const& f, ValueType& value, + const int& thread_id) const { +#if !defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + { + (void)f; + (void)value; + (void)thread_id; + } +#else + // Make sure there is enough scratch space: + using type = typename if_c<sizeof(ValueType) < TEAM_REDUCE_SIZE, ValueType, + void>::type; + f(value); + if (m_team_base) { + type* const local_value = ((type*)m_team_base[0]->scratch_memory()); + memory_fence(); + team_barrier(); + if (team_rank() == thread_id) *local_value = value; + memory_fence(); + team_barrier(); + value = *local_value; + } +#endif + } + + template <typename Type> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<!Kokkos::is_reducer<Type>::value, Type>::type + team_reduce(const Type& value) const +#if !defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + { + return value; + } +#else + { + // Make sure there is enough scratch space: + using type = + typename if_c<sizeof(Type) < TEAM_REDUCE_SIZE, Type, void>::type; + + if (nullptr == m_exec) return value; + + if (team_rank() != team_size() - 1) + *((volatile type*)m_exec->scratch_memory()) = value; + + memory_fence(); + + type& accum = *((type*)m_team_base[0]->scratch_memory()); + + if (team_fan_in()) { + accum 
= value; + for (int i = 1; i < m_team_size; ++i) { + accum += *((type*)m_team_base[i]->scratch_memory()); + } + memory_fence(); + } + + team_fan_out(); + + return accum; + } +#endif + + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<is_reducer<ReducerType>::value>::type + team_reduce(ReducerType const& reducer) const noexcept { + team_reduce(reducer, reducer.reference()); + } + + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type +#if !defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + team_reduce(const ReducerType&, + const typename ReducerType::value_type) const { + } +#else + team_reduce(const ReducerType& reducer, + const typename ReducerType::value_type contribution) const { + using value_type = typename ReducerType::value_type; + // Make sure there is enough scratch space: + using type = typename if_c<sizeof(value_type) < TEAM_REDUCE_SIZE, + value_type, void>::type; + + if (nullptr == m_exec) return; + + type* const local_value = ((type*)m_exec->scratch_memory()); + + // Set this thread's contribution + if (team_rank() != team_size() - 1) *local_value = contribution; + + // Fence to make sure the base team member has access: + memory_fence(); + + if (team_fan_in()) { + // The last thread to synchronize returns true, all other threads wait for + // team_fan_out() + type* const team_value = ((type*)m_team_base[0]->scratch_memory()); + + *team_value = contribution; + // Join to the team value: + for (int i = 1; i < m_team_size; ++i) { + reducer.join(*team_value, *((type*)m_team_base[i]->scratch_memory())); + } + + // Team base thread may "lap" member threads so copy out to their local + // value. 
+ for (int i = 1; i < m_team_size; ++i) { + *((type*)m_team_base[i]->scratch_memory()) = *team_value; + } + + // Fence to make sure all team members have access + memory_fence(); + } + + team_fan_out(); + + // Value was changed by the team base + reducer.reference() = *((type volatile const*)local_value); + } +#endif + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template <typename ArgType> + KOKKOS_INLINE_FUNCTION ArgType team_scan(const ArgType& value, + ArgType* const global_accum) const +#if !defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + { + (void)global_accum; + return value; + } +#else + { + // Make sure there is enough scratch space: + using type = + typename if_c<sizeof(ArgType) < TEAM_REDUCE_SIZE, ArgType, void>::type; + + if (nullptr == m_exec) return type(0); + + volatile type* const work_value = ((type*)m_exec->scratch_memory()); + + *work_value = value; + + memory_fence(); + + if (team_fan_in()) { + // The last thread to synchronize returns true, all other threads wait for + // team_fan_out() m_team_base[0] == highest ranking team + // member m_team_base[ m_team_size - 1 ] == lowest ranking team member + // + // 1) copy from lower to higher rank, initialize lowest rank to zero + // 2) prefix sum from lowest to highest rank, skipping lowest rank + + type accum = 0; + + if (global_accum) { + for (int i = m_team_size; i--;) { + type& val = *((type*)m_team_base[i]->scratch_memory()); + accum += val; + } + accum = atomic_fetch_add(global_accum, accum); + } + + for (int i = m_team_size; i--;) { + type& val = *((type*)m_team_base[i]->scratch_memory()); + const type 
offset = accum; + accum += val; + val = offset; + } + + memory_fence(); + } + + team_fan_out(); + + return *work_value; + } +#endif + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. + * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template <typename ArgType> + KOKKOS_INLINE_FUNCTION ArgType team_scan(const ArgType& value) const { + return this->template team_scan<ArgType>(value, nullptr); + } + + //---------------------------------------- + // Private for the driver + + template <class... Properties> + ThreadsExecTeamMember( + Impl::ThreadsExec* exec, + const TeamPolicyInternal<Kokkos::Threads, Properties...>& team, + const int shared_size) + : m_exec(exec), + m_team_base(nullptr), + m_team_shared(nullptr, 0), + m_team_shared_size(shared_size), + m_team_size(team.team_size()), + m_team_rank(0), + m_team_rank_rev(0), + m_league_size(0), + m_league_end(0), + m_league_rank(0), + m_chunk_size(team.chunk_size()), + m_league_chunk_end(0), + m_team_alloc(team.team_alloc()) { + if (team.league_size()) { + // Execution is using device-team interface: + + const int pool_rank_rev = m_exec->pool_size() - (m_exec->pool_rank() + 1); + const int team_rank_rev = pool_rank_rev % team.team_alloc(); + const size_t pool_league_size = m_exec->pool_size() / team.team_alloc(); + const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc(); + if (pool_league_rank_rev >= pool_league_size) { + m_invalid_thread = 1; + return; + } + const size_t pool_league_rank = + pool_league_size - (pool_league_rank_rev + 1); + + const int pool_num_teams = m_exec->pool_size() / team.team_alloc(); + const int chunk_size = + team.chunk_size() > 0 ? 
team.chunk_size() : team.team_iter(); + const int chunks_per_team = + (team.league_size() + chunk_size * pool_num_teams - 1) / + (chunk_size * pool_num_teams); + int league_iter_end = team.league_size() - + pool_league_rank_rev * chunks_per_team * chunk_size; + int league_iter_begin = league_iter_end - chunks_per_team * chunk_size; + if (league_iter_begin < 0) league_iter_begin = 0; + if (league_iter_end > team.league_size()) + league_iter_end = team.league_size(); + + if ((team.team_alloc() > m_team_size) + ? (team_rank_rev >= m_team_size) + : (m_exec->pool_size() - pool_num_teams * m_team_size > + m_exec->pool_rank())) + m_invalid_thread = 1; + else + m_invalid_thread = 0; + + // May be using fewer threads per team than a multiple of threads per + // core, some threads will idle. + + if (team_rank_rev < team.team_size() && !m_invalid_thread) { + m_team_base = + m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev; + m_team_size = team.team_size(); + m_team_rank = team.team_size() - (team_rank_rev + 1); + m_team_rank_rev = team_rank_rev; + m_league_size = team.league_size(); + + m_league_rank = + (team.league_size() * pool_league_rank) / pool_league_size; + m_league_end = + (team.league_size() * (pool_league_rank + 1)) / pool_league_size; + + set_team_shared(); + } + + if ((m_team_rank_rev == 0) && (m_invalid_thread == 0)) { + m_exec->set_work_range(m_league_rank, m_league_end, m_chunk_size); + m_exec->reset_steal_target(m_team_size); + } + if (std::is_same<typename TeamPolicyInternal< + Kokkos::Threads, Properties...>::schedule_type::type, + Kokkos::Dynamic>::value) { + m_exec->barrier(); + } + } else { + m_invalid_thread = 1; + } + } + + ThreadsExecTeamMember() + : m_exec(nullptr), + m_team_base(nullptr), + m_team_shared(nullptr, 0), + m_team_shared_size(0), + m_team_size(1), + m_team_rank(0), + m_team_rank_rev(0), + m_league_size(1), + m_league_end(0), + m_league_rank(0), + m_chunk_size(0), + m_league_chunk_end(0), + m_invalid_thread(0), + 
m_team_alloc(0) {} + + inline ThreadsExec& threads_exec_team_base() const { + return m_team_base ? **m_team_base : *m_exec; + } + + bool valid_static() const { return m_league_rank < m_league_end; } + + void next_static() { + if (m_league_rank < m_league_end) { + // Make sure all stores are complete before entering the barrier + memory_fence(); + team_barrier(); + set_team_shared(); + } + m_league_rank++; + } + + bool valid_dynamic() { + if (m_invalid_thread) return false; + if ((m_league_rank < m_league_chunk_end) && + (m_league_rank < m_league_size)) { + return true; + } + + if (m_team_rank_rev == 0) { + m_team_base[0]->get_work_index(m_team_alloc); + } + team_barrier(); + + long work_index = m_team_base[0]->team_work_index(); + + m_league_rank = work_index * m_chunk_size; + m_league_chunk_end = (work_index + 1) * m_chunk_size; + + if (m_league_chunk_end > m_league_size) m_league_chunk_end = m_league_size; + + if ((m_league_rank >= 0) && (m_league_rank < m_league_chunk_end)) + return true; + return false; + } + + void next_dynamic() { + if (m_invalid_thread) return; + + if (m_league_rank < m_league_chunk_end) { + // Make sure all stores are complete before entering the barrier + memory_fence(); + team_barrier(); + set_team_shared(); + } + m_league_rank++; + } + + void set_league_shmem(const int arg_league_rank, const int arg_league_size, + const int arg_shmem_size) { + m_league_rank = arg_league_rank; + m_league_size = arg_league_size; + m_team_shared_size = arg_shmem_size; + set_team_shared(); + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +template <class... 
Properties>
+class TeamPolicyInternal<Kokkos::Threads, Properties...>
+    : public PolicyTraits<Properties...> {
+ private:
+  int m_league_size;
+  int m_team_size;
+  int m_team_alloc;
+  int m_team_iter;
+
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+
+  int m_chunk_size;
+
+  bool m_tune_team_size;
+  bool m_tune_vector_length;
+
+  inline void init(const int league_size_request, const int team_size_request) {
+    const int pool_size = traits::execution_space::impl_thread_pool_size(0);
+    const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
+    const int team_max =
+        pool_size < max_host_team_size ? pool_size : max_host_team_size;
+    const int team_grain = traits::execution_space::impl_thread_pool_size(2);
+
+    m_league_size = league_size_request;
+
+    if (team_size_request > team_max)
+      Kokkos::abort("Kokkos::abort: Requested Team Size is too large!");
+
+    m_team_size = team_size_request < team_max ? team_size_request : team_max;
+
+    // Round team size up to a multiple of 'team_grain'
+    const int team_size_grain =
+        (m_team_size + team_grain - 1 <= 0)
+            ? 1
+            : team_grain * ((m_team_size + team_grain - 1) / team_grain);
+    const int team_count = pool_size / team_size_grain;
+
+    // Constraint : pool_size = m_team_alloc * team_count
+    m_team_alloc = pool_size / team_count;
+
+    // Maximum number of iterations each team will take:
+    m_team_iter = (m_league_size + team_count - 1) / team_count;
+
+    set_auto_chunk_size();
+  }
+
+ public:
+  //! Tag this class as a kokkos execution policy
+  //! Tag this class as a kokkos execution policy
+  using execution_policy = TeamPolicyInternal;
+
+  using traits = PolicyTraits<Properties...>;
+
+  const typename traits::execution_space& space() const {
+    static typename traits::execution_space m_space;
+    return m_space;
+  }
+
+  template <class ExecSpace, class... OtherProperties>
+  friend class TeamPolicyInternal;
+
+  template <class...
OtherProperties> + TeamPolicyInternal( + const TeamPolicyInternal<Kokkos::Threads, OtherProperties...>& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_team_alloc = p.m_team_alloc; + m_team_iter = p.m_team_iter; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; + } + + //---------------------------------------- + + template <class FunctorType> + int team_size_max(const FunctorType&, const ParallelForTag&) const { + int pool_size = traits::execution_space::impl_thread_pool_size(1); + int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + return pool_size < max_host_team_size ? pool_size : max_host_team_size; + } + template <class FunctorType> + int team_size_max(const FunctorType&, const ParallelReduceTag&) const { + int pool_size = traits::execution_space::impl_thread_pool_size(1); + int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + return pool_size < max_host_team_size ? 
pool_size : max_host_team_size; + } + template <class FunctorType, class ReducerType> + inline int team_size_max(const FunctorType& f, const ReducerType&, + const ParallelReduceTag& t) const { + return team_size_max(f, t); + } + template <class FunctorType> + int team_size_recommended(const FunctorType&, const ParallelForTag&) const { + return traits::execution_space::impl_thread_pool_size(2); + } + template <class FunctorType> + int team_size_recommended(const FunctorType&, + const ParallelReduceTag&) const { + return traits::execution_space::impl_thread_pool_size(2); + } + template <class FunctorType, class ReducerType> + inline int team_size_recommended(const FunctorType& f, const ReducerType&, + const ParallelReduceTag& t) const { + return team_size_recommended(f, t); + } + + inline static int vector_length_max() { + return 1024; + } // Use arbitrary large number, is meant as a vectorizable length + + inline static int scratch_size_max(int level) { + return (level == 0 ? 1024 * 32 : // Roughly L1 size + 20 * 1024 * 1024); // Limit to keep compatibility with CUDA + } + + //---------------------------------------- + + inline int team_size() const { return m_team_size; } + inline int impl_vector_length() const { return 1; } + inline int team_alloc() const { return m_team_alloc; } + inline int league_size() const { return m_league_size; } + + inline bool impl_auto_team_size() const { return m_tune_team_size; } + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline void impl_set_team_size(size_t size) { init(m_league_size, size); } + inline void impl_set_vector_length(size_t /**size*/) {} + inline size_t scratch_size(const int& level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + inline int team_iter() const { return m_team_iter; } + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const 
typename traits::execution_space&, + int league_size_request, int team_size_request, + int vector_length_request = 1) + : m_league_size(0), + m_team_size(0), + m_team_alloc(0), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(0), + m_tune_team_size(false), + m_tune_vector_length(false) { + init(league_size_request, team_size_request); + (void)vector_length_request; + } + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(const typename traits::execution_space& space, + int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : TeamPolicyInternal(space, league_size_request, -1, -1) {} + + /** \brief Specify league size, request team size*/ + TeamPolicyInternal(const typename traits::execution_space& space, + int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int vector_length_request) + : TeamPolicyInternal(space, league_size_request, -1, + vector_length_request) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(const typename traits::execution_space& space, + int league_size_request, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */) + : TeamPolicyInternal(space, league_size_request, team_size_request, -1) {} + + TeamPolicyInternal(int league_size_request, int team_size_request, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), + league_size_request, team_size_request, + vector_length_request) {} + + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* team_size_request */ + , + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), + league_size_request, -1, vector_length_request) {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(int league_size_request, + const Kokkos::AUTO_t& /* 
team_size_request */ + , + const Kokkos::AUTO_t& /* vector_length_request */) + : TeamPolicyInternal(typename traits::execution_space(), + league_size_request, -1, -1) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(int league_size_request, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */) + : TeamPolicyInternal(typename traits::execution_space(), + league_size_request, team_size_request, -1) {} + + inline int chunk_size() const { return m_chunk_size; } + + /** \brief set chunk_size to a discrete value*/ + inline TeamPolicyInternal& set_chunk_size( + typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size(const int& level, + const PerTeamValue& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerThreadValue& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + /** \brief set per thread and per team scratch size for a specific level of + * the scratch hierarchy */ + inline TeamPolicyInternal& set_scratch_size( + const int& level, const PerTeamValue& per_team, + const PerThreadValue& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + private: + /** \brief finalize chunk_size if it was set to AUTO*/ + inline void set_auto_chunk_size() { + int64_t concurrency = traits::execution_space::concurrency() / m_team_alloc; + if (concurrency == 0) concurrency = 1; + + if (m_chunk_size > 0) { + if (!Impl::is_integral_power_of_two(m_chunk_size)) + Kokkos::abort("TeamPolicy blocking granularity must be power of two"); 
+ } + + int new_chunk_size = 1; + while (new_chunk_size * 100 * concurrency < m_league_size) + new_chunk_size *= 2; + if (new_chunk_size < 128) { + new_chunk_size = 1; + while ((new_chunk_size * 40 * concurrency < m_league_size) && + (new_chunk_size < 128)) + new_chunk_size *= 2; + } + m_chunk_size = new_chunk_size; + } + + public: + using member_type = Impl::ThreadsExecTeamMember; + + friend class Impl::ThreadsExecTeamMember; +}; + +} /*namespace Impl */ +} /* namespace Kokkos */ + +namespace Kokkos { + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::TeamThreadRangeBoundariesStruct<iType, Impl::ThreadsExecTeamMember> + TeamThreadRange(const Impl::ThreadsExecTeamMember& thread, + const iType& count) { + return Impl::TeamThreadRangeBoundariesStruct<iType, + Impl::ThreadsExecTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, + Impl::ThreadsExecTeamMember> +TeamThreadRange(const Impl::ThreadsExecTeamMember& thread, const iType1& begin, + const iType2& end) { + using iType = typename std::common_type<iType1, iType2>::type; + return Impl::TeamThreadRangeBoundariesStruct<iType, + Impl::ThreadsExecTeamMember>( + thread, iType(begin), iType(end)); +} + +template <typename iType> +KOKKOS_INLINE_FUNCTION + Impl::TeamThreadRangeBoundariesStruct<iType, Impl::ThreadsExecTeamMember> + TeamVectorRange(const Impl::ThreadsExecTeamMember& thread, + const iType& count) { + return Impl::TeamThreadRangeBoundariesStruct<iType, + Impl::ThreadsExecTeamMember>( + thread, count); +} + +template <typename iType1, typename iType2> +KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< + typename std::common_type<iType1, iType2>::type, + Impl::ThreadsExecTeamMember> +TeamVectorRange(const Impl::ThreadsExecTeamMember& thread, const iType1& begin, + const iType2& end) { + using iType = typename std::common_type<iType1, iType2>::type; + 
return Impl::TeamThreadRangeBoundariesStruct<iType,
+                                               Impl::ThreadsExecTeamMember>(
+      thread, iType(begin), iType(end));
+}
+
+template <typename iType>
+KOKKOS_INLINE_FUNCTION
+    Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::ThreadsExecTeamMember>
+    ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread,
+                      const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,
+                                                 Impl::ThreadsExecTeamMember>(
+      thread, count);
+}
+
+template <typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type,
+    Impl::ThreadsExecTeamMember>
+ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread,
+                  const iType1& arg_begin, const iType2& arg_end) {
+  using iType = typename std::common_type<iType1, iType2>::type;
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,
+                                                 Impl::ThreadsExecTeamMember>(
+      thread, iType(arg_begin), iType(arg_end));
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember> PerTeam(
+    const Impl::ThreadsExecTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember> PerThread(
+    const Impl::ThreadsExecTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>(thread);
+}
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ */
+template <typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::TeamThreadRangeBoundariesStruct<
+        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+    const Lambda& lambda) {
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief Inter-thread vector parallel_reduce.
Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team
+ * and a summation of val is performed and put into result.
+ */
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
+    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+                    const Lambda& lambda, ValueType& result) {
+  ValueType intermediate;
+  Sum<ValueType> sum(intermediate);
+  sum.init(intermediate);
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i, tmp);
+    intermediate += tmp;
+  }
+
+  loop_boundaries.thread.team_reduce(sum, intermediate);
+  result = sum.reference();
+}
+
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+    parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<
+                        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+                    const Lambda& lambda, const ReducerType& reducer) {
+  typename ReducerType::value_type value;
+  reducer.init(value);
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, value);
+  }
+
+  loop_boundaries.thread.team_reduce(reducer, value);
+}
+
+}  // namespace Kokkos
+
+namespace Kokkos {
+/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each
+ * i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
+ */
+template <typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    const Impl::ThreadVectorRangeBoundariesStruct<
+        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+    const Lambda& lambda) {
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i,
+ * ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread
+ * and a summation of val is performed and put into result.
+ */
+template <typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<!Kokkos::is_reducer<ValueType>::value>::type
+    parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+                    const Lambda& lambda, ValueType& result) {
+  result = ValueType();
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, result);
+  }
+}
+
+template <typename iType, class Lambda, typename ReducerType>
+KOKKOS_INLINE_FUNCTION
+    typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type
+    parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<
+                        iType, Impl::ThreadsExecTeamMember>& loop_boundaries,
+                    const Lambda& lambda, const ReducerType& reducer) {
+  reducer.init(reducer.reference());
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, reducer.reference());
+  }
+}
+
+/** \brief Inter-thread parallel exclusive prefix sum. Executes
+ * lambda(iType i, ValueType & val, bool final) for each i=0..N-1.
+ * + */ +template <typename iType, class FunctorType> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::TeamThreadRangeBoundariesStruct< + iType, Impl::ThreadsExecTeamMember>& loop_bounds, + const FunctorType& lambda) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, + FunctorType>::value_type; + + auto scan_val = value_type{}; + + // Intra-member scan +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_bounds.start; i < loop_bounds.end; + i += loop_bounds.increment) { + lambda(i, scan_val, false); + } + + // 'scan_val' output is the exclusive prefix sum + scan_val = loop_bounds.thread.team_scan(scan_val); + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_bounds.start; i < loop_bounds.end; + i += loop_bounds.increment) { + lambda(i, scan_val, true); + } +} + +/** \brief Intra-thread vector parallel exclusive prefix sum. Executes + * lambda(iType i, ValueType & val, bool final) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan + * operation is performed. Depending on the target execution space the operator + * might be called twice: once with final=false and once with final=true. When + * final==true val contains the prefix sum value. The contribution of this "i" + * needs to be added to val no matter whether final==true or not. In a serial + * execution (i.e. team_size==1) the operator is only called once with + * final==true. Scan_val will be set to the final sum value over all vector + * lanes. 
+ */ +template <typename iType, class FunctorType> +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::ThreadsExecTeamMember>& loop_boundaries, + const FunctorType& lambda) { + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>; + using value_type = typename ValueTraits::value_type; + + value_type scan_val = value_type(); + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, scan_val, true); + } +} + +/** \brief Intra-thread vector parallel scan with reducer + * + */ +template <typename iType, class FunctorType, typename ReducerType> +KOKKOS_INLINE_FUNCTION + typename std::enable_if<Kokkos::is_reducer<ReducerType>::value>::type + parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::ThreadsExecTeamMember>& loop_boundaries, + const FunctorType& lambda, const ReducerType& reducer) { + typename ReducerType::value_type scan_val; + reducer.init(scan_val); + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif + for (iType i = loop_boundaries.start; i < loop_boundaries.end; + i += loop_boundaries.increment) { + lambda(i, scan_val, true); + } +} + +} // namespace Kokkos + +namespace Kokkos { + +template <class FunctorType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct< + Impl::ThreadsExecTeamMember>& /*single_struct*/, + const FunctorType& lambda) { + lambda(); +} + +template <class FunctorType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, + const FunctorType& lambda) { + if (single_struct.team_member.team_rank() == 0) lambda(); +} + +template <class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::VectorSingleStruct< + Impl::ThreadsExecTeamMember>& /*single_struct*/, + const FunctorType& lambda, ValueType& val) { + lambda(val); 
+} + +template <class FunctorType, class ValueType> +KOKKOS_INLINE_FUNCTION void single( + const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, + const FunctorType& lambda, ValueType& val) { + if (single_struct.team_member.team_rank() == 0) { + lambda(val); + } + single_struct.team_member.team_broadcast(val, 0); +} +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +#endif +#endif /* #define KOKKOS_THREADSTEAM_HPP */ diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c08615188f68be6de52a2c66e9c717fffb012606 --- /dev/null +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp @@ -0,0 +1,1014 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_PARALLEL_HPP +#define KOKKOS_THREADS_PARALLEL_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_THREADS) + +#include <Kokkos_Parallel.hpp> + +#include <impl/Kokkos_FunctorAdapter.hpp> + +#include <KokkosExp_MDRangePolicy.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +/* ParallelFor Kokkos::Threads with RangePolicy */ + +template <class FunctorType, class... 
Traits> +class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Threads> { + private: + using Policy = Kokkos::RangePolicy<Traits...>; + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const FunctorType &functor, const Member ibeg, + const Member iend) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(i); + } + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const FunctorType &functor, const Member ibeg, + const Member iend) { + const TagType t{}; +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(t, i); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + exec_schedule<typename Policy::schedule_type::type>(exec, arg); + } + + template <class Schedule> + static typename std::enable_if< + std::is_same<Schedule, Kokkos::Static>::value>::type + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + ParallelFor::template exec_range<WorkTag>(self.m_functor, range.begin(), + range.end()); + + exec.fan_in(); + } + + template <class Schedule> + static typename std::enable_if< + std::is_same<Schedule, Kokkos::Dynamic>::value>::type + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + 
exec.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + + while (work_index != -1) { + const Member begin = + static_cast<Member>(work_index) * self.m_policy.chunk_size() + + self.m_policy.begin(); + const Member end = + begin + self.m_policy.chunk_size() < self.m_policy.end() + ? begin + self.m_policy.chunk_size() + : self.m_policy.end(); + ParallelFor::template exec_range<WorkTag>(self.m_functor, begin, end); + work_index = exec.get_work_index(); + } + + exec.fan_in(); + } + + public: + inline void execute() const { + ThreadsExec::start(&ParallelFor::exec, this); + ThreadsExec::fence(); + } + + ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +// MDRangePolicy impl +template <class FunctorType, class... Traits> +class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, + Kokkos::Threads> { + private: + using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>; + using Policy = typename MDRangePolicy::impl_range_policy; + + using WorkTag = typename MDRangePolicy::work_tag; + + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using iterate_type = typename Kokkos::Impl::HostIterateTile< + MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void>; + + const FunctorType m_functor; + const MDRangePolicy m_mdr_policy; + const Policy m_policy; // construct as RangePolicy( 0, num_tiles + // ).set_chunk_size(1) in ctor + + inline static void exec_range(const MDRangePolicy &mdr_policy, + const FunctorType &functor, const Member ibeg, + const Member iend) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + iterate_type(mdr_policy, functor)(i); + } + } + + static 
void exec(ThreadsExec &exec, const void *arg) { + exec_schedule<typename Policy::schedule_type::type>(exec, arg); + } + + template <class Schedule> + static typename std::enable_if< + std::is_same<Schedule, Kokkos::Static>::value>::type + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + ParallelFor::exec_range(self.m_mdr_policy, self.m_functor, range.begin(), + range.end()); + + exec.fan_in(); + } + + template <class Schedule> + static typename std::enable_if< + std::is_same<Schedule, Kokkos::Dynamic>::value>::type + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + exec.set_work_range(range.begin(), range.end(), self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + + while (work_index != -1) { + const Member begin = + static_cast<Member>(work_index) * self.m_policy.chunk_size(); + const Member end = + begin + self.m_policy.chunk_size() < self.m_policy.end() + ? begin + self.m_policy.chunk_size() + : self.m_policy.end(); + + ParallelFor::exec_range(self.m_mdr_policy, self.m_functor, begin, end); + work_index = exec.get_work_index(); + } + + exec.fan_in(); + } + + public: + inline void execute() const { + ThreadsExec::start(&ParallelFor::exec, this); + ThreadsExec::fence(); + } + + ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) + : m_functor(arg_functor), + m_mdr_policy(arg_policy), + m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {} +}; + +//---------------------------------------------------------------------------- +/* ParallelFor Kokkos::Threads with TeamPolicy */ + +template <class FunctorType, class... 
Properties> +class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, + Kokkos::Threads> { + private: + using Policy = + Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + + const FunctorType m_functor; + const Policy m_policy; + const int m_shared; + + template <class TagType, class Schedule> + inline static typename std::enable_if< + std::is_same<TagType, void>::value && + std::is_same<Schedule, Kokkos::Static>::value>::type + exec_team(const FunctorType &functor, Member member) { + for (; member.valid_static(); member.next_static()) { + functor(member); + } + } + + template <class TagType, class Schedule> + inline static typename std::enable_if< + !std::is_same<TagType, void>::value && + std::is_same<Schedule, Kokkos::Static>::value>::type + exec_team(const FunctorType &functor, Member member) { + const TagType t{}; + for (; member.valid_static(); member.next_static()) { + functor(t, member); + } + } + + template <class TagType, class Schedule> + inline static typename std::enable_if< + std::is_same<TagType, void>::value && + std::is_same<Schedule, Kokkos::Dynamic>::value>::type + exec_team(const FunctorType &functor, Member member) { + for (; member.valid_dynamic(); member.next_dynamic()) { + functor(member); + } + } + + template <class TagType, class Schedule> + inline static typename std::enable_if< + !std::is_same<TagType, void>::value && + std::is_same<Schedule, Kokkos::Dynamic>::value>::type + exec_team(const FunctorType &functor, Member member) { + const TagType t{}; + for (; member.valid_dynamic(); member.next_dynamic()) { + functor(t, member); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + const ParallelFor &self = *((const ParallelFor *)arg); + + ParallelFor::exec_team<WorkTag, typename Policy::schedule_type::type>( + self.m_functor, Member(&exec, self.m_policy, self.m_shared)); + + exec.barrier(); + exec.fan_in(); 
+ } + template <typename Policy> + Policy fix_policy(Policy policy) { + if (policy.impl_vector_length() < 0) { + policy.impl_set_vector_length(1); + } + if (policy.team_size() < 0) { + policy.impl_set_team_size( + policy.team_size_recommended(m_functor, ParallelForTag{})); + } + return policy; + } + + public: + inline void execute() const { + ThreadsExec::resize_scratch( + 0, Policy::member_type::team_reduce_size() + m_shared); + + ThreadsExec::start(&ParallelFor::exec, this); + + ThreadsExec::fence(); + } + + ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) + : m_functor(arg_functor), + m_policy(fix_policy(arg_policy)), + m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + arg_functor, m_policy.team_size())) {} +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +/* ParallelReduce with Kokkos::Threads and RangePolicy */ + +template <class FunctorType, class ReducerType, class... 
Traits> +class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType, + Kokkos::Threads> { + private: + using Policy = Kokkos::RangePolicy<Traits...>; + + using WorkTag = typename Policy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>::type; + + using ValueTraits = + Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>; + + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const FunctorType &functor, const Member &ibeg, + const Member &iend, reference_type update) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(i, update); + } + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const FunctorType &functor, const Member &ibeg, + const Member &iend, reference_type update) { + const TagType t{}; +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(t, i, update); + } + } + + static void exec(ThreadsExec 
&exec, const void *arg) { + exec_schedule<typename Policy::schedule_type::type>(exec, arg); + } + + template <class Schedule> + static typename std::enable_if< + std::is_same<Schedule, Kokkos::Static>::value>::type + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelReduce &self = *((const ParallelReduce *)arg); + const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + ParallelReduce::template exec_range<WorkTag>( + self.m_functor, range.begin(), range.end(), + ValueInit::init( + ReducerConditional::select(self.m_functor, self.m_reducer), + exec.reduce_memory())); + + exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>( + ReducerConditional::select(self.m_functor, self.m_reducer)); + } + + template <class Schedule> + static typename std::enable_if< + std::is_same<Schedule, Kokkos::Dynamic>::value>::type + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelReduce &self = *((const ParallelReduce *)arg); + const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + exec.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + reference_type update = ValueInit::init( + ReducerConditional::select(self.m_functor, self.m_reducer), + exec.reduce_memory()); + while (work_index != -1) { + const Member begin = + static_cast<Member>(work_index) * self.m_policy.chunk_size() + + self.m_policy.begin(); + const Member end = + begin + self.m_policy.chunk_size() < self.m_policy.end() + ? 
begin + self.m_policy.chunk_size() + : self.m_policy.end(); + ParallelReduce::template exec_range<WorkTag>(self.m_functor, begin, end, + update); + work_index = exec.get_work_index(); + } + + exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>( + ReducerConditional::select(self.m_functor, self.m_reducer)); + } + + public: + inline void execute() const { + if (m_policy.end() <= m_policy.begin()) { + if (m_result_ptr) { + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + ValueFinal::final(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + } + } else { + ThreadsExec::resize_scratch( + ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)), + 0); + + ThreadsExec::start(&ParallelReduce::exec, this); + + ThreadsExec::fence(); + + if (m_result_ptr) { + const pointer_type data = + (pointer_type)ThreadsExec::root_reduce_scratch(); + + const unsigned n = ValueTraits::value_count( + ReducerConditional::select(m_functor, m_reducer)); + for (unsigned i = 0; i < n; ++i) { + m_result_ptr[i] = data[i]; + } + } + } + } + + template <class HostViewType> + ParallelReduce( + const FunctorType &arg_functor, const Policy &arg_policy, + const HostViewType &arg_result_view, + typename std::enable_if<Kokkos::is_view<HostViewType>::value && + !Kokkos::is_reducer_type<ReducerType>::value, + void *>::type = nullptr) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(InvalidType()), + m_result_ptr(arg_result_view.data()) { + static_assert(Kokkos::is_view<HostViewType>::value, + "Kokkos::Threads reduce result must be a View"); + + static_assert( + std::is_same<typename HostViewType::memory_space, HostSpace>::value, + "Kokkos::Threads reduce result must be a View in HostSpace"); + } + + inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy, + const ReducerType &reducer) + : m_functor(arg_functor), + m_policy(arg_policy), + m_reducer(reducer), + m_result_ptr(reducer.view().data()) { + 
/*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" + );*/ + } +}; + +// MDRangePolicy impl +template <class FunctorType, class ReducerType, class... Traits> +class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType, + Kokkos::Threads> { + private: + using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>; + using Policy = typename MDRangePolicy::impl_range_policy; + + using WorkTag = typename MDRangePolicy::work_tag; + using WorkRange = typename Policy::WorkRange; + using Member = typename Policy::member_type; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>::type; + + using ValueTraits = + Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + + using pointer_type = typename ValueTraits::pointer_type; + using value_type = typename ValueTraits::value_type; + using reference_type = typename ValueTraits::reference_type; + + using iterate_type = + typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType, + WorkTag, reference_type>; + + const FunctorType m_functor; + const MDRangePolicy m_mdr_policy; + const Policy m_policy; // construct as RangePolicy( 0, num_tiles + // ).set_chunk_size(1) in ctor + const ReducerType m_reducer; + const pointer_type m_result_ptr; + + inline static void exec_range(const MDRangePolicy &mdr_policy, + const FunctorType &functor, const Member &ibeg, + const Member &iend, reference_type update) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + 
iterate_type(mdr_policy, functor, update)(i); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + exec_schedule<typename Policy::schedule_type::type>(exec, arg); + } + + template <class Schedule> + static typename std::enable_if< + std::is_same<Schedule, Kokkos::Static>::value>::type + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelReduce &self = *((const ParallelReduce *)arg); + const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + ParallelReduce::exec_range( + self.m_mdr_policy, self.m_functor, range.begin(), range.end(), + ValueInit::init( + ReducerConditional::select(self.m_functor, self.m_reducer), + exec.reduce_memory())); + + exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>( + ReducerConditional::select(self.m_functor, self.m_reducer)); + } + + template <class Schedule> + static typename std::enable_if< + std::is_same<Schedule, Kokkos::Dynamic>::value>::type + exec_schedule(ThreadsExec &exec, const void *arg) { + const ParallelReduce &self = *((const ParallelReduce *)arg); + const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + exec.set_work_range(range.begin(), range.end(), self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + reference_type update = ValueInit::init( + ReducerConditional::select(self.m_functor, self.m_reducer), + exec.reduce_memory()); + while (work_index != -1) { + const Member begin = + static_cast<Member>(work_index) * self.m_policy.chunk_size(); + const Member end = + begin + self.m_policy.chunk_size() < self.m_policy.end() + ? 
begin + self.m_policy.chunk_size() + : self.m_policy.end(); + ParallelReduce::exec_range(self.m_mdr_policy, self.m_functor, begin, end, + update); + work_index = exec.get_work_index(); + } + + exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>( + ReducerConditional::select(self.m_functor, self.m_reducer)); + } + + public: + inline void execute() const { + ThreadsExec::resize_scratch( + ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)), + 0); + + ThreadsExec::start(&ParallelReduce::exec, this); + + ThreadsExec::fence(); + + if (m_result_ptr) { + const pointer_type data = + (pointer_type)ThreadsExec::root_reduce_scratch(); + + const unsigned n = ValueTraits::value_count( + ReducerConditional::select(m_functor, m_reducer)); + for (unsigned i = 0; i < n; ++i) { + m_result_ptr[i] = data[i]; + } + } + } + + template <class HostViewType> + ParallelReduce( + const FunctorType &arg_functor, const MDRangePolicy &arg_policy, + const HostViewType &arg_result_view, + typename std::enable_if<Kokkos::is_view<HostViewType>::value && + !Kokkos::is_reducer_type<ReducerType>::value, + void *>::type = nullptr) + : m_functor(arg_functor), + m_mdr_policy(arg_policy), + m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)), + m_reducer(InvalidType()), + m_result_ptr(arg_result_view.data()) { + static_assert(Kokkos::is_view<HostViewType>::value, + "Kokkos::Threads reduce result must be a View"); + + static_assert( + std::is_same<typename HostViewType::memory_space, HostSpace>::value, + "Kokkos::Threads reduce result must be a View in HostSpace"); + } + + inline ParallelReduce(const FunctorType &arg_functor, + MDRangePolicy arg_policy, const ReducerType &reducer) + : m_functor(arg_functor), + m_mdr_policy(arg_policy), + m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)), + m_reducer(reducer), + m_result_ptr(reducer.view().data()) { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , 
"Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" + );*/ + } +}; + +//---------------------------------------------------------------------------- +/* ParallelReduce with Kokkos::Threads and TeamPolicy */ + +template <class FunctorType, class ReducerType, class... Properties> +class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>, + ReducerType, Kokkos::Threads> { + private: + using Policy = + Kokkos::Impl::TeamPolicyInternal<Kokkos::Threads, Properties...>; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + + using ReducerConditional = + Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + FunctorType, ReducerType>; + using ReducerTypeFwd = typename ReducerConditional::type; + using WorkTagFwd = + typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value, + WorkTag, void>::type; + + using ValueTraits = + Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>; + using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>; + using ValueFinal = Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>; + + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const int m_shared; + + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_team(const FunctorType &functor, Member member, + reference_type update) { + for (; member.valid_static(); member.next_static()) { + functor(member, update); + } + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_team(const FunctorType &functor, Member member, + reference_type update) { + const TagType t{}; + for (; member.valid_static(); member.next_static()) { + functor(t, member, 
update); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + const ParallelReduce &self = *((const ParallelReduce *)arg); + + ParallelReduce::template exec_team<WorkTag>( + self.m_functor, Member(&exec, self.m_policy, self.m_shared), + ValueInit::init( + ReducerConditional::select(self.m_functor, self.m_reducer), + exec.reduce_memory())); + + exec.template fan_in_reduce<ReducerTypeFwd, WorkTagFwd>( + ReducerConditional::select(self.m_functor, self.m_reducer)); + } + + public: + inline void execute() const { + if (m_policy.league_size() * m_policy.team_size() == 0) { + if (m_result_ptr) { + ValueInit::init(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + ValueFinal::final(ReducerConditional::select(m_functor, m_reducer), + m_result_ptr); + } + } else { + ThreadsExec::resize_scratch( + ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)), + Policy::member_type::team_reduce_size() + m_shared); + + ThreadsExec::start(&ParallelReduce::exec, this); + + ThreadsExec::fence(); + + if (m_result_ptr) { + const pointer_type data = + (pointer_type)ThreadsExec::root_reduce_scratch(); + + const unsigned n = ValueTraits::value_count( + ReducerConditional::select(m_functor, m_reducer)); + for (unsigned i = 0; i < n; ++i) { + m_result_ptr[i] = data[i]; + } + } + } + } + + template <typename Policy> + Policy fix_policy(Policy policy) { + if (policy.impl_vector_length() < 0) { + policy.impl_set_vector_length(1); + } + if (policy.team_size() < 0) { + policy.impl_set_team_size(policy.team_size_recommended( + m_functor, m_reducer, ParallelReduceTag{})); + } + return policy; + } + + template <class ViewType> + inline ParallelReduce( + const FunctorType &arg_functor, const Policy &arg_policy, + const ViewType &arg_result, + typename std::enable_if<Kokkos::is_view<ViewType>::value && + !Kokkos::is_reducer_type<ReducerType>::value, + void *>::type = nullptr) + : m_functor(arg_functor), + m_policy(fix_policy(arg_policy)), + 
m_reducer(InvalidType()), + m_result_ptr(arg_result.data()), + m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + arg_functor, m_policy.team_size())) {} + + inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy, + const ReducerType &reducer) + : m_functor(arg_functor), + m_policy(fix_policy(arg_policy)), + m_reducer(reducer), + m_result_ptr(reducer.view().data()), + m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + + FunctorTeamShmemSize<FunctorType>::value( + arg_functor, m_policy.team_size())) { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" + );*/ + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +/* ParallelScan with Kokkos::Threads and RangePolicy */ + +template <class FunctorType, class... 
Traits> +class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, + Kokkos::Threads> { + private: + using Policy = Kokkos::RangePolicy<Traits...>; + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const FunctorType &functor, const Member &ibeg, + const Member &iend, reference_type update, const bool final) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(i, update, final); + } + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const FunctorType &functor, const Member &ibeg, + const Member &iend, reference_type update, const bool final) { + const TagType t{}; +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(t, i, update, final); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + const ParallelScan &self = *((const ParallelScan *)arg); + + const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + reference_type update = + ValueInit::init(self.m_functor, exec.reduce_memory()); + + ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(), + range.end(), update, false); + + // exec.template scan_large<FunctorType,WorkTag>( self.m_functor ); + 
exec.template scan_small<FunctorType, WorkTag>(self.m_functor); + + ParallelScan::template exec_range<WorkTag>(self.m_functor, range.begin(), + range.end(), update, true); + + exec.fan_in(); + } + + public: + inline void execute() const { + ThreadsExec::resize_scratch(2 * ValueTraits::value_size(m_functor), 0); + ThreadsExec::start(&ParallelScan::exec, this); + ThreadsExec::fence(); + } + + ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +template <class FunctorType, class ReturnType, class... Traits> +class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, + ReturnType, Kokkos::Threads> { + private: + using Policy = Kokkos::RangePolicy<Traits...>; + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>; + using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>; + + using pointer_type = typename ValueTraits::pointer_type; + using reference_type = typename ValueTraits::reference_type; + + const FunctorType m_functor; + const Policy m_policy; + ReturnType &m_returnvalue; + + template <class TagType> + inline static + typename std::enable_if<std::is_same<TagType, void>::value>::type + exec_range(const FunctorType &functor, const Member &ibeg, + const Member &iend, reference_type update, const bool final) { +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(i, update, final); + } + } + + template <class TagType> + inline static + typename std::enable_if<!std::is_same<TagType, void>::value>::type + exec_range(const FunctorType &functor, const Member &ibeg, + const Member &iend, reference_type update, const bool final) { + const TagType t{}; +#if 
defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +#pragma ivdep +#endif + for (Member i = ibeg; i < iend; ++i) { + functor(t, i, update, final); + } + } + + static void exec(ThreadsExec &exec, const void *arg) { + const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg); + + const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + + reference_type update = + ValueInit::init(self.m_functor, exec.reduce_memory()); + + ParallelScanWithTotal::template exec_range<WorkTag>( + self.m_functor, range.begin(), range.end(), update, false); + + // exec.template scan_large<FunctorType,WorkTag>( self.m_functor ); + exec.template scan_small<FunctorType, WorkTag>(self.m_functor); + + ParallelScanWithTotal::template exec_range<WorkTag>( + self.m_functor, range.begin(), range.end(), update, true); + + exec.fan_in(); + + if (exec.pool_rank() == exec.pool_size() - 1) { + self.m_returnvalue = update; + } + } + + public: + inline void execute() const { + ThreadsExec::resize_scratch(2 * ValueTraits::value_size(m_functor), 0); + ThreadsExec::start(&ParallelScanWithTotal::exec, this); + ThreadsExec::fence(); + } + + ParallelScanWithTotal(const FunctorType &arg_functor, + const Policy &arg_policy, ReturnType &arg_returnvalue) + : m_functor(arg_functor), + m_policy(arg_policy), + m_returnvalue(arg_returnvalue) {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif +#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */ diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..401f3c0b1a08bb6fab05d2988e7da92460571905 --- /dev/null +++ 
b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -0,0 +1,111 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_WORKGRAPHPOLICY_HPP +#define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Threads.hpp> + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... Traits> +class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>, + Kokkos::Threads> { + private: + using Policy = Kokkos::WorkGraphPolicy<Traits...>; + + using Self = ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>, + Kokkos::Threads>; + + Policy m_policy; + FunctorType m_functor; + + template <class TagType> + typename std::enable_if<std::is_same<TagType, void>::value>::type exec_one( + const std::int32_t w) const noexcept { + m_functor(w); + } + + template <class TagType> + typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_one( + const std::int32_t w) const noexcept { + const TagType t{}; + m_functor(t, w); + } + + inline void exec_one_thread() const noexcept { + // Spin until COMPLETED_TOKEN. + // END_TOKEN indicates no work is currently available. 
+ + for (std::int32_t w = Policy::END_TOKEN; + Policy::COMPLETED_TOKEN != (w = m_policy.pop_work());) { + if (Policy::END_TOKEN != w) { + exec_one<typename Policy::work_tag>(w); + m_policy.completed_work(w); + } + } + } + + static inline void thread_main(ThreadsExec& exec, const void* arg) noexcept { + const Self& self = *(static_cast<const Self*>(arg)); + self.exec_one_thread(); + exec.fan_in(); + } + + public: + inline void execute() { + ThreadsExec::start(&Self::thread_main, this); + ThreadsExec::fence(); + } + + inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_policy(arg_policy), m_functor(arg_functor) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP */ diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp new file mode 100644 index 0000000000000000000000000000000000000000..df09e9e7215310e26d72009cc32f7e5339dfdc5b --- /dev/null +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp @@ -0,0 +1,61 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DECLARE_CUDA_HPP +#define KOKKOS_DECLARE_CUDA_HPP + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Kokkos_Cuda.hpp> +#include <Cuda/Kokkos_Cuda_Parallel.hpp> +#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp> +#include <Cuda/Kokkos_Cuda_Instance.hpp> +#include <Cuda/Kokkos_Cuda_View.hpp> +#include <Cuda/Kokkos_Cuda_Team.hpp> +#include <Cuda/Kokkos_Cuda_Parallel.hpp> +#include <Cuda/Kokkos_Cuda_Task.hpp> +#include <Cuda/Kokkos_Cuda_MDRangePolicy.hpp> +#include <Cuda/Kokkos_Cuda_UniqueToken.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ff5133ed286d3cc845245e878c0b4c8e0a407ecf --- /dev/null +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DECLARE_HBWSPACE_HPP +#define KOKKOS_DECLARE_HBWSPACE_HPP + +#ifdef KOKKOS_ENABLE_HBWSPACE +#include <Kokkos_HBWSpace.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b3bf14dbf2408d68cd76e618f4e5c9346dca21b4 --- /dev/null +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DECLARE_HIP_HPP +#define KOKKOS_DECLARE_HIP_HPP + +#if defined(KOKKOS_ENABLE_HIP) +#include <Kokkos_HIP.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_HPX.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_HPX.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2cbecc9e76c47d3b1fee87de9c74f3acc1051852 --- /dev/null +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_HPX.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DECLARE_HPX_HPP +#define KOKKOS_DECLARE_HPX_HPP + +#if defined(KOKKOS_ENABLE_HPX) +#include <Kokkos_HPX.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMP.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMP.hpp new file mode 100644 index 0000000000000000000000000000000000000000..069dd5c160bec56a485b667da9c58c548943be42 --- /dev/null +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMP.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DECLARE_OPENMP_HPP +#define KOKKOS_DECLARE_OPENMP_HPP + +#if defined(KOKKOS_ENABLE_OPENMP) +#include <Kokkos_OpenMP.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b193d1e741bc19d1725994839c682fa84f2267f9 --- /dev/null +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DECLARE_OPENMPTARGET_HPP +#define KOKKOS_DECLARE_OPENMPTARGET_HPP + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +#include <Kokkos_OpenMPTarget.hpp> +#include <Kokkos_OpenMPTargetSpace.hpp> +#include <OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp new file mode 100644 index 0000000000000000000000000000000000000000..45661b5af29949b54dca0126e90bd605e2bea6c6 --- /dev/null +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_SERIAL.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DECLARE_SERIAL_HPP +#define KOKKOS_DECLARE_SERIAL_HPP + +#if defined(KOKKOS_ENABLE_SERIAL) +#include <Kokkos_Serial.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp new file mode 100644 index 0000000000000000000000000000000000000000..92cd85bcae8b9e8c65d37b9308033a0748c8d3aa --- /dev/null +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -0,0 +1,59 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DECLARE_SYCL_HPP +#define KOKKOS_DECLARE_SYCL_HPP + +#if defined(KOKKOS_ENABLE_SYCL) +#include <Kokkos_SYCL.hpp> +#include <SYCL/Kokkos_SYCL_DeepCopy.hpp> +#include <SYCL/Kokkos_SYCL_MDRangePolicy.hpp> +#include <SYCL/Kokkos_SYCL_Parallel_Range.hpp> +#include <SYCL/Kokkos_SYCL_Parallel_Reduce.hpp> +#include <SYCL/Kokkos_SYCL_Parallel_Scan.hpp> +#include <SYCL/Kokkos_SYCL_Parallel_Team.hpp> +#include <SYCL/Kokkos_SYCL_UniqueToken.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp new file mode 100644 index 0000000000000000000000000000000000000000..adb8f12a9c7b1b97a112f631d3455ef9f20d2b9b --- /dev/null +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_DECLARE_THREADS_HPP +#define KOKKOS_DECLARE_THREADS_HPP + +#if defined(KOKKOS_ENABLE_THREADS) +#include <Kokkos_Threads.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/dummy.cpp b/packages/kokkos/core/src/dummy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4f5f14e7d323e97a5289d271be98667c59b5cf34 --- /dev/null +++ b/packages/kokkos/core/src/dummy.cpp @@ -0,0 +1,10 @@ + + +namespace Kokkos { +namespace AvoidCompilerWarnings { +int dontComplain() { + // keep the compiler from complaining about emptiness + return 0; +} +} // namespace AvoidCompilerWarnings +} // namespace Kokkos diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_CUDA.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_CUDA.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4bda5e9411f05bfd76495c346f053acf958a84a7 --- /dev/null +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_CUDA.hpp @@ -0,0 +1,67 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_FWD_HPP_ +#define KOKKOS_CUDA_FWD_HPP_ +#if defined(KOKKOS_ENABLE_CUDA) +namespace Kokkos { + +class CudaSpace; ///< Memory space on Cuda GPU +class CudaUVMSpace; ///< Memory space on Cuda GPU with UVM +class CudaHostPinnedSpace; ///< Memory space on Host accessible to Cuda GPU +class Cuda; ///< Execution space for Cuda GPU + +namespace Impl { + +template <class ExecSpace> +void cuda_prefetch_pointer(const ExecSpace& /*space*/, const void* /*ptr*/, + size_t /*bytes*/, bool /*to_device*/) {} + +void cuda_prefetch_pointer(const Cuda& space, const void* ptr, size_t bytes, + bool to_device); + +} // namespace Impl +} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d9dada27a01ddf24eb36dd5b3030ecc2c2e8c41b --- /dev/null +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HBWSPACE_FWD_HPP_ +#define KOKKOS_HBWSPACE_FWD_HPP_ + +#ifdef KOKKOS_ENABLE_HBWSPACE +namespace Kokkos { + +namespace Experimental { +class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL + /// processor) +} // namespace Experimental +} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1a4e7b482c44b93f87ed981682e3895cf5a534ff --- /dev/null +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HIP.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HIP_FWD_HPP_ +#define KOKKOS_HIP_FWD_HPP_ + +#if defined(KOKKOS_ENABLE_HIP) +namespace Kokkos { +namespace Experimental { +class HIPSpace; ///< Memory space on HIP GPU +class HIPHostPinnedSpace; ///< Memory space on Host accessible to HIP GPU +class HIP; ///< Execution space for HIP GPU +} // namespace Experimental +} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HPX.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HPX.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8949c527fc62a1cad4cf0cc9de197f8eee57165f --- /dev/null +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_HPX.hpp @@ -0,0 +1,55 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HPX_FWD_HPP_ +#define KOKKOS_HPX_FWD_HPP_ + +#if defined(KOKKOS_ENABLE_HPX) +namespace Kokkos { +namespace Experimental { +class HPX; ///< Execution space with HPX back-end. +} // namespace Experimental +} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENMP.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENMP.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fc2223d3e29dc9f5e227dd3f84c1f5be05572b84 --- /dev/null +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENMP.hpp @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_FWD_HPP_ +#define KOKKOS_OPENMP_FWD_HPP_ + +#if defined(KOKKOS_ENABLE_OPENMP) +namespace Kokkos { +class OpenMP; ///< OpenMP execution space. 
+} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8d12b8b701482295f53373609608dcb6ac4b038e --- /dev/null +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPTARGET_FWD_HPP_ +#define KOKKOS_OPENMPTARGET_FWD_HPP_ + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +namespace Kokkos { +namespace Experimental { +class OpenMPTarget; ///< OpenMPTarget execution space. +class OpenMPTargetSpace; +} // namespace Experimental +} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SERIAL.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SERIAL.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8f253d0a7504102d8c7fbe0d5d4aaec6c3ad6e14 --- /dev/null +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SERIAL.hpp @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SERIAL_FWD_HPP_ +#define KOKKOS_SERIAL_FWD_HPP_ + +#if defined(KOKKOS_ENABLE_SERIAL) +namespace Kokkos { +class Serial; ///< Execution space main process on CPU. +} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7754daa8a0189a3d0708ce6505955be4b76b2d61 --- /dev/null +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp @@ -0,0 +1,59 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SYCL_FWD_HPP_ +#define KOKKOS_SYCL_FWD_HPP_ + +#if defined(KOKKOS_ENABLE_SYCL) +namespace Kokkos { +namespace Experimental { +class SYCLDeviceUSMSpace; ///< Memory space on SYCL device, not accessible from + ///< the host +class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL + ///< device and the host +class SYCL; ///< Execution space for SYCL +} // namespace Experimental +} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_THREADS.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_THREADS.hpp new file mode 100644 index 0000000000000000000000000000000000000000..28ffb685df85658064e3888e5c389aeed6ab0ced --- /dev/null +++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_THREADS.hpp @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_FWD_HPP_ +#define KOKKOS_THREADS_FWD_HPP_ + +#if defined(KOKKOS_ENABLE_THREADS) +namespace Kokkos { +class Threads; ///< Execution space with pthreads back-end. 
+} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/impl/CMakeLists.txt b/packages/kokkos/core/src/impl/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ff02a2eae84ba86f5c825b9054798f0b7218f61 --- /dev/null +++ b/packages/kokkos/core/src/impl/CMakeLists.txt @@ -0,0 +1,18 @@ + +SET(HEADERS "") +SET(SOURCES "") + +FILE(GLOB HEADERS *.hpp *.h) +FILE(GLOB SOURCES *.cpp) + +TRIBITS_ADD_LIBRARY( + kokkoscore_impl + NOINSTALLHEADERS ${HEADERS} + SOURCES ${SOURCES} + DEPLIBS + ) + +SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) + +INSTALL(FILES ${HEADERS} DESTINATION ${TRILINOS_INCDIR}/impl/) + diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7f72b3983f57c9adea157cf70d815339696cd986 --- /dev/null +++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -0,0 +1,2799 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HOST_EXP_ITERATE_TILE_HPP +#define KOKKOS_HOST_EXP_ITERATE_TILE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ + defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__) +#define KOKKOS_MDRANGE_IVDEP +#endif + +#ifdef KOKKOS_MDRANGE_IVDEP +#define KOKKOS_ENABLE_IVDEP_MDRANGE _Pragma("ivdep") +#else +#define KOKKOS_ENABLE_IVDEP_MDRANGE +#endif + +#include <algorithm> + +namespace Kokkos { +namespace Impl { + +// Temporary, for testing new loop macros +#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1 + +#define LOOP_1L(type, tile) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = 0; i0 < static_cast<type>(tile[0]); ++i0) + +#define LOOP_2L(type, tile) \ + for (type i1 = 0; i1 < static_cast<type>(tile[1]); ++i1) LOOP_1L(type, tile) + +#define LOOP_3L(type, tile) \ + for (type i2 = 0; i2 < 
static_cast<type>(tile[2]); ++i2) LOOP_2L(type, tile) + +#define LOOP_4L(type, tile) \ + for (type i3 = 0; i3 < static_cast<type>(tile[3]); ++i3) LOOP_3L(type, tile) + +#define LOOP_5L(type, tile) \ + for (type i4 = 0; i4 < static_cast<type>(tile[4]); ++i4) LOOP_4L(type, tile) + +#define LOOP_6L(type, tile) \ + for (type i5 = 0; i5 < static_cast<type>(tile[5]); ++i5) LOOP_5L(type, tile) + +#define LOOP_7L(type, tile) \ + for (type i6 = 0; i6 < static_cast<type>(tile[6]); ++i6) LOOP_6L(type, tile) + +#define LOOP_8L(type, tile) \ + for (type i7 = 0; i7 < static_cast<type>(tile[7]); ++i7) LOOP_7L(type, tile) + +#define LOOP_1R(type, tile) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = 0; i0 < static_cast<type>(tile[0]); ++i0) + +#define LOOP_2R(type, tile) \ + LOOP_1R(type, tile) \ + for (type i1 = 0; i1 < static_cast<type>(tile[1]); ++i1) + +#define LOOP_3R(type, tile) \ + LOOP_2R(type, tile) \ + for (type i2 = 0; i2 < static_cast<type>(tile[2]); ++i2) + +#define LOOP_4R(type, tile) \ + LOOP_3R(type, tile) \ + for (type i3 = 0; i3 < static_cast<type>(tile[3]); ++i3) + +#define LOOP_5R(type, tile) \ + LOOP_4R(type, tile) \ + for (type i4 = 0; i4 < static_cast<type>(tile[4]); ++i4) + +#define LOOP_6R(type, tile) \ + LOOP_5R(type, tile) \ + for (type i5 = 0; i5 < static_cast<type>(tile[5]); ++i5) + +#define LOOP_7R(type, tile) \ + LOOP_6R(type, tile) \ + for (type i6 = 0; i6 < static_cast<type>(tile[6]); ++i6) + +#define LOOP_8R(type, tile) \ + LOOP_7R(type, tile) \ + for (type i7 = 0; i7 < static_cast<type>(tile[7]); ++i7) + +#define LOOP_ARGS_1 i0 + m_offset[0] +#define LOOP_ARGS_2 LOOP_ARGS_1, i1 + m_offset[1] +#define LOOP_ARGS_3 LOOP_ARGS_2, i2 + m_offset[2] +#define LOOP_ARGS_4 LOOP_ARGS_3, i3 + m_offset[3] +#define LOOP_ARGS_5 LOOP_ARGS_4, i4 + m_offset[4] +#define LOOP_ARGS_6 LOOP_ARGS_5, i5 + m_offset[5] +#define LOOP_ARGS_7 LOOP_ARGS_6, i6 + m_offset[6] +#define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7] + +// New Loop Macros... 
+// parallel_for, non-tagged +#define APPLY(func, ...) func(__VA_ARGS__); + +// LayoutRight +// d = 0 to start +#define LOOP_R_1(func, type, m_offset, extent, d, ...) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \ + APPLY(func, __VA_ARGS__, i0 + m_offset[d]) \ + } + +#define LOOP_R_2(func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \ + LOOP_R_1(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i1 + m_offset[d]) \ + } + +#define LOOP_R_3(func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \ + LOOP_R_2(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i2 + m_offset[d]) \ + } + +#define LOOP_R_4(func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \ + LOOP_R_3(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i3 + m_offset[d]) \ + } + +#define LOOP_R_5(func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \ + LOOP_R_4(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i4 + m_offset[d]) \ + } + +#define LOOP_R_6(func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \ + LOOP_R_5(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i5 + m_offset[d]) \ + } + +#define LOOP_R_7(func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \ + LOOP_R_6(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i6 + m_offset[d]) \ + } + +#define LOOP_R_8(func, type, m_offset, extent, d, ...) \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \ + LOOP_R_7(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i7 + m_offset[d]) \ + } + +// LayoutLeft +// d = rank-1 to start +#define LOOP_L_1(func, type, m_offset, extent, d, ...) 
\ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \ + APPLY(func, i0 + m_offset[d], __VA_ARGS__) \ + } + +#define LOOP_L_2(func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \ + LOOP_L_1(func, type, m_offset, extent, d - 1, i1 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_3(func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \ + LOOP_L_2(func, type, m_offset, extent, d - 1, i2 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_4(func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \ + LOOP_L_3(func, type, m_offset, extent, d - 1, i3 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_5(func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \ + LOOP_L_4(func, type, m_offset, extent, d - 1, i4 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_6(func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \ + LOOP_L_5(func, type, m_offset, extent, d - 1, i5 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_7(func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \ + LOOP_L_6(func, type, m_offset, extent, d - 1, i6 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_8(func, type, m_offset, extent, d, ...) 
\ + for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \ + LOOP_L_7(func, type, m_offset, extent, d - 1, i7 + m_offset[d], \ + __VA_ARGS__) \ + } + +// Left vs Right +// TODO: rank not necessary to pass through, can hardcode the values +#define LOOP_LAYOUT_1(func, type, is_left, m_offset, extent, rank) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \ + APPLY(func, i0 + m_offset[0]) \ + } + +#define LOOP_LAYOUT_2(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) { \ + LOOP_L_1(func, type, m_offset, extent, rank - 2, \ + i1 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \ + LOOP_R_1(func, type, m_offset, extent, 1, i1 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_3(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) { \ + LOOP_L_2(func, type, m_offset, extent, rank - 2, \ + i2 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \ + LOOP_R_2(func, type, m_offset, extent, 1, i2 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_4(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) { \ + LOOP_L_3(func, type, m_offset, extent, rank - 2, \ + i3 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \ + LOOP_R_3(func, type, m_offset, extent, 1, i3 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_5(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) { \ + LOOP_L_4(func, type, m_offset, extent, rank - 2, \ + i4 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i4 = 
(type)0; i4 < static_cast<type>(extent[0]); ++i4) { \ + LOOP_R_4(func, type, m_offset, extent, 1, i4 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_6(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) { \ + LOOP_L_5(func, type, m_offset, extent, rank - 2, \ + i5 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \ + LOOP_R_5(func, type, m_offset, extent, 1, i5 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_7(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) { \ + LOOP_L_6(func, type, m_offset, extent, rank - 2, \ + i6 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \ + LOOP_R_6(func, type, m_offset, extent, 1, i6 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_8(func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) { \ + LOOP_L_7(func, type, m_offset, extent, rank - 2, \ + i7 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \ + LOOP_R_7(func, type, m_offset, extent, 1, i7 + m_offset[0]) \ + } \ + } + +// Partial vs Full Tile +#define TILE_LOOP_1(func, type, is_left, cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_partial, rank) \ + } + +#define TILE_LOOP_2(func, type, is_left, cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_partial, rank) \ + } + +#define TILE_LOOP_3(func, type, is_left, cond, 
m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_partial, rank) \ + } + +#define TILE_LOOP_4(func, type, is_left, cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_partial, rank) \ + } + +#define TILE_LOOP_5(func, type, is_left, cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_partial, rank) \ + } + +#define TILE_LOOP_6(func, type, is_left, cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_partial, rank) \ + } + +#define TILE_LOOP_7(func, type, is_left, cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_partial, rank) \ + } + +#define TILE_LOOP_8(func, type, is_left, cond, m_offset, extent_full, \ + extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_partial, rank) \ + } + +// parallel_reduce, non-tagged +// Reduction version +#define APPLY_REDUX(val, func, ...) func(__VA_ARGS__, val); + +// LayoutRight +// d = 0 to start +#define LOOP_R_1_REDUX(val, func, type, m_offset, extent, d, ...) 
\ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \ + APPLY_REDUX(val, func, __VA_ARGS__, i0 + m_offset[d]) \ + } + +#define LOOP_R_2_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \ + LOOP_R_1_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i1 + m_offset[d]) \ + } + +#define LOOP_R_3_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \ + LOOP_R_2_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i2 + m_offset[d]) \ + } + +#define LOOP_R_4_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \ + LOOP_R_3_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i3 + m_offset[d]) \ + } + +#define LOOP_R_5_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \ + LOOP_R_4_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i4 + m_offset[d]) \ + } + +#define LOOP_R_6_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \ + LOOP_R_5_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i5 + m_offset[d]) \ + } + +#define LOOP_R_7_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \ + LOOP_R_6_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i6 + m_offset[d]) \ + } + +#define LOOP_R_8_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \ + LOOP_R_7_REDUX(val, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i7 + m_offset[d]) \ + } + +// LayoutLeft +// d = rank-1 to start +#define LOOP_L_1_REDUX(val, func, type, m_offset, extent, d, ...) 
\ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \ + APPLY_REDUX(val, func, i0 + m_offset[d], __VA_ARGS__) \ + } + +#define LOOP_L_2_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \ + LOOP_L_1_REDUX(val, func, type, m_offset, extent, d - 1, i1 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_3_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \ + LOOP_L_2_REDUX(val, func, type, m_offset, extent, d - 1, i2 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_4_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \ + LOOP_L_3_REDUX(val, func, type, m_offset, extent, d - 1, i3 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_5_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \ + LOOP_L_4_REDUX(val, func, type, m_offset, extent, d - 1, i4 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_6_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \ + LOOP_L_5_REDUX(val, func, type, m_offset, extent, d - 1, i5 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_7_REDUX(val, func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \ + LOOP_L_6_REDUX(val, func, type, m_offset, extent, d - 1, i6 + m_offset[d], \ + __VA_ARGS__) \ + } + +#define LOOP_L_8_REDUX(val, func, type, m_offset, extent, d, ...) 
\ + for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \ + LOOP_L_7_REDUX(val, func, type, m_offset, extent, d - 1, i7 + m_offset[d], \ + __VA_ARGS__) \ + } + +// Left vs Right +#define LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, extent, rank) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \ + APPLY_REDUX(val, func, i0 + m_offset[0]) \ + } + +#define LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) { \ + LOOP_L_1_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i1 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \ + LOOP_R_1_REDUX(val, func, type, m_offset, extent, 1, i1 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) { \ + LOOP_L_2_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i2 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \ + LOOP_R_2_REDUX(val, func, type, m_offset, extent, 1, i2 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) { \ + LOOP_L_3_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i3 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \ + LOOP_R_3_REDUX(val, func, type, m_offset, extent, 1, i3 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) { \ + LOOP_L_4_REDUX(val, func, type, m_offset, 
extent, rank - 2, \ + i4 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \ + LOOP_R_4_REDUX(val, func, type, m_offset, extent, 1, i4 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) { \ + LOOP_L_5_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i5 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \ + LOOP_R_5_REDUX(val, func, type, m_offset, extent, 1, i5 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) { \ + LOOP_L_6_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i6 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \ + LOOP_R_6_REDUX(val, func, type, m_offset, extent, 1, i6 + m_offset[0]) \ + } \ + } + +#define LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) { \ + LOOP_L_7_REDUX(val, func, type, m_offset, extent, rank - 2, \ + i7 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \ + LOOP_R_7_REDUX(val, func, type, m_offset, extent, 1, i7 + m_offset[0]) \ + } \ + } + +// Partial vs Full Tile +#define TILE_LOOP_1_REDUX(val, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TILE_LOOP_2_REDUX(val, func, type, is_left, cond, m_offset, \ + extent_full, 
extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TILE_LOOP_3_REDUX(val, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TILE_LOOP_4_REDUX(val, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TILE_LOOP_5_REDUX(val, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TILE_LOOP_6_REDUX(val, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TILE_LOOP_7_REDUX(val, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TILE_LOOP_8_REDUX(val, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, extent_full, rank) \ + } else { \ + 
LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } +// end New Loop Macros + +// tagged macros +#define TAGGED_APPLY(tag, func, ...) func(tag, __VA_ARGS__); + +// LayoutRight +// d = 0 to start +#define TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d, ...) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \ + TAGGED_APPLY(tag, func, __VA_ARGS__, i0 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \ + TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i1 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \ + TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i2 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \ + TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i3 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \ + TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i4 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \ + TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i5 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d, ...) 
\ + for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \ + TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i6 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_8(tag, func, type, m_offset, extent, d, ...) \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \ + TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d + 1, __VA_ARGS__, \ + i7 + m_offset[d]) \ + } + +// LayoutLeft +// d = rank-1 to start +#define TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d, ...) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \ + TAGGED_APPLY(tag, func, i0 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \ + TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d - 1, \ + i1 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \ + TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d - 1, \ + i2 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \ + TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d - 1, \ + i3 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \ + TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d - 1, \ + i4 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \ + TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d - 1, \ + i5 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d, ...) 
\ + for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \ + TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d - 1, \ + i6 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_8(tag, func, type, m_offset, extent, d, ...) \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \ + TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d - 1, \ + i7 + m_offset[d], __VA_ARGS__) \ + } + +// Left vs Right +// TODO: rank not necessary to pass through, can hardcode the values +#define TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, extent, rank) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \ + TAGGED_APPLY(tag, func, i0 + m_offset[0]) \ + } + +#define TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) { \ + TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, rank - 2, \ + i1 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \ + TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, 1, i1 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) { \ + TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, rank - 2, \ + i2 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \ + TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, 1, i2 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) { \ + TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, rank - 2, \ + i3 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); 
++i3) { \ + TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, 1, i3 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) { \ + TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, rank - 2, \ + i4 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \ + TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, 1, i4 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) { \ + TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, rank - 2, \ + i5 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \ + TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, 1, i5 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) { \ + TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, rank - 2, \ + i6 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \ + TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, 1, i6 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, extent, rank) \ + if (is_left) { \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) { \ + TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, rank - 2, \ + i7 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \ + TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, 1, i7 + m_offset[0]) \ + } \ + } + +// Partial vs Full Tile +#define TAGGED_TILE_LOOP_1(tag, func, type, is_left, cond, m_offset, \ 
+ extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TAGGED_TILE_LOOP_2(tag, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TAGGED_TILE_LOOP_3(tag, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TAGGED_TILE_LOOP_4(tag, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TAGGED_TILE_LOOP_5(tag, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TAGGED_TILE_LOOP_6(tag, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TAGGED_TILE_LOOP_7(tag, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_7(tag, func, type, 
is_left, m_offset, extent_full, \ + rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +#define TAGGED_TILE_LOOP_8(tag, func, type, is_left, cond, m_offset, \ + extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, extent_full, \ + rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, extent_partial, \ + rank) \ + } + +// parallel_reduce, tagged +// Reduction version +#define TAGGED_APPLY_REDUX(val, tag, func, ...) func(tag, __VA_ARGS__, val); + +// LayoutRight +// d = 0 to start +#define TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \ + TAGGED_APPLY_REDUX(val, tag, func, __VA_ARGS__, i0 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \ + TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i1 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \ + TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i2 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \ + TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i3 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, d, ...) 
\ + for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \ + TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i4 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \ + TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i5 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \ + TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i6 + m_offset[d]) \ + } + +#define TAGGED_LOOP_R_8_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \ + TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, d + 1, \ + __VA_ARGS__, i7 + m_offset[d]) \ + } + +// LayoutLeft +// d = rank-1 to start +#define TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \ + TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \ + TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ + i1 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \ + TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ + i2 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, d, ...) 
\ + for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \ + TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ + i3 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \ + TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ + i4 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \ + TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ + i5 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, d, ...) \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \ + TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ + i6 + m_offset[d], __VA_ARGS__) \ + } + +#define TAGGED_LOOP_L_8_REDUX(val, tag, func, type, m_offset, extent, d, ...) 
\ + for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \ + TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, d - 1, \ + i7 + m_offset[d], __VA_ARGS__) \ + } + +// Left vs Right +#define TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, m_offset, \ + extent, rank) \ + KOKKOS_ENABLE_IVDEP_MDRANGE \ + for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \ + TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[0]) \ + } + +#define TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, m_offset, \ + extent, rank) \ + if (is_left) { \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) { \ + TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ + i1 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \ + TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, 1, \ + i1 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, m_offset, \ + extent, rank) \ + if (is_left) { \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) { \ + TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ + i2 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \ + TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, 1, \ + i2 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, m_offset, \ + extent, rank) \ + if (is_left) { \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) { \ + TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ + i3 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \ + TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, 1, \ + i3 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, 
type, is_left, m_offset, \ + extent, rank) \ + if (is_left) { \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) { \ + TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ + i4 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \ + TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, 1, \ + i4 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, m_offset, \ + extent, rank) \ + if (is_left) { \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) { \ + TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ + i5 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \ + TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, 1, \ + i5 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, m_offset, \ + extent, rank) \ + if (is_left) { \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) { \ + TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ + i6 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \ + TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, 1, \ + i6 + m_offset[0]) \ + } \ + } + +#define TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, m_offset, \ + extent, rank) \ + if (is_left) { \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) { \ + TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, rank - 2, \ + i7 + m_offset[rank - 1]) \ + } \ + } else { \ + for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \ + TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, 1, \ + i7 + m_offset[0]) \ + } \ + } + +// Partial vs Full Tile +#define TAGGED_TILE_LOOP_1_REDUX(val, tag, func, type, 
is_left, cond, \ + m_offset, extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ + } + +#define TAGGED_TILE_LOOP_2_REDUX(val, tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ + } + +#define TAGGED_TILE_LOOP_3_REDUX(val, tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ + } + +#define TAGGED_TILE_LOOP_4_REDUX(val, tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ + } + +#define TAGGED_TILE_LOOP_5_REDUX(val, tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ + } + +#define TAGGED_TILE_LOOP_6_REDUX(val, tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, 
m_offset, \ + extent_partial, rank) \ + } + +#define TAGGED_TILE_LOOP_7_REDUX(val, tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ + } + +#define TAGGED_TILE_LOOP_8_REDUX(val, tag, func, type, is_left, cond, \ + m_offset, extent_full, extent_partial, rank) \ + if (cond) { \ + TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_full, rank) \ + } else { \ + TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, m_offset, \ + extent_partial, rank) \ + } + +// end tagged macros + +// Structs for calling loops +template <int Rank, bool IsLeft, typename IType, typename Tagged, + typename Enable = void> +struct Tile_Loop_Type; + +template <bool IsLeft, typename IType> +struct Tile_Loop_Type<1, IsLeft, IType, void, void> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TILE_LOOP_1(func, IType, IsLeft, cond, offset, a, b, 1); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TILE_LOOP_1_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 1); + } +}; + +template <bool IsLeft, typename IType> +struct Tile_Loop_Type<2, IsLeft, IType, void, void> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TILE_LOOP_2(func, IType, IsLeft, cond, offset, a, b, 2); + } + + template <typename ValType, typename Func, typename Offset, typename 
ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TILE_LOOP_2_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 2); + } +}; + +template <bool IsLeft, typename IType> +struct Tile_Loop_Type<3, IsLeft, IType, void, void> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TILE_LOOP_3(func, IType, IsLeft, cond, offset, a, b, 3); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TILE_LOOP_3_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 3); + } +}; + +template <bool IsLeft, typename IType> +struct Tile_Loop_Type<4, IsLeft, IType, void, void> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TILE_LOOP_4(func, IType, IsLeft, cond, offset, a, b, 4); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TILE_LOOP_4_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 4); + } +}; + +template <bool IsLeft, typename IType> +struct Tile_Loop_Type<5, IsLeft, IType, void, void> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TILE_LOOP_5(func, IType, IsLeft, cond, offset, a, b, 5); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename 
ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TILE_LOOP_5_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 5); + } +}; + +template <bool IsLeft, typename IType> +struct Tile_Loop_Type<6, IsLeft, IType, void, void> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TILE_LOOP_6(func, IType, IsLeft, cond, offset, a, b, 6); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TILE_LOOP_6_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 6); + } +}; + +template <bool IsLeft, typename IType> +struct Tile_Loop_Type<7, IsLeft, IType, void, void> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TILE_LOOP_7(func, IType, IsLeft, cond, offset, a, b, 7); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TILE_LOOP_7_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 7); + } +}; + +template <bool IsLeft, typename IType> +struct Tile_Loop_Type<8, IsLeft, IType, void, void> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TILE_LOOP_8(func, IType, IsLeft, cond, offset, a, b, 8); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static 
void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TILE_LOOP_8_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 8); + } +}; + +// tagged versions + +template <bool IsLeft, typename IType, typename Tagged> +struct Tile_Loop_Type< + 1, IsLeft, IType, Tagged, + typename std::enable_if<!std::is_same<Tagged, void>::value>::type> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_1(Tagged(), func, IType, IsLeft, cond, offset, a, b, 1); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_1_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, + a, b, 1); + } +}; + +template <bool IsLeft, typename IType, typename Tagged> +struct Tile_Loop_Type< + 2, IsLeft, IType, Tagged, + typename std::enable_if<!std::is_same<Tagged, void>::value>::type> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_2(Tagged(), func, IType, IsLeft, cond, offset, a, b, 2); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_2_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, + a, b, 2); + } +}; + +template <bool IsLeft, typename IType, typename Tagged> +struct Tile_Loop_Type< + 3, IsLeft, IType, Tagged, + typename std::enable_if<!std::is_same<Tagged, void>::value>::type> { + template <typename Func, typename 
Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_3(Tagged(), func, IType, IsLeft, cond, offset, a, b, 3); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_3_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, + a, b, 3); + } +}; + +template <bool IsLeft, typename IType, typename Tagged> +struct Tile_Loop_Type< + 4, IsLeft, IType, Tagged, + typename std::enable_if<!std::is_same<Tagged, void>::value>::type> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_4(Tagged(), func, IType, IsLeft, cond, offset, a, b, 4); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_4_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, + a, b, 4); + } +}; + +template <bool IsLeft, typename IType, typename Tagged> +struct Tile_Loop_Type< + 5, IsLeft, IType, Tagged, + typename std::enable_if<!std::is_same<Tagged, void>::value>::type> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_5(Tagged(), func, IType, IsLeft, cond, offset, a, b, 5); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA 
const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_5_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, + a, b, 5); + } +}; + +template <bool IsLeft, typename IType, typename Tagged> +struct Tile_Loop_Type< + 6, IsLeft, IType, Tagged, + typename std::enable_if<!std::is_same<Tagged, void>::value>::type> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_6(Tagged(), func, IType, IsLeft, cond, offset, a, b, 6); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_6_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, + a, b, 6); + } +}; + +template <bool IsLeft, typename IType, typename Tagged> +struct Tile_Loop_Type< + 7, IsLeft, IType, Tagged, + typename std::enable_if<!std::is_same<Tagged, void>::value>::type> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_7(Tagged(), func, IType, IsLeft, cond, offset, a, b, 7); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_7_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, + a, b, 7); + } +}; + +template <bool IsLeft, typename IType, typename Tagged> +struct Tile_Loop_Type< + 8, IsLeft, IType, Tagged, + typename std::enable_if<!std::is_same<Tagged, void>::value>::type> { + template <typename Func, typename Offset, typename ExtentA, typename ExtentB> + static void apply(Func const& func, bool 
cond, Offset const& offset, + ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_8(Tagged(), func, IType, IsLeft, cond, offset, a, b, 8); + } + + template <typename ValType, typename Func, typename Offset, typename ExtentA, + typename ExtentB> + static void apply(ValType& value, Func const& func, bool cond, + Offset const& offset, ExtentA const& a, ExtentB const& b) { + TAGGED_TILE_LOOP_8_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, + a, b, 8); + } +}; +// end Structs for calling loops + +template <typename T> +using is_void_type = std::is_same<T, void>; + +template <typename T> +struct is_type_array : std::false_type { + using value_type = T; +}; + +template <typename T> +struct is_type_array<T[]> : std::true_type { + using value_type = T; +}; + +template <typename RP, typename Functor, typename Tag = void, + typename ValueType = void, typename Enable = void> +struct HostIterateTile; + +// For ParallelFor +template <typename RP, typename Functor, typename Tag, typename ValueType> +struct HostIterateTile< + RP, Functor, Tag, ValueType, + typename std::enable_if<is_void_type<ValueType>::value>::type> { + using index_type = typename RP::index_type; + using point_type = typename RP::point_type; + + using value_type = ValueType; + + inline HostIterateTile(RP const& rp, Functor const& func) + : m_rp(rp), m_func(func) {} + + inline bool check_iteration_bounds(point_type& partial_tile, + point_type& offset) const { + bool is_full_tile = true; + + for (int i = 0; i < RP::rank; ++i) { + if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { + partial_tile[i] = m_rp.m_tile[i]; + } else { + is_full_tile = false; + partial_tile[i] = + (m_rp.m_upper[i] - 1 - offset[i]) == 0 + ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? 
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range + } + } + + return is_full_tile; + } // end check bounds + + template <int Rank> + struct RankTag { + using type = RankTag<Rank>; + enum { value = (int)Rank }; + }; + +#if KOKKOS_ENABLE_NEW_LOOP_MACROS + template <typename IType> + inline void operator()(IType tile_idx) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + Tile_Loop_Type<RP::rank, (RP::inner_direction == Iterate::Left), index_type, + Tag>::apply(m_func, full_tile, m_offset, m_rp.m_tile, + m_tiledims); + } + +#else + template <typename IType> + inline void operator()(IType tile_idx) const { + operator_impl(tile_idx, RankTag<RP::rank>()); + } + // added due to compiler error when using sfinae to choose operator based on + // rank w/ cuda+serial + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<2>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace 
tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } else { + // #pragma simd + LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } else { + // #pragma simd + LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } + } // end Iterate::Right + + } // end op() rank == 2 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<3>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } else { + // #pragma simd + LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } else { + // #pragma simd + LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } + } // end Iterate::Right + + } // end op() rank == 3 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<4>) const { + point_type m_offset; + point_type m_tiledims; + + if 
(RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } else { + // #pragma simd + LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } else { + // #pragma simd + LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } + } // end Iterate::Right + + } // end op() rank == 4 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<5>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } + } else { + // #pragma simd + LOOP_5L(index_type, m_tiledims) { 
apply(LOOP_ARGS_5); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } + } else { + // #pragma simd + LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } + } + } // end Iterate::Right + + } // end op() rank == 5 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<6>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } else { + // #pragma simd + LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } else { + // #pragma simd + LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } + } // end Iterate::Right + + } // end op() rank == 6 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<7>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * 
m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } else { + // #pragma simd + LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } else { + // #pragma simd + LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } + } // end Iterate::Right + + } // end op() rank == 7 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<8>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } else { + // #pragma simd + LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } else { + // #pragma simd + LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } + } // end Iterate::Right + + } // end op() rank == 8 +#endif + + template 
<typename... Args> + typename std::enable_if<(sizeof...(Args) == RP::rank && + std::is_same<Tag, void>::value), + void>::type + apply(Args&&... args) const { + m_func(args...); + } + + template <typename... Args> + typename std::enable_if<(sizeof...(Args) == RP::rank && + !std::is_same<Tag, void>::value), + void>::type + apply(Args&&... args) const { + m_func(m_tag, args...); + } + + RP const& m_rp; + Functor const& m_func; + typename std::conditional<std::is_same<Tag, void>::value, int, Tag>::type + m_tag; +}; + +// For ParallelReduce +// ValueType - scalar: For reductions +template <typename RP, typename Functor, typename Tag, typename ValueType> +struct HostIterateTile< + RP, Functor, Tag, ValueType, + typename std::enable_if<!is_void_type<ValueType>::value && + !is_type_array<ValueType>::value>::type> { + using index_type = typename RP::index_type; + using point_type = typename RP::point_type; + + using value_type = ValueType; + + inline HostIterateTile(RP const& rp, Functor const& func, value_type& v) + : m_rp(rp) // Cuda 7.0 does not like braces... + , + m_func(func), + m_v(v) // use with non-void ValueType struct + { + // Errors due to braces rather than parenthesis for init (with cuda 7.0) + // /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: + // error: too many braces around initializer for ‘int’ [-fpermissive] + // /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: + // error: aggregate value used where an integer was expected + } + + inline bool check_iteration_bounds(point_type& partial_tile, + point_type& offset) const { + bool is_full_tile = true; + + for (int i = 0; i < RP::rank; ++i) { + if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { + partial_tile[i] = m_rp.m_tile[i]; + } else { + is_full_tile = false; + partial_tile[i] = + (m_rp.m_upper[i] - 1 - offset[i]) == 0 + ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? 
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range + } + } + + return is_full_tile; + } // end check bounds + + template <int Rank> + struct RankTag { + using type = RankTag<Rank>; + enum { value = (int)Rank }; + }; + +#if KOKKOS_ENABLE_NEW_LOOP_MACROS + template <typename IType> + inline void operator()(IType tile_idx) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + Tile_Loop_Type<RP::rank, (RP::inner_direction == Iterate::Left), index_type, + Tag>::apply(m_v, m_func, full_tile, m_offset, m_rp.m_tile, + m_tiledims); + } + +#else + template <typename IType> + inline void operator()(IType tile_idx) const { + operator_impl(tile_idx, RankTag<RP::rank>()); + } + // added due to compiler error when using sfinae to choose operator based on + // rank + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<2>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims 
with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } else { + // #pragma simd + LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } else { + // #pragma simd + LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } + } // end Iterate::Right + + } // end op() rank == 2 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<3>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } else { + // #pragma simd + LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } else { + // #pragma simd + LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } + } // end Iterate::Right + + } // end op() rank == 3 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<4>) const { + point_type m_offset; + point_type m_tiledims; + + if 
(RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } else { + // #pragma simd + LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } else { + // #pragma simd + LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } + } // end Iterate::Right + + } // end op() rank == 4 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<5>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } + } else { + // #pragma simd + LOOP_5L(index_type, m_tiledims) { 
apply(LOOP_ARGS_5); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } + } else { + // #pragma simd + LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } + } + } // end Iterate::Right + + } // end op() rank == 5 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<6>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } else { + // #pragma simd + LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } else { + // #pragma simd + LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } + } // end Iterate::Right + + } // end op() rank == 6 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<7>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * 
m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } else { + // #pragma simd + LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } else { + // #pragma simd + LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } + } // end Iterate::Right + + } // end op() rank == 7 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<8>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } else { + // #pragma simd + LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } else { + // #pragma simd + LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } + } // end Iterate::Right + + } // end op() rank == 8 +#endif + + template 
<typename... Args> + typename std::enable_if<(sizeof...(Args) == RP::rank && + std::is_same<Tag, void>::value), + void>::type + apply(Args&&... args) const { + m_func(args..., m_v); + } + + template <typename... Args> + typename std::enable_if<(sizeof...(Args) == RP::rank && + !std::is_same<Tag, void>::value), + void>::type + apply(Args&&... args) const { + m_func(m_tag, args..., m_v); + } + + RP const& m_rp; + Functor const& m_func; + value_type& m_v; + typename std::conditional<std::is_same<Tag, void>::value, int, Tag>::type + m_tag; +}; + +// For ParallelReduce +// Extra specialization for array reductions +// ValueType[]: For array reductions +template <typename RP, typename Functor, typename Tag, typename ValueType> +struct HostIterateTile< + RP, Functor, Tag, ValueType, + typename std::enable_if<!is_void_type<ValueType>::value && + is_type_array<ValueType>::value>::type> { + using index_type = typename RP::index_type; + using point_type = typename RP::point_type; + + using value_type = + typename is_type_array<ValueType>::value_type; // strip away the + // 'array-ness' [], only + // underlying type remains + + inline HostIterateTile( + RP const& rp, Functor const& func, + value_type* v) // v should be an array; treat as pointer for + // compatibility since size is not known nor needed here + : m_rp(rp) // Cuda 7.0 does not like braces... + , + m_func(func), + m_v(v) // use with non-void ValueType struct + {} + + inline bool check_iteration_bounds(point_type& partial_tile, + point_type& offset) const { + bool is_full_tile = true; + + for (int i = 0; i < RP::rank; ++i) { + if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { + partial_tile[i] = m_rp.m_tile[i]; + } else { + is_full_tile = false; + partial_tile[i] = + (m_rp.m_upper[i] - 1 - offset[i]) == 0 + ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? 
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range + } + } + + return is_full_tile; + } // end check bounds + + template <int Rank> + struct RankTag { + using type = RankTag<Rank>; + enum { value = (int)Rank }; + }; + +#if KOKKOS_ENABLE_NEW_LOOP_MACROS + template <typename IType> + inline void operator()(IType tile_idx) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + Tile_Loop_Type<RP::rank, (RP::inner_direction == Iterate::Left), index_type, + Tag>::apply(m_v, m_func, full_tile, m_offset, m_rp.m_tile, + m_tiledims); + } + +#else + template <typename IType> + inline void operator()(IType tile_idx) const { + operator_impl(tile_idx, RankTag<RP::rank>()); + } + // added due to compiler error when using sfinae to choose operator based on + // rank + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<2>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims 
with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } else { + // #pragma simd + LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } else { + // #pragma simd + LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } + } + } // end Iterate::Right + + } // end op() rank == 2 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<3>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } else { + // #pragma simd + LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } else { + // #pragma simd + LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } + } + } // end Iterate::Right + + } // end op() rank == 3 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<4>) const { + point_type m_offset; + point_type m_tiledims; + + if 
(RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } else { + // #pragma simd + LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } else { + // #pragma simd + LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } + } + } // end Iterate::Right + + } // end op() rank == 4 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<5>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } + } else { + // #pragma simd + LOOP_5L(index_type, m_tiledims) { 
apply(LOOP_ARGS_5); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } + } else { + // #pragma simd + LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } + } + } // end Iterate::Right + + } // end op() rank == 5 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<6>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } else { + // #pragma simd + LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } else { + // #pragma simd + LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + } + } // end Iterate::Right + + } // end op() rank == 6 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<7>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * 
m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } else { + // #pragma simd + LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } else { + // #pragma simd + LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } + } + } // end Iterate::Right + + } // end op() rank == 7 + + template <typename IType> + inline void operator_impl(IType tile_idx, const RankTag<8>) const { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == Iterate::Left) { + for (int i = 0; i < RP::rank; ++i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } else { + for (int i = RP::rank - 1; i >= 0; --i) { + m_offset[i] = + (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + // Check if offset+tiledim in bounds - if not, replace tile dims with the + // partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + + if (RP::inner_direction == Iterate::Left) { + if (full_tile) { + // #pragma simd + LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } else { + // #pragma simd + LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } + } // end Iterate::Left + else { + if (full_tile) { + // #pragma simd + LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } else { + // #pragma simd + LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } + } + } // end Iterate::Right + + } // end op() rank == 8 +#endif + + template 
<typename... Args> + typename std::enable_if<(sizeof...(Args) == RP::rank && + std::is_same<Tag, void>::value), + void>::type + apply(Args&&... args) const { + m_func(args..., m_v); + } + + template <typename... Args> + typename std::enable_if<(sizeof...(Args) == RP::rank && + !std::is_same<Tag, void>::value), + void>::type + apply(Args&&... args) const { + m_func(m_tag, args..., m_v); + } + + RP const& m_rp; + Functor const& m_func; + value_type* m_v; + typename std::conditional<std::is_same<Tag, void>::value, int, Tag>::type + m_tag; +}; + +// ------------------------------------------------------------------ // + +#undef KOKKOS_ENABLE_NEW_LOOP_MACROS + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp new file mode 100644 index 0000000000000000000000000000000000000000..688afcc107e4e4ff93a2b415c8209d29bf4c0ba2 --- /dev/null +++ b/packages/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp @@ -0,0 +1,1041 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXP_ITERATE_TILE_GPU_HPP +#define KOKKOS_EXP_ITERATE_TILE_GPU_HPP + +#include <Kokkos_Macros.hpp> + +#include <algorithm> + +#include <utility> + +#include <impl/Kokkos_Profiling_Interface.hpp> +#include <typeinfo> + +namespace Kokkos { +namespace Impl { + +#ifdef KOKKOS_ENABLE_SYCL +template <typename index_type> +struct EmulateCUDADim3 { + index_type x; + index_type y; + index_type z; +}; +#endif + +template <class Tag, class Functor, class... Args> +KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t<std::is_void<Tag>::value> +_tag_invoke(Functor const& f, Args&&... args) { + f((Args &&) args...); +} + +template <class Tag, class Functor, class... Args> +KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t<!std::is_void<Tag>::value> +_tag_invoke(Functor const& f, Args&&... 
args) { + f(Tag{}, (Args &&) args...); +} + +template <class Tag, class Functor, class T, size_t N, size_t... Idxs, + class... Args> +KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array_helper( + Functor const& f, T (&vals)[N], std::integer_sequence<size_t, Idxs...>, + Args&&... args) { + _tag_invoke<Tag>(f, vals[Idxs]..., (Args &&) args...); +} + +template <class Tag, class Functor, class T, size_t N, class... Args> +KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array(Functor const& f, + T (&vals)[N], + Args&&... args) { + _tag_invoke_array_helper<Tag>(f, vals, std::make_index_sequence<N>{}, + (Args &&) args...); +} + +// ------------------------------------------------------------------ // +// ParallelFor iteration pattern +template <int N, typename PolicyType, typename Functor, typename Tag> +struct DeviceIterateTile; + +// Rank 2 +template <typename PolicyType, typename Functor, typename Tag> +struct DeviceIterateTile<2, PolicyType, Functor, Tag> { + using index_type = typename PolicyType::index_type; + +#ifdef KOKKOS_ENABLE_SYCL + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( + const PolicyType& policy_, const Functor& f_, + const EmulateCUDADim3<index_type> gridDim_, + const EmulateCUDADim3<index_type> blockIdx_, + const EmulateCUDADim3<index_type> threadIdx_) + : m_policy(policy_), + m_func(f_), + gridDim(gridDim_), + blockIdx(blockIdx_), + threadIdx(threadIdx_) {} +#else + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_, + const Functor& f_) + : m_policy(policy_), m_func(f_) {} +#endif + + KOKKOS_IMPL_DEVICE_FUNCTION + void exec_range() const { + if (PolicyType::inner_direction == Iterate::Left) { + // Loop over size maxnumblocks until full range covered + for (index_type tile_id1 = static_cast<index_type>(blockIdx.y); + tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) { + const index_type offset_1 = + tile_id1 * m_policy.m_tile[1] + + static_cast<index_type>(threadIdx.y) + + 
static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + static_cast<index_type>(threadIdx.y) < m_policy.m_tile[1]) { + for (index_type tile_id0 = static_cast<index_type>(blockIdx.x); + tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) { + const index_type offset_0 = + tile_id0 * m_policy.m_tile[0] + + static_cast<index_type>(threadIdx.x) + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && + static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1); + } + } + } + } + } else { + for (index_type tile_id0 = static_cast<index_type>(blockIdx.x); + tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) { + const index_type offset_0 = + tile_id0 * m_policy.m_tile[0] + + static_cast<index_type>(threadIdx.x) + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && + static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) { + for (index_type tile_id1 = static_cast<index_type>(blockIdx.y); + tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) { + const index_type offset_1 = + tile_id1 * m_policy.m_tile[1] + + static_cast<index_type>(threadIdx.y) + + static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + static_cast<index_type>(threadIdx.y) < m_policy.m_tile[1]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1); + } + } + } + } + } + } // end exec_range + + private: + const PolicyType& m_policy; + const Functor& m_func; +#ifdef KOKKOS_ENABLE_SYCL + const EmulateCUDADim3<index_type> gridDim; + const EmulateCUDADim3<index_type> blockIdx; + const EmulateCUDADim3<index_type> threadIdx; +#endif +}; + +// Rank 3 +template <typename PolicyType, typename Functor, typename Tag> +struct DeviceIterateTile<3, PolicyType, Functor, Tag> { + using index_type = typename PolicyType::index_type; + +#ifdef KOKKOS_ENABLE_SYCL + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( + const 
PolicyType& policy_, const Functor& f_, + const EmulateCUDADim3<index_type> gridDim_, + const EmulateCUDADim3<index_type> blockIdx_, + const EmulateCUDADim3<index_type> threadIdx_) + : m_policy(policy_), + m_func(f_), + gridDim(gridDim_), + blockIdx(blockIdx_), + threadIdx(threadIdx_) {} +#else + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_, + const Functor& f_) + : m_policy(policy_), m_func(f_) {} +#endif + + KOKKOS_IMPL_DEVICE_FUNCTION + void exec_range() const { + if (PolicyType::inner_direction == Iterate::Left) { + for (index_type tile_id2 = static_cast<index_type>(blockIdx.z); + tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.z) { + const index_type offset_2 = + tile_id2 * m_policy.m_tile[2] + + static_cast<index_type>(threadIdx.z) + + static_cast<index_type>(m_policy.m_lower[2]); + if (offset_2 < m_policy.m_upper[2] && + static_cast<index_type>(threadIdx.z) < m_policy.m_tile[2]) { + for (index_type tile_id1 = static_cast<index_type>(blockIdx.y); + tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) { + const index_type offset_1 = + tile_id1 * m_policy.m_tile[1] + + static_cast<index_type>(threadIdx.y) + + static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + static_cast<index_type>(threadIdx.y) < m_policy.m_tile[1]) { + for (index_type tile_id0 = static_cast<index_type>(blockIdx.x); + tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) { + const index_type offset_0 = + tile_id0 * m_policy.m_tile[0] + + static_cast<index_type>(threadIdx.x) + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && + static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, offset_2); + } + } + } + } + } + } + } else { + for (index_type tile_id0 = static_cast<index_type>(blockIdx.x); + tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) { + const index_type offset_0 = + tile_id0 * m_policy.m_tile[0] + + 
static_cast<index_type>(threadIdx.x) + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && + static_cast<index_type>(threadIdx.x) < m_policy.m_tile[0]) { + for (index_type tile_id1 = static_cast<index_type>(blockIdx.y); + tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) { + const index_type offset_1 = + tile_id1 * m_policy.m_tile[1] + + static_cast<index_type>(threadIdx.y) + + static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + static_cast<index_type>(threadIdx.y) < m_policy.m_tile[1]) { + for (index_type tile_id2 = static_cast<index_type>(blockIdx.z); + tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.z) { + const index_type offset_2 = + tile_id2 * m_policy.m_tile[2] + + static_cast<index_type>(threadIdx.z) + + static_cast<index_type>(m_policy.m_lower[2]); + if (offset_2 < m_policy.m_upper[2] && + static_cast<index_type>(threadIdx.z) < m_policy.m_tile[2]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, offset_2); + } + } + } + } + } + } + } + } // end exec_range + + private: + const PolicyType& m_policy; + const Functor& m_func; +#ifdef KOKKOS_ENABLE_SYCL + const EmulateCUDADim3<index_type> gridDim; + const EmulateCUDADim3<index_type> blockIdx; + const EmulateCUDADim3<index_type> threadIdx; +#endif +}; + +// Rank 4 +template <typename PolicyType, typename Functor, typename Tag> +struct DeviceIterateTile<4, PolicyType, Functor, Tag> { + using index_type = typename PolicyType::index_type; + +#ifdef KOKKOS_ENABLE_SYCL + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( + const PolicyType& policy_, const Functor& f_, + const EmulateCUDADim3<index_type> gridDim_, + const EmulateCUDADim3<index_type> blockIdx_, + const EmulateCUDADim3<index_type> threadIdx_) + : m_policy(policy_), + m_func(f_), + gridDim(gridDim_), + blockIdx(blockIdx_), + threadIdx(threadIdx_) {} +#else + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_, + const Functor& f_) + : 
m_policy(policy_), m_func(f_) {} +#endif + + static constexpr index_type max_blocks = 65535; + + KOKKOS_IMPL_DEVICE_FUNCTION + void exec_range() const { + if (PolicyType::inner_direction == Iterate::Left) { + const index_type temp0 = m_policy.m_tile_end[0]; + const index_type temp1 = m_policy.m_tile_end[1]; + const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); + const index_type numbl1 = + (temp0 * temp1 > max_blocks + ? static_cast<index_type>(max_blocks / numbl0) + : (temp1 <= max_blocks ? temp1 : max_blocks)); + + const index_type tile_id0 = static_cast<index_type>(blockIdx.x) % numbl0; + const index_type tile_id1 = static_cast<index_type>(blockIdx.x) / numbl0; + const index_type thr_id0 = + static_cast<index_type>(threadIdx.x) % m_policy.m_tile[0]; + const index_type thr_id1 = + static_cast<index_type>(threadIdx.x) / m_policy.m_tile[0]; + + for (index_type tile_id3 = static_cast<index_type>(blockIdx.z); + tile_id3 < m_policy.m_tile_end[3]; tile_id3 += gridDim.z) { + const index_type offset_3 = + tile_id3 * m_policy.m_tile[3] + + static_cast<index_type>(threadIdx.z) + + static_cast<index_type>(m_policy.m_lower[3]); + if (offset_3 < m_policy.m_upper[3] && + static_cast<index_type>(threadIdx.z) < m_policy.m_tile[3]) { + for (index_type tile_id2 = static_cast<index_type>(blockIdx.y); + tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.y) { + const index_type offset_2 = + tile_id2 * m_policy.m_tile[2] + + static_cast<index_type>(threadIdx.y) + + static_cast<index_type>(m_policy.m_lower[2]); + if (offset_2 < m_policy.m_upper[2] && + static_cast<index_type>(threadIdx.y) < m_policy.m_tile[2]) { + for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; + j += numbl1) { + const index_type offset_1 = + j * m_policy.m_tile[1] + thr_id1 + + static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + thr_id1 < m_policy.m_tile[1]) { + for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; + i += numbl0) { + const 
index_type offset_0 = + i * m_policy.m_tile[0] + thr_id0 + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && + thr_id0 < m_policy.m_tile[0]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, + offset_2, offset_3); + } + } + } + } + } + } + } + } + } else { + const index_type temp0 = m_policy.m_tile_end[0]; + const index_type temp1 = m_policy.m_tile_end[1]; + const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); + const index_type numbl0 = + (temp0 * temp1 > max_blocks + ? index_type(max_blocks / numbl1) + : (temp0 <= max_blocks ? temp0 : max_blocks)); + + const index_type tile_id0 = static_cast<index_type>(blockIdx.x) / numbl1; + const index_type tile_id1 = static_cast<index_type>(blockIdx.x) % numbl1; + const index_type thr_id0 = + static_cast<index_type>(threadIdx.x) / m_policy.m_tile[1]; + const index_type thr_id1 = + static_cast<index_type>(threadIdx.x) % m_policy.m_tile[1]; + + for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) { + const index_type offset_0 = + i * m_policy.m_tile[0] + thr_id0 + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && thr_id0 < m_policy.m_tile[0]) { + for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; + j += numbl1) { + const index_type offset_1 = + j * m_policy.m_tile[1] + thr_id1 + + static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + thr_id1 < m_policy.m_tile[1]) { + for (index_type tile_id2 = static_cast<index_type>(blockIdx.y); + tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.y) { + const index_type offset_2 = + tile_id2 * m_policy.m_tile[2] + + static_cast<index_type>(threadIdx.y) + + static_cast<index_type>(m_policy.m_lower[2]); + if (offset_2 < m_policy.m_upper[2] && + static_cast<index_type>(threadIdx.y) < m_policy.m_tile[2]) { + for (index_type tile_id3 = + static_cast<index_type>(blockIdx.z); + tile_id3 < m_policy.m_tile_end[3]; + tile_id3 += gridDim.z) { + 
const index_type offset_3 = + tile_id3 * m_policy.m_tile[3] + + static_cast<index_type>(threadIdx.z) + + static_cast<index_type>(m_policy.m_lower[3]); + if (offset_3 < m_policy.m_upper[3] && + static_cast<index_type>(threadIdx.z) < + m_policy.m_tile[3]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, + offset_2, offset_3); + } + } + } + } + } + } + } + } + } + } // end exec_range + + private: + const PolicyType& m_policy; + const Functor& m_func; +#ifdef KOKKOS_ENABLE_SYCL + const EmulateCUDADim3<index_type> gridDim; + const EmulateCUDADim3<index_type> blockIdx; + const EmulateCUDADim3<index_type> threadIdx; +#endif +}; + +// Rank 5 +template <typename PolicyType, typename Functor, typename Tag> +struct DeviceIterateTile<5, PolicyType, Functor, Tag> { + using index_type = typename PolicyType::index_type; + +#ifdef KOKKOS_ENABLE_SYCL + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( + const PolicyType& policy_, const Functor& f_, + const EmulateCUDADim3<index_type> gridDim_, + const EmulateCUDADim3<index_type> blockIdx_, + const EmulateCUDADim3<index_type> threadIdx_) + : m_policy(policy_), + m_func(f_), + gridDim(gridDim_), + blockIdx(blockIdx_), + threadIdx(threadIdx_) {} +#else + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_, + const Functor& f_) + : m_policy(policy_), m_func(f_) {} +#endif + + static constexpr index_type max_blocks = 65535; + + KOKKOS_IMPL_DEVICE_FUNCTION + void exec_range() const { + // LL + if (PolicyType::inner_direction == Iterate::Left) { + index_type temp0 = m_policy.m_tile_end[0]; + index_type temp1 = m_policy.m_tile_end[1]; + const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); + const index_type numbl1 = + (temp0 * temp1 > max_blocks + ? index_type(max_blocks / numbl0) + : (temp1 <= max_blocks ? 
temp1 : max_blocks)); + + const index_type tile_id0 = static_cast<index_type>(blockIdx.x) % numbl0; + const index_type tile_id1 = static_cast<index_type>(blockIdx.x) / numbl0; + const index_type thr_id0 = + static_cast<index_type>(threadIdx.x) % m_policy.m_tile[0]; + const index_type thr_id1 = + static_cast<index_type>(threadIdx.x) / m_policy.m_tile[0]; + + temp0 = m_policy.m_tile_end[2]; + temp1 = m_policy.m_tile_end[3]; + const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); + const index_type numbl3 = + (temp0 * temp1 > max_blocks + ? index_type(max_blocks / numbl2) + : (temp1 <= max_blocks ? temp1 : max_blocks)); + + const index_type tile_id2 = static_cast<index_type>(blockIdx.y) % numbl2; + const index_type tile_id3 = static_cast<index_type>(blockIdx.y) / numbl2; + const index_type thr_id2 = + static_cast<index_type>(threadIdx.y) % m_policy.m_tile[2]; + const index_type thr_id3 = + static_cast<index_type>(threadIdx.y) / m_policy.m_tile[2]; + + for (index_type tile_id4 = static_cast<index_type>(blockIdx.z); + tile_id4 < m_policy.m_tile_end[4]; tile_id4 += gridDim.z) { + const index_type offset_4 = + tile_id4 * m_policy.m_tile[4] + + static_cast<index_type>(threadIdx.z) + + static_cast<index_type>(m_policy.m_lower[4]); + if (offset_4 < m_policy.m_upper[4] && + static_cast<index_type>(threadIdx.z) < m_policy.m_tile[4]) { + for (index_type l = tile_id3; l < m_policy.m_tile_end[3]; + l += numbl3) { + const index_type offset_3 = + l * m_policy.m_tile[3] + thr_id3 + + static_cast<index_type>(m_policy.m_lower[3]); + if (offset_3 < m_policy.m_upper[3] && + thr_id3 < m_policy.m_tile[3]) { + for (index_type k = tile_id2; k < m_policy.m_tile_end[2]; + k += numbl2) { + const index_type offset_2 = + k * m_policy.m_tile[2] + thr_id2 + + static_cast<index_type>(m_policy.m_lower[2]); + if (offset_2 < m_policy.m_upper[2] && + thr_id2 < m_policy.m_tile[2]) { + for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; + j += numbl1) { + const index_type offset_1 = 
+ j * m_policy.m_tile[1] + thr_id1 + + static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + thr_id1 < m_policy.m_tile[1]) { + for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; + i += numbl0) { + const index_type offset_0 = + i * m_policy.m_tile[0] + thr_id0 + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && + thr_id0 < m_policy.m_tile[0]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, + offset_2, offset_3, offset_4); + } + } + } + } + } + } + } + } + } + } + } + // LR + else { + index_type temp0 = m_policy.m_tile_end[0]; + index_type temp1 = m_policy.m_tile_end[1]; + const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); + const index_type numbl0 = + (temp0 * temp1 > max_blocks + ? static_cast<index_type>(max_blocks / numbl1) + : (temp0 <= max_blocks ? temp0 : max_blocks)); + + const index_type tile_id0 = static_cast<index_type>(blockIdx.x) / numbl1; + const index_type tile_id1 = static_cast<index_type>(blockIdx.x) % numbl1; + const index_type thr_id0 = + static_cast<index_type>(threadIdx.x) / m_policy.m_tile[1]; + const index_type thr_id1 = + static_cast<index_type>(threadIdx.x) % m_policy.m_tile[1]; + + temp0 = m_policy.m_tile_end[2]; + temp1 = m_policy.m_tile_end[3]; + const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); + const index_type numbl2 = + (temp0 * temp1 > max_blocks + ? index_type(max_blocks / numbl3) + : (temp0 <= max_blocks ? 
temp0 : max_blocks)); + + const index_type tile_id2 = static_cast<index_type>(blockIdx.y) / numbl3; + const index_type tile_id3 = static_cast<index_type>(blockIdx.y) % numbl3; + const index_type thr_id2 = + static_cast<index_type>(threadIdx.y) / m_policy.m_tile[3]; + const index_type thr_id3 = + static_cast<index_type>(threadIdx.y) % m_policy.m_tile[3]; + + for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) { + const index_type offset_0 = + i * m_policy.m_tile[0] + thr_id0 + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && thr_id0 < m_policy.m_tile[0]) { + for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; + j += numbl1) { + const index_type offset_1 = + j * m_policy.m_tile[1] + thr_id1 + + static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + thr_id1 < m_policy.m_tile[1]) { + for (index_type k = tile_id2; k < m_policy.m_tile_end[2]; + k += numbl2) { + const index_type offset_2 = + k * m_policy.m_tile[2] + thr_id2 + + static_cast<index_type>(m_policy.m_lower[2]); + if (offset_2 < m_policy.m_upper[2] && + thr_id2 < m_policy.m_tile[2]) { + for (index_type l = tile_id3; l < m_policy.m_tile_end[3]; + l += numbl3) { + const index_type offset_3 = + l * m_policy.m_tile[3] + thr_id3 + + static_cast<index_type>(m_policy.m_lower[3]); + if (offset_3 < m_policy.m_upper[3] && + thr_id3 < m_policy.m_tile[3]) { + for (index_type tile_id4 = + static_cast<index_type>(blockIdx.z); + tile_id4 < m_policy.m_tile_end[4]; + tile_id4 += gridDim.z) { + const index_type offset_4 = + tile_id4 * m_policy.m_tile[4] + + static_cast<index_type>(threadIdx.z) + + static_cast<index_type>(m_policy.m_lower[4]); + if (offset_4 < m_policy.m_upper[4] && + static_cast<index_type>(threadIdx.z) < + m_policy.m_tile[4]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, + offset_2, offset_3, offset_4); + } + } + } + } + } + } + } + } + } + } + } + } // end exec_range + + private: + const PolicyType& 
m_policy; + const Functor& m_func; +#ifdef KOKKOS_ENABLE_SYCL + const EmulateCUDADim3<index_type> gridDim; + const EmulateCUDADim3<index_type> blockIdx; + const EmulateCUDADim3<index_type> threadIdx; +#endif +}; + +// Rank 6 +template <typename PolicyType, typename Functor, typename Tag> +struct DeviceIterateTile<6, PolicyType, Functor, Tag> { + using index_type = typename PolicyType::index_type; + +#ifdef KOKKOS_ENABLE_SYCL + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( + const PolicyType& policy_, const Functor& f_, + const EmulateCUDADim3<index_type> gridDim_, + const EmulateCUDADim3<index_type> blockIdx_, + const EmulateCUDADim3<index_type> threadIdx_) + : m_policy(policy_), + m_func(f_), + gridDim(gridDim_), + blockIdx(blockIdx_), + threadIdx(threadIdx_) {} +#else + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_, + const Functor& f_) + : m_policy(policy_), m_func(f_) {} +#endif + + static constexpr index_type max_blocks = 65535; + + KOKKOS_IMPL_DEVICE_FUNCTION + void exec_range() const { + // LL + if (PolicyType::inner_direction == Iterate::Left) { + index_type temp0 = m_policy.m_tile_end[0]; + index_type temp1 = m_policy.m_tile_end[1]; + const index_type numbl0 = (temp0 <= max_blocks ? temp0 : max_blocks); + const index_type numbl1 = + (temp0 * temp1 > max_blocks + ? static_cast<index_type>(max_blocks / numbl0) + : (temp1 <= max_blocks ? temp1 : max_blocks)); + + const index_type tile_id0 = static_cast<index_type>(blockIdx.x) % numbl0; + const index_type tile_id1 = static_cast<index_type>(blockIdx.x) / numbl0; + const index_type thr_id0 = + static_cast<index_type>(threadIdx.x) % m_policy.m_tile[0]; + const index_type thr_id1 = + static_cast<index_type>(threadIdx.x) / m_policy.m_tile[0]; + + temp0 = m_policy.m_tile_end[2]; + temp1 = m_policy.m_tile_end[3]; + const index_type numbl2 = (temp0 <= max_blocks ? temp0 : max_blocks); + const index_type numbl3 = + (temp0 * temp1 > max_blocks + ? 
static_cast<index_type>(max_blocks / numbl2) + : (temp1 <= max_blocks ? temp1 : max_blocks)); + + const index_type tile_id2 = static_cast<index_type>(blockIdx.y) % numbl2; + const index_type tile_id3 = static_cast<index_type>(blockIdx.y) / numbl2; + const index_type thr_id2 = + static_cast<index_type>(threadIdx.y) % m_policy.m_tile[2]; + const index_type thr_id3 = + static_cast<index_type>(threadIdx.y) / m_policy.m_tile[2]; + + temp0 = m_policy.m_tile_end[4]; + temp1 = m_policy.m_tile_end[5]; + const index_type numbl4 = (temp0 <= max_blocks ? temp0 : max_blocks); + const index_type numbl5 = + (temp0 * temp1 > max_blocks + ? static_cast<index_type>(max_blocks / numbl4) + : (temp1 <= max_blocks ? temp1 : max_blocks)); + + const index_type tile_id4 = static_cast<index_type>(blockIdx.z) % numbl4; + const index_type tile_id5 = static_cast<index_type>(blockIdx.z) / numbl4; + const index_type thr_id4 = + static_cast<index_type>(threadIdx.z) % m_policy.m_tile[4]; + const index_type thr_id5 = + static_cast<index_type>(threadIdx.z) / m_policy.m_tile[4]; + + for (index_type n = tile_id5; n < m_policy.m_tile_end[5]; n += numbl5) { + const index_type offset_5 = + n * m_policy.m_tile[5] + thr_id5 + + static_cast<index_type>(m_policy.m_lower[5]); + if (offset_5 < m_policy.m_upper[5] && thr_id5 < m_policy.m_tile[5]) { + for (index_type m = tile_id4; m < m_policy.m_tile_end[4]; + m += numbl4) { + const index_type offset_4 = + m * m_policy.m_tile[4] + thr_id4 + + static_cast<index_type>(m_policy.m_lower[4]); + if (offset_4 < m_policy.m_upper[4] && + thr_id4 < m_policy.m_tile[4]) { + for (index_type l = tile_id3; l < m_policy.m_tile_end[3]; + l += numbl3) { + const index_type offset_3 = + l * m_policy.m_tile[3] + thr_id3 + + static_cast<index_type>(m_policy.m_lower[3]); + if (offset_3 < m_policy.m_upper[3] && + thr_id3 < m_policy.m_tile[3]) { + for (index_type k = tile_id2; k < m_policy.m_tile_end[2]; + k += numbl2) { + const index_type offset_2 = + k * m_policy.m_tile[2] + thr_id2 + 
+ static_cast<index_type>(m_policy.m_lower[2]); + if (offset_2 < m_policy.m_upper[2] && + thr_id2 < m_policy.m_tile[2]) { + for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; + j += numbl1) { + const index_type offset_1 = + j * m_policy.m_tile[1] + thr_id1 + + static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + thr_id1 < m_policy.m_tile[1]) { + for (index_type i = tile_id0; + i < m_policy.m_tile_end[0]; i += numbl0) { + const index_type offset_0 = + i * m_policy.m_tile[0] + thr_id0 + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && + thr_id0 < m_policy.m_tile[0]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, + offset_2, offset_3, + offset_4, offset_5); + } + } + } + } + } + } + } + } + } + } + } + } + } + // LR + else { + index_type temp0 = m_policy.m_tile_end[0]; + index_type temp1 = m_policy.m_tile_end[1]; + const index_type numbl1 = (temp1 <= max_blocks ? temp1 : max_blocks); + const index_type numbl0 = + (temp0 * temp1 > max_blocks + ? static_cast<index_type>(max_blocks / numbl1) + : (temp0 <= max_blocks ? temp0 : max_blocks)); + + const index_type tile_id0 = static_cast<index_type>(blockIdx.x) / numbl1; + const index_type tile_id1 = static_cast<index_type>(blockIdx.x) % numbl1; + const index_type thr_id0 = + static_cast<index_type>(threadIdx.x) / m_policy.m_tile[1]; + const index_type thr_id1 = + static_cast<index_type>(threadIdx.x) % m_policy.m_tile[1]; + + temp0 = m_policy.m_tile_end[2]; + temp1 = m_policy.m_tile_end[3]; + const index_type numbl3 = (temp1 <= max_blocks ? temp1 : max_blocks); + const index_type numbl2 = + (temp0 * temp1 > max_blocks + ? static_cast<index_type>(max_blocks / numbl3) + : (temp0 <= max_blocks ? 
temp0 : max_blocks)); + + const index_type tile_id2 = static_cast<index_type>(blockIdx.y) / numbl3; + const index_type tile_id3 = static_cast<index_type>(blockIdx.y) % numbl3; + const index_type thr_id2 = + static_cast<index_type>(threadIdx.y) / m_policy.m_tile[3]; + const index_type thr_id3 = + static_cast<index_type>(threadIdx.y) % m_policy.m_tile[3]; + + temp0 = m_policy.m_tile_end[4]; + temp1 = m_policy.m_tile_end[5]; + const index_type numbl5 = (temp1 <= max_blocks ? temp1 : max_blocks); + const index_type numbl4 = + (temp0 * temp1 > max_blocks + ? static_cast<index_type>(max_blocks / numbl5) + : (temp0 <= max_blocks ? temp0 : max_blocks)); + + const index_type tile_id4 = static_cast<index_type>(blockIdx.z) / numbl5; + const index_type tile_id5 = static_cast<index_type>(blockIdx.z) % numbl5; + const index_type thr_id4 = + static_cast<index_type>(threadIdx.z) / m_policy.m_tile[5]; + const index_type thr_id5 = + static_cast<index_type>(threadIdx.z) % m_policy.m_tile[5]; + + for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) { + const index_type offset_0 = + i * m_policy.m_tile[0] + thr_id0 + + static_cast<index_type>(m_policy.m_lower[0]); + if (offset_0 < m_policy.m_upper[0] && thr_id0 < m_policy.m_tile[0]) { + for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; + j += numbl1) { + const index_type offset_1 = + j * m_policy.m_tile[1] + thr_id1 + + static_cast<index_type>(m_policy.m_lower[1]); + if (offset_1 < m_policy.m_upper[1] && + thr_id1 < m_policy.m_tile[1]) { + for (index_type k = tile_id2; k < m_policy.m_tile_end[2]; + k += numbl2) { + const index_type offset_2 = + k * m_policy.m_tile[2] + thr_id2 + + static_cast<index_type>(m_policy.m_lower[2]); + if (offset_2 < m_policy.m_upper[2] && + thr_id2 < m_policy.m_tile[2]) { + for (index_type l = tile_id3; l < m_policy.m_tile_end[3]; + l += numbl3) { + const index_type offset_3 = + l * m_policy.m_tile[3] + thr_id3 + + static_cast<index_type>(m_policy.m_lower[3]); + if (offset_3 < 
m_policy.m_upper[3] && + thr_id3 < m_policy.m_tile[3]) { + for (index_type m = tile_id4; m < m_policy.m_tile_end[4]; + m += numbl4) { + const index_type offset_4 = + m * m_policy.m_tile[4] + thr_id4 + + static_cast<index_type>(m_policy.m_lower[4]); + if (offset_4 < m_policy.m_upper[4] && + thr_id4 < m_policy.m_tile[4]) { + for (index_type n = tile_id5; + n < m_policy.m_tile_end[5]; n += numbl5) { + const index_type offset_5 = + n * m_policy.m_tile[5] + thr_id5 + + static_cast<index_type>(m_policy.m_lower[5]); + if (offset_5 < m_policy.m_upper[5] && + thr_id5 < m_policy.m_tile[5]) { + Impl::_tag_invoke<Tag>(m_func, offset_0, offset_1, + offset_2, offset_3, + offset_4, offset_5); + } + } + } + } + } + } + } + } + } + } + } + } + } + } // end exec_range + + private: + const PolicyType& m_policy; + const Functor& m_func; +#ifdef KOKKOS_ENABLE_SYCL + const EmulateCUDADim3<index_type> gridDim; + const EmulateCUDADim3<index_type> blockIdx; + const EmulateCUDADim3<index_type> threadIdx; +#endif +}; + +// ---------------------------------------------------------------------------------- + +namespace Reduce { + +template <typename T> +using is_void = std::is_same<T, void>; + +template <typename T> +struct is_array_type : std::false_type { + using value_type = T; +}; + +template <typename T> +struct is_array_type<T*> : std::true_type { + using value_type = T; +}; + +template <typename T> +struct is_array_type<T[]> : std::true_type { + using value_type = T; +}; + +// ------------------------------------------------------------------ // + +template <typename T> +using value_type_storage_t = + typename std::conditional_t<is_array_type<T>::value, std::decay<T>, + std::add_lvalue_reference<T> >::type; + +// ParallelReduce iteration pattern +// Scalar reductions + +// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of +// tiles and reduction algorithm constraints extract n-dim tile offsets (i.e. 
+// tile's global starting multi-index) from the tileid = blockid using tile
+// dimensions local indices within a tile extracted from (index_type)threadIdx_x
+// using tile dims, constrained by blocksize combine tile and local id info for
+// multi-dim global ids
+
+// Pattern:
+// Each block+thread is responsible for a tile+local_id combo (additional when
+// striding by num_blocks)
+// 1. create offset arrays
+// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max
+// num blocks)
+// 3. temps set for tile_idx and thrd_idx, which will be modified
+// 4. if LL vs LR:
+// determine tile starting point offsets (multidim)
+// determine local index offsets (multidim)
+// concatenate tile offset + local offset for global multi-dim index
+// if offset within range bounds AND local offset within tile bounds, call
+// functor
+
+// Generic rank-N iterator for tiled ParallelReduce over an MDRangePolicy.
+// N is the policy rank; ValueType is the reduction value type (scalar, or an
+// array type detected via is_array_type and stored per value_type_storage_t).
+template <int N, typename PolicyType, typename Functor, typename Tag,
+          typename ValueType, typename Enable = void>
+struct DeviceIterateTile {
+  using index_type = typename PolicyType::index_type;
+  using value_type_storage = value_type_storage_t<ValueType>;
+
+#ifdef KOKKOS_ENABLE_SYCL
+  // SYCL build: gridDim/blockIdx/threadIdx are not compiler builtins here, so
+  // the launch passes them in explicitly as EmulateCUDADim3 values.
+  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(
+      const PolicyType& policy_, const Functor& f_, value_type_storage v_,
+      const EmulateCUDADim3<index_type> gridDim_,
+      const EmulateCUDADim3<index_type> blockIdx_,
+      const EmulateCUDADim3<index_type> threadIdx_)
+      : m_policy(policy_),
+        m_func(f_),
+        m_v(v_),
+        gridDim(gridDim_),
+        blockIdx(blockIdx_),
+        threadIdx(threadIdx_) {}
+#else
+  KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile(const PolicyType& policy_,
+                                                const Functor& f_,
+                                                value_type_storage v_)
+      : m_policy(policy_), m_func(f_), m_v(v_) {}
+#endif
+
+  // Iterate all tiles assigned to this block (block-strided over blockIdx.x),
+  // decoding the flat tile id and flat intra-tile thread id (threadIdx.y) into
+  // per-rank offsets, then invoking the functor for in-bounds points only.
+  KOKKOS_IMPL_DEVICE_FUNCTION
+  void exec_range() const {
+    // Guard: only blocks mapping to a real tile and threads mapping to a real
+    // intra-tile entry participate.
+    if (static_cast<index_type>(blockIdx.x) < m_policy.m_num_tiles &&
+        static_cast<index_type>(threadIdx.y) < m_policy.m_prod_tile_dims) {
+      index_type m_offset[PolicyType::rank];  // tile starting global id offset
+      index_type
+          m_local_offset[PolicyType::rank];  // thread's tile-local id offset
+
+      for (index_type tileidx = static_cast<index_type>(blockIdx.x);
+           tileidx < m_policy.m_num_tiles; tileidx += gridDim.x) {
+        index_type tile_idx =
+            tileidx;  // temp because tile_idx will be modified while
+                      // determining tile starting point offsets
+        index_type thrd_idx = static_cast<index_type>(threadIdx.y);
+        bool in_bounds = true;
+
+        // LL: decode flat ids rank 0 outward (left index fastest)
+        if (PolicyType::inner_direction == Iterate::Left) {
+          for (int i = 0; i < PolicyType::rank; ++i) {
+            m_offset[i] =
+                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
+                m_policy.m_lower[i];
+            tile_idx /= m_policy.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_policy.m_tile[i]);
+            thrd_idx /= m_policy.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            // Out-of-range in any rank disqualifies the whole point.
+            if (!(m_offset[i] < m_policy.m_upper[i] &&
+                  m_local_offset[i] < m_policy.m_tile[i])) {
+              in_bounds = false;
+            }
+          }
+          if (in_bounds) {
+            Impl::_tag_invoke_array<Tag>(m_func, m_offset, m_v);
+          }
+        }
+        // LR: decode flat ids from the highest rank down (right index fastest)
+        else {
+          for (int i = PolicyType::rank - 1; i >= 0; --i) {
+            m_offset[i] =
+                (tile_idx % m_policy.m_tile_end[i]) * m_policy.m_tile[i] +
+                m_policy.m_lower[i];
+            tile_idx /= m_policy.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] =
+                (thrd_idx %
+                 m_policy.m_tile[i]);  // Move this to first computation,
+                                       // add to m_offset right away
+            thrd_idx /= m_policy.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if (!(m_offset[i] < m_policy.m_upper[i] &&
+                  m_local_offset[i] < m_policy.m_tile[i])) {
+              in_bounds = false;
+            }
+          }
+          if (in_bounds) {
+            Impl::_tag_invoke_array<Tag>(m_func, m_offset, m_v);
+          }
+        }
+      }
+    }
+  }  // end exec_range
+
+ private:
+  const PolicyType& m_policy;
+  const Functor& m_func;
+  value_type_storage m_v;
+#ifdef KOKKOS_ENABLE_SYCL
+  const EmulateCUDADim3<index_type> gridDim;
+  const EmulateCUDADim3<index_type> blockIdx;
+  const EmulateCUDADim3<index_type> threadIdx;
+#endif
+};
+
+}  // namespace Reduce
+}  // namespace Impl
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp b/packages/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a41ffb1e8a0572b4b1a06be8c27bfab0126c9500
--- /dev/null
+++ b/packages/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
@@ -0,0 +1,47 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 3.0
+// Copyright (2020) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Deprecated file for backward compatibility + +#include <impl/Kokkos_ViewMapping.hpp> diff --git a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c513817b5b8cbd74847e180099081bb475020c44 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp @@ -0,0 +1,139 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP +#define KOKKOS_IMPL_ANALYZE_POLICY_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Concepts.hpp> // IndexType +#include <traits/Kokkos_Traits_fwd.hpp> +#include <traits/Kokkos_PolicyTraitAdaptor.hpp> + +#include <traits/Kokkos_ExecutionSpaceTrait.hpp> +#include <traits/Kokkos_GraphKernelTrait.hpp> +#include <traits/Kokkos_IndexTypeTrait.hpp> +#include <traits/Kokkos_IterationPatternTrait.hpp> +#include <traits/Kokkos_LaunchBoundsTrait.hpp> +#include <traits/Kokkos_OccupancyControlTrait.hpp> +#include <traits/Kokkos_ScheduleTrait.hpp> +#include <traits/Kokkos_WorkItemPropertyTrait.hpp> +#include <traits/Kokkos_WorkTagTrait.hpp> + +namespace Kokkos { +namespace Impl { + +//------------------------------------------------------------------------------ + +using execution_policy_trait_specifications = + 
type_list<ExecutionSpaceTrait, GraphKernelTrait, IndexTypeTrait,
+              IterationPatternTrait, LaunchBoundsTrait, OccupancyControlTrait,
+              ScheduleTrait, WorkItemPropertyTrait, WorkTagTrait>;
+
+//------------------------------------------------------------------------------
+// Ignore void for backwards compatibility purposes, though hopefully no one is
+// using this in application code
+template <class... Traits>
+struct AnalyzeExecPolicy<void, void, Traits...>
+    : AnalyzeExecPolicy<void, Traits...> {
+  using base_t = AnalyzeExecPolicy<void, Traits...>;
+  using base_t::base_t;
+};
+
+//------------------------------------------------------------------------------
+// Mix in the defaults (base_traits) for the traits that aren't yet handled
+
+template <class TraitSpecList>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION AnalyzeExecPolicyBaseTraits;
+template <class... TraitSpecifications>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION
+    AnalyzeExecPolicyBaseTraits<type_list<TraitSpecifications...>>
+    : TraitSpecifications::base_traits... {};
+
+// Terminal case of the analysis recursion: every trait has been consumed, so
+// inherit the default base_traits for each trait category in the
+// specification list above.
+template <>
+struct AnalyzeExecPolicy<void>
+    : AnalyzeExecPolicyBaseTraits<execution_policy_trait_specifications> {
+  // Ensure default constructibility since a converting constructor causes it to
+  // be deleted.
+  AnalyzeExecPolicy() = default;
+
+  // Base converting constructor and assignment operator: unless an individual
+  // policy analysis deletes a constructor, assume it's convertible
+  template <class Other>
+  AnalyzeExecPolicy(ExecPolicyTraitsWithDefaults<Other> const&) {}
+
+  template <class Other>
+  AnalyzeExecPolicy& operator=(ExecPolicyTraitsWithDefaults<Other> const&) {
+    return *this;
+  }
+};
+
+//------------------------------------------------------------------------------
+// Used for defaults that depend on other analysis results
+template <class AnalysisResults>
+struct ExecPolicyTraitsWithDefaults : AnalysisResults {
+  using base_t = AnalysisResults;
+  using base_t::base_t;
+  // The old code turned this into an integral type for backwards compatibility,
+  // so that's what we're doing here. The original comment was:
+  // nasty hack to make index_type into an integral_type
+  // instead of the wrapped IndexType<T> for backwards compatibility
+  using index_type = typename std::conditional_t<
+      base_t::index_type_is_defaulted,
+      Kokkos::IndexType<typename base_t::execution_space::size_type>,
+      typename base_t::index_type>::type;
+};
+
+//------------------------------------------------------------------------------
+template <typename...
Traits> +struct PolicyTraits + : ExecPolicyTraitsWithDefaults<AnalyzeExecPolicy<void, Traits...>> { + using base_t = + ExecPolicyTraitsWithDefaults<AnalyzeExecPolicy<void, Traits...>>; + using base_t::base_t; +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_IMPL_ANALYZE_POLICY_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a31dd1cf49365bd1b05d0f188637e398417a0dd7 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp @@ -0,0 +1,107 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_ASSEMBLY_HPP)
+#define KOKKOS_ATOMIC_ASSEMBLY_HPP
+namespace Kokkos {
+
+namespace Impl {
+
+#if !defined(_WIN32)
+// 128-bit value represented as two 64-bit halves. The 16-byte alignment
+// (__aligned__(16)) is required for the operand of the x86-64 cmpxchg16b
+// instruction used in cas128() below.
+struct cas128_t {
+  uint64_t lower;
+  uint64_t upper;
+
+  KOKKOS_INLINE_FUNCTION
+  cas128_t() {
+    lower = 0;
+    upper = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  cas128_t(const cas128_t& a) {
+    lower = a.lower;
+    upper = a.upper;
+  }
+  // Construct by reading through a volatile pointer (member-wise copy; the
+  // two halves are not read as a single atomic 128-bit load).
+  KOKKOS_INLINE_FUNCTION
+  cas128_t(volatile cas128_t* a) {
+    lower = a->lower;
+    upper = a->upper;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator!=(const cas128_t& a) const {
+    return (lower != a.lower) || upper != a.upper;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator=(const cas128_t& a) {
+    lower = a.lower;
+    upper = a.upper;
+  }
+  KOKKOS_INLINE_FUNCTION
+  void operator=(const cas128_t& a) volatile {
+    lower = a.lower;
+    upper = a.upper;
+  }
+} __attribute__((__aligned__(16)));
+#endif
+
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
+// Atomic 128-bit compare-and-swap via `lock cmpxchg16b`.
+// Compares *ptr against cmp (rdx:rax); on match, stores swap (rcx:rbx) into
+// *ptr. Returns the value previously observed at *ptr: cmpxchg16b writes the
+// old value back into rdx:rax, which the "+d"/"+a" constraints reflect into
+// cmp. Callers detect success by comparing the return value with the cmp they
+// passed in; the `swapped` ZF flag captured by `setz` is not returned.
+inline cas128_t cas128(volatile cas128_t* ptr, cas128_t cmp, cas128_t swap) {
+  bool swapped = false;
+  __asm__ __volatile__(
+      "lock cmpxchg16b %1\n\t"
+      "setz %0"
+      : "=q"(swapped), "+m"(*ptr), "+d"(cmp.upper), "+a"(cmp.lower)
+      : "c"(swap.upper), "b"(swap.lower), "q"(swapped));
+  return cmp;
+}
+#endif + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp new file mode 100644 index 0000000000000000000000000000000000000000..dd571eb6d72e23bf0d028493fbeec42f645b382b --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp @@ -0,0 +1,453 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) +#include <xmmintrin.h> +#endif + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ATOMIC_HPP) && \ + !defined(KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP) +#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp> +#endif + +#include <impl/Kokkos_Atomic_Memory_Order.hpp> +#include <impl/Kokkos_Memory_Fence.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp> +#endif + +namespace Kokkos { + +//---------------------------------------------------------------------------- +// Cuda native CAS supports int, unsigned int, and unsigned long long int +// (non-standard type). Must cast-away 'volatile' for the CAS call. 
+ +#if defined(KOKKOS_ENABLE_CUDA) + +#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) +__inline__ __device__ int atomic_compare_exchange(volatile int* const dest, + const int compare, + const int val) { + return atomicCAS((int*)dest, compare, val); +} + +__inline__ __device__ unsigned int atomic_compare_exchange( + volatile unsigned int* const dest, const unsigned int compare, + const unsigned int val) { + return atomicCAS((unsigned int*)dest, compare, val); +} + +__inline__ __device__ unsigned long long int atomic_compare_exchange( + volatile unsigned long long int* const dest, + const unsigned long long int compare, const unsigned long long int val) { + return atomicCAS((unsigned long long int*)dest, compare, val); +} + +template <typename T> +__inline__ __device__ T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) { + const int tmp = atomicCAS((int*)dest, *((int*)&compare), *((int*)&val)); + return *((T*)&tmp); +} + +template <typename T> +__inline__ __device__ T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int), + const T&>::type val) { + using type = unsigned long long int; + const type tmp = atomicCAS((type*)dest, *((type*)&compare), *((type*)&val)); + return *((T*)&tmp); +} + +template <typename T> +__inline__ __device__ T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8), + const T>::type& val) { + T return_val; + // This is a way to (hopefully) avoid dead lock in a warp + int done = 0; +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + unsigned int done_active = 0; + 
while (active != done_active) { + if (!done) { + if (Impl::lock_address_cuda_space((void*)dest)) { + Kokkos::memory_fence(); + return_val = *dest; + if (return_val == compare) *dest = val; + Kokkos::memory_fence(); + Impl::unlock_address_cuda_space((void*)dest); + done = 1; + } + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done); +#else + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); +#endif + } + return return_val; +} +#endif +#endif + +//---------------------------------------------------------------------------- +// GCC native CAS supports int, long, unsigned int, unsigned long. +// Intel native CAS support int and long with the same interface as GCC. +#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) +#if defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) +// atomic_compare_exchange are already defined in Kokkos_Atomic_Windows.hpp +#elif defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) + +inline int atomic_compare_exchange(volatile int* const dest, const int compare, + const int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_val_compare_and_swap(dest, compare, val); +} + +inline long atomic_compare_exchange(volatile long* const dest, + const long compare, const long val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_val_compare_and_swap(dest, compare, val); +} + +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) + +// GCC supports unsigned + +inline unsigned int atomic_compare_exchange(volatile unsigned int* const dest, + const unsigned int compare, + const unsigned int val) { + return __sync_val_compare_and_swap(dest, compare, val); +} + +inline unsigned long atomic_compare_exchange(volatile unsigned long* const dest, + const unsigned long compare, + const unsigned long val) { + return __sync_val_compare_and_swap(dest, compare, val); +} + 
+inline unsigned long long atomic_compare_exchange( + volatile unsigned long long* const dest, const unsigned long long compare, + const unsigned long long val) { + return __sync_val_compare_and_swap(dest, compare, val); +} + +#endif + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) { + union U { + int i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } tmp; + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + tmp.i = + __sync_val_compare_and_swap((int*)dest, *((int*)&compare), *((int*)&val)); + return tmp.t; +} + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(long), + const T&>::type val) { + union U { + long i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } tmp; + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + tmp.i = __sync_val_compare_and_swap((long*)dest, *((long*)&compare), + *((long*)&val)); + return tmp.t; +} + +#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(long) && + sizeof(T) == sizeof(Impl::cas128_t), + const T&>::type val) { + union U { + Impl::cas128_t i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } tmp; + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + tmp.i = Impl::cas128((Impl::cas128_t*)dest, *((Impl::cas128_t*)&compare), + *((Impl::cas128_t*)&val)); + return tmp.t; +} +#endif + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T compare, + typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8) +#if 
defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) + && (sizeof(T) != 16) +#endif + , + const T>::type& val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + while (!Impl::lock_address_host_space((void*)dest)) + ; + Kokkos::memory_fence(); + T return_val = *dest; + if (return_val == compare) { + // Don't use the following line of code here: + // + // const T tmp = *dest = val; + // + // Instead, put each assignment in its own statement. This is + // because the overload of T::operator= for volatile *this should + // return void, not volatile T&. See Kokkos #177: + // + // https://github.com/kokkos/kokkos/issues/177 + *dest = val; + const T tmp = *dest; +#ifndef KOKKOS_COMPILER_CLANG + (void)tmp; +#endif + Kokkos::memory_fence(); + } + Impl::unlock_address_host_space((void*)dest); + return return_val; +} +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest, + const T compare, const T val) { + T retval; +#pragma omp critical + { + retval = dest[0]; + if (retval == compare) dest[0] = val; + } + return retval; +} + +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest_v, + const T compare, const T val) { + T* dest = const_cast<T*>(dest_v); + T retval = *dest; + if (retval == compare) *dest = val; + return retval; +} + +#endif +#endif + +// dummy for non-CUDA Kokkos headers being processed by NVCC +#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) +template <typename T> +__inline__ __device__ T +atomic_compare_exchange(volatile T* const, const Kokkos::Impl::identity_t<T>, + const Kokkos::Impl::identity_t<T>) { + return T(); +} +#endif + +template <typename T> +KOKKOS_INLINE_FUNCTION bool atomic_compare_exchange_strong( + volatile T* const 
dest, const T compare, const T val) { + return compare == atomic_compare_exchange(dest, compare, val); +} +//---------------------------------------------------------------------------- + +namespace Impl { +// memory-ordered versions are in the Impl namespace + +template <class T, class MemoryOrderFailure> +KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback( + T* dest, T compare, T val, memory_order_seq_cst_t, MemoryOrderFailure) { + Kokkos::memory_fence(); + auto rv = Kokkos::atomic_compare_exchange_strong(dest, compare, val); + Kokkos::memory_fence(); + return rv; +} + +template <class T, class MemoryOrderFailure> +KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback( + T* dest, T compare, T val, memory_order_acquire_t, MemoryOrderFailure) { + auto rv = Kokkos::atomic_compare_exchange_strong(dest, compare, val); + Kokkos::memory_fence(); + return rv; +} + +template <class T, class MemoryOrderFailure> +KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback( + T* dest, T compare, T val, memory_order_release_t, MemoryOrderFailure) { + Kokkos::memory_fence(); + return Kokkos::atomic_compare_exchange_strong(dest, compare, val); +} + +template <class T, class MemoryOrderFailure> +KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong_fallback( + T* dest, T compare, T val, memory_order_relaxed_t, MemoryOrderFailure) { + return Kokkos::atomic_compare_exchange_strong(dest, compare, val); +} + +#if (defined(KOKKOS_ENABLE_GNU_ATOMICS) && !defined(__CUDA_ARCH__)) || \ + (defined(KOKKOS_ENABLE_INTEL_ATOMICS) && !defined(__CUDA_ARCH__)) || \ + defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) + +#if defined(__CUDA_ARCH__) +#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH __inline__ __device__ +#else +#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH inline +#endif + +template <class T, class MemoryOrderSuccess, class MemoryOrderFailure> +KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH bool _atomic_compare_exchange_strong( + T* dest, T 
compare, T val, MemoryOrderSuccess, MemoryOrderFailure, + typename std::enable_if< + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8 || + sizeof(T) == 16) && + std::is_same< + typename MemoryOrderSuccess::memory_order, + typename std::remove_cv<MemoryOrderSuccess>::type>::value && + std::is_same< + typename MemoryOrderFailure::memory_order, + typename std::remove_cv<MemoryOrderFailure>::type>::value, + void const**>::type = nullptr) { + return __atomic_compare_exchange_n(dest, &compare, val, /* weak = */ false, + MemoryOrderSuccess::gnu_constant, + MemoryOrderFailure::gnu_constant); +} + +template <class T, class MemoryOrderSuccess, class MemoryOrderFailure> +KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH bool _atomic_compare_exchange_strong( + T* dest, T compare, T val, MemoryOrderSuccess order_success, + MemoryOrderFailure order_failure, + typename std::enable_if< + !(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || + sizeof(T) == 8 || sizeof(T) == 16) && + std::is_same< + typename MemoryOrderSuccess::memory_order, + typename std::remove_cv<MemoryOrderSuccess>::type>::value && + std::is_same< + typename MemoryOrderFailure::memory_order, + typename std::remove_cv<MemoryOrderFailure>::type>::value, + void const**>::type = nullptr) { + return _atomic_compare_exchange_fallback(dest, compare, val, order_success, + order_failure); +} + +#else + +template <class T, class MemoryOrderSuccess, class MemoryOrderFailure> +KOKKOS_INLINE_FUNCTION bool _atomic_compare_exchange_strong( + T* dest, T compare, T val, MemoryOrderSuccess order_success, + MemoryOrderFailure order_failure) { + return _atomic_compare_exchange_strong_fallback(dest, compare, val, + order_success, order_failure); +} + +#endif + +// TODO static asserts in overloads that don't make sense (as listed in +// https://gcc.gnu.org/onlinedocs/gcc-5.2.0/gcc/_005f_005fatomic-Builtins.html) +template <class T, class MemoryOrderSuccess, class MemoryOrderFailure> +KOKKOS_FORCEINLINE_FUNCTION bool 
atomic_compare_exchange_strong( + T* dest, T compare, T val, MemoryOrderSuccess order_success, + MemoryOrderFailure order_failure) { + return _atomic_compare_exchange_strong(dest, compare, val, order_success, + order_failure); +} + +} // end namespace Impl + +} // namespace Kokkos + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp> +#endif + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bbea3c99b8fcfbf5c25132c913f92b7337001806 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp @@ -0,0 +1,412 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) +#include <xmmintrin.h> +#endif + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Atomic.hpp> +#ifndef KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP +#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp> +#endif + +namespace Kokkos { + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Cuda sm_70 or greater supports C++-like semantics directly + +#if defined(KOKKOS_ENABLE_CUDA) + +#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) + +#if __CUDA_ARCH__ >= 700 +// See: https://github.com/ogiroux/freestanding +#define kokkos_cuda_internal_cas_release_32(ptr, old, expected, desired) \ + asm volatile("atom.cas.release.sys.b32 %0, [%1], %2, %3;" \ + : "=r"(old) \ + : "l"(ptr), "r"(expected), "r"(desired) \ + : "memory") +#define kokkos_cuda_internal_cas_acquire_32(ptr, old, expected, desired) \ + asm 
volatile("atom.cas.acquire.sys.b32 %0, [%1], %2, %3;" \ + : "=r"(old) \ + : "l"(ptr), "r"(expected), "r"(desired) \ + : "memory") +#define kokkos_cuda_internal_cas_acq_rel_32(ptr, old, expected, desired) \ + asm volatile("atom.cas.acq_rel.sys.b32 %0, [%1], %2, %3;" \ + : "=r"(old) \ + : "l"(ptr), "r"(expected), "r"(desired) \ + : "memory") +#define kokkos_cuda_internal_cas_relaxed_32(ptr, old, expected, desired) \ + asm volatile("atom.cas.relaxed.sys.b32 %0, [%1], %2, %3;" \ + : "=r"(old) \ + : "l"(ptr), "r"(expected), "r"(desired) \ + : "memory") +#define kokkos_cuda_internal_fence_seq_cst() \ + asm volatile("fence.sc.sys;" : : : "memory") +#define kokkos_cuda_internal_fence_acq_rel() \ + asm volatile("fence.acq_rel.sys;" : : : "memory") +#else +#define kokkos_cuda_internal_fence_acq_rel() \ + asm volatile("membar.sys;" : : : "memory") +#define kokkos_cuda_internal_fence_seq_cst() \ + asm volatile("membar.sys;" : : : "memory") +#endif + +// 32-bit version +template <class T, typename std::enable_if<sizeof(T) == 4, int>::type = 0> +__inline__ __device__ bool atomic_compare_exchange_weak( + T volatile* const dest, T* const expected, T const desired, + std::memory_order success_order = std::memory_order_seq_cst, + std::memory_order failure_order = std::memory_order_seq_cst) { + // TODO assert that success_order >= failure_order + // See: https://github.com/ogiroux/freestanding + int32_t tmp = 0; + int32_t old = 0; + memcpy(&tmp, &desired, sizeof(T)); + memcpy(&old, expected, sizeof(T)); + int32_t old_tmp = old; +#if __CUDA_ARCH__ >= 700 + switch (success_order) { + case std::memory_order_seq_cst: + // sequentially consistent is just an acquire with a seq_cst fence + kokkos_cuda_internal_fence_seq_cst(); + kokkos_cuda_internal_cas_acquire_32((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_acquire: + kokkos_cuda_internal_cas_acquire_32((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_consume: + // same as acquire on PTX compatible 
platforms + kokkos_cuda_internal_cas_acquire_32((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_acq_rel: + kokkos_cuda_internal_cas_acq_rel_32((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_release: + kokkos_cuda_internal_cas_release_32((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_relaxed: + kokkos_cuda_internal_cas_relaxed_32((T*)dest, old, old_tmp, tmp); + break; + }; +#else + // All of the orders that require a fence before the relaxed atomic operation: + if (success_order == std::memory_order_release || + success_order == std::memory_order_acq_rel) { + kokkos_cuda_internal_fence_acq_rel(); + } else if (success_order == std::memory_order_seq_cst) { + kokkos_cuda_internal_fence_seq_cst(); + } + // This is relaxed: + // Cuda API requires casting away volatile + atomicCAS((T*)dest, old_tmp, tmp); +#endif + bool const rv = (old == old_tmp); +#if __CUDA_ARCH__ < 700 + if (rv) { + if (success_order == std::memory_order_acquire || + success_order == std::memory_order_consume || + success_order == std::memory_order_acq_rel) { + kokkos_cuda_internal_fence_acq_rel(); + } else if (success_order == std::memory_order_seq_cst) { + kokkos_cuda_internal_fence_seq_cst(); + } + } else { + if (failure_order == std::memory_order_acquire || + failure_order == std::memory_order_consume || + failure_order == std::memory_order_acq_rel) { + kokkos_cuda_internal_fence_acq_rel(); + } else if (failure_order == std::memory_order_seq_cst) { + kokkos_cuda_internal_fence_seq_cst(); + } + } +#endif + memcpy(expected, &old, sizeof(T)); + return rv; +} + +// 64-bit version +template <class T, typename std::enable_if<sizeof(T) == 8, int>::type = 0> +bool atomic_compare_exchange_weak( + T volatile* const dest, T* const expected, T const desired, + std::memory_order success_order = std::memory_order_seq_cst, + std::memory_order failure_order = std::memory_order_seq_cst) { + // TODO assert that success_order >= failure_order + // See: 
https://github.com/ogiroux/freestanding + int64_t tmp = 0; + int64_t old = 0; + memcpy(&tmp, &desired, sizeof(T)); + memcpy(&old, expected, sizeof(T)); + int64_t old_tmp = old; +#if __CUDA_ARCH__ >= 700 + switch (success_order) { + case std::memory_order_seq_cst: + // sequentially consistent is just an acquire with a seq_cst fence + kokkos_cuda_internal_fence_seq_cst(); + kokkos_cuda_internal_cas_acquire_64((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_acquire: + kokkos_cuda_internal_cas_acquire_64((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_consume: + // same as acquire on PTX compatible platforms + kokkos_cuda_internal_cas_acquire_64((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_acq_rel: + kokkos_cuda_internal_cas_acq_rel_64((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_release: + kokkos_cuda_internal_cas_release_64((T*)dest, old, old_tmp, tmp); + break; + case std::memory_order_relaxed: + kokkos_cuda_internal_cas_relaxed_64((T*)dest, old, old_tmp, tmp); + break; + }; +#else + // Cuda API requires casting away volatile + atomicCAS((T*)dest, old_tmp, tmp); +#endif + bool const rv = (old == old_tmp); + memcpy(expected, &old, sizeof(T)); + return rv; +} + +#endif // defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) + +#endif // defined( KOKKOS_ENABLE_CUDA ) + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +// GCC native CAS supports int, long, unsigned int, unsigned long. +// Intel native CAS support int and long with the same interface as GCC. 
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) + +inline int atomic_compare_exchange(volatile int* const dest, const int compare, + const int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_val_compare_and_swap(dest, compare, val); +} + +inline long atomic_compare_exchange(volatile long* const dest, + const long compare, const long val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_val_compare_and_swap(dest, compare, val); +} + +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) + +// GCC supports unsigned + +inline unsigned int atomic_compare_exchange(volatile unsigned int* const dest, + const unsigned int compare, + const unsigned int val) { + return __sync_val_compare_and_swap(dest, compare, val); +} + +inline unsigned long atomic_compare_exchange(volatile unsigned long* const dest, + const unsigned long compare, + const unsigned long val) { + return __sync_val_compare_and_swap(dest, compare, val); +} + +inline unsigned long long atomic_compare_exchange( + volatile unsigned long long* const dest, const unsigned long long compare, + const unsigned long long val) { + return __sync_val_compare_and_swap(dest, compare, val); +} + +#endif + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) { + union U { + int i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } tmp; + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + tmp.i = + __sync_val_compare_and_swap((int*)dest, *((int*)&compare), *((int*)&val)); + return tmp.t; +} + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) != 
sizeof(int) && + sizeof(T) == sizeof(long), + const T&>::type val) { + union U { + long i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } tmp; + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + tmp.i = __sync_val_compare_and_swap((long*)dest, *((long*)&compare), + *((long*)&val)); + return tmp.t; +} + +#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) != sizeof(long) && + sizeof(T) == sizeof(Impl::cas128_t), + const T&>::type val) { + union U { + Impl::cas128_t i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } tmp; + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + tmp.i = Impl::cas128((Impl::cas128_t*)dest, *((Impl::cas128_t*)&compare), + *((Impl::cas128_t*)&val)); + return tmp.t; +} +#endif + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T compare, + typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8) +#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) + && (sizeof(T) != 16) +#endif + , + const T>::type& val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + while (!Impl::lock_address_host_space((void*)dest)) + ; + Kokkos::memory_fence(); + T return_val = *dest; + if (return_val == compare) { + // Don't use the following line of code here: + // + // const T tmp = *dest = val; + // + // Instead, put each assignment in its own statement. This is + // because the overload of T::operator= for volatile *this should + // return void, not volatile T&. 
See Kokkos #177: + // + // https://github.com/kokkos/kokkos/issues/177 + *dest = val; + const T tmp = *dest; +#ifndef KOKKOS_COMPILER_CLANG + (void)tmp; +#endif + Kokkos::memory_fence(); + } + Impl::unlock_address_host_space((void*)dest); + return return_val; +} +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest, + const T compare, const T val) { + T retval; +#pragma omp critical + { + retval = dest[0]; + if (retval == compare) dest[0] = val; + } + return retval; +} + +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_compare_exchange(volatile T* const dest_v, + const T compare, const T val) { + T* dest = const_cast<T*>(dest_v); + T retval = *dest; + if (retval == compare) *dest = val; + return retval; +} + +#endif +#endif + +template <typename T> +KOKKOS_INLINE_FUNCTION bool atomic_compare_exchange_strong( + volatile T* const dest, const T compare, const T val) { + return compare == atomic_compare_exchange(dest, compare, val); +} +//---------------------------------------------------------------------------- + +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp new file mode 100644 index 0000000000000000000000000000000000000000..47961b5c717fcd88c2b6a66266a42a6df256a00b --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp @@ -0,0 +1,147 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_DECREMENT_HPP)
+#define KOKKOS_ATOMIC_DECREMENT_HPP
+
+#include "impl/Kokkos_Atomic_Fetch_Sub.hpp"
+
+namespace Kokkos {
+
+// Atomic decrement.
+// The 1-, 2-, 4- and 8-byte integer specializations below use a single
+// x86 "lock dec{b,w,l,q}" instruction on host x86-64 ASM builds (the old
+// value is not needed, so this avoids a full fetch_sub); every other
+// configuration falls back to atomic_fetch_sub(a, 1), except the
+// serial-atomics build, which performs a plain (non-atomic) decrement.
+template <>
+KOKKOS_INLINE_FUNCTION void atomic_decrement<char>(volatile char* a) {
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
+    !defined(_WIN32) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  // Prefetch the cache line in exclusive (read-for-ownership) state: the
+  // locked read-modify-write below must own the line anyway.
+  _mm_prefetch((const char*)a, _MM_HINT_ET0);
+#endif
+  __asm__ __volatile__("lock decb %0"
+                       : /* no output registers */
+                       : "m"(a[0])
+                       : "memory");
+#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
+  char* a_nv = const_cast<char*>(a);
+  --(*a_nv);
+#else
+  Kokkos::atomic_fetch_sub(a, char(1));
+#endif
+}
+
+template <>
+KOKKOS_INLINE_FUNCTION void atomic_decrement<short>(volatile short* a) {
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
+    !defined(_WIN32) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)a, _MM_HINT_ET0);
+#endif
+  __asm__ __volatile__("lock decw %0"
+                       : /* no output registers */
+                       : "m"(a[0])
+                       : "memory");
+#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
+  short* a_nv = const_cast<short*>(a);
+  --(*a_nv);
+#else
+  Kokkos::atomic_fetch_sub(a, short(1));
+#endif
+}
+
+template <>
+KOKKOS_INLINE_FUNCTION void atomic_decrement<int>(volatile int* a) {
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
+    !defined(_WIN32) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)a, _MM_HINT_ET0);
+#endif
+  __asm__ __volatile__("lock decl %0"
+                       : /* no output registers */
+                       : "m"(a[0])
+                       : "memory");
+#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
+  int* a_nv = const_cast<int*>(a);
+  --(*a_nv);
+#else
+  Kokkos::atomic_fetch_sub(a, int(1));
+#endif
+}
+
+template <>
+KOKKOS_INLINE_FUNCTION void atomic_decrement<long long int>(
+    volatile long long int* a) {
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \
+    !defined(_WIN32) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)a, _MM_HINT_ET0);
+#endif
+  __asm__ __volatile__("lock decq %0"
+                       : /* no output registers */
+                       : "m"(a[0])
+                       : "memory");
+#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
+  long long int* a_nv = const_cast<long long int*>(a);
+  --(*a_nv);
+#else
+  using T = long long int;
+  Kokkos::atomic_fetch_sub(a, T(1));
+#endif
+}
+
+// Generic fallback for every other type T: delegate to atomic_fetch_sub
+// (or a plain decrement in the single-threaded serial-atomics build).
+template <typename T>
+KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile T* a) {
+#if defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
+  T* a_nv = const_cast<T*>(a);
+  --(*a_nv);
+#else
+  Kokkos::atomic_fetch_sub(a, T(1));
+#endif
+}
+
+}  // End of namespace Kokkos
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f2c1c756a910d26de0eb3765e0b90684e564d243
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -0,0 +1,416 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 3.0
+// Copyright (2020) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2.
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_EXCHANGE_HPP)
+#define KOKKOS_ATOMIC_EXCHANGE_HPP
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// atomic_exchange: atomically store 'val' into *dest and return the previous
+// value.  atomic_assign: same store, discarding the previous value.
+// Per-backend branches: CUDA device intrinsics, GNU/Intel __sync builtins,
+// OpenMP critical sections, or plain stores for the serial-atomics build.
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+
+__inline__ __device__ int atomic_exchange(volatile int* const dest,
+                                          const int val) {
+  // return __iAtomicExch( (int*) dest , val );
+  return atomicExch((int*)dest, val);
+}
+
+__inline__ __device__ unsigned int atomic_exchange(
+    volatile unsigned int* const dest, const unsigned int val) {
+  // return __uAtomicExch( (unsigned int*) dest , val );
+  return atomicExch((unsigned int*)dest, val);
+}
+
+__inline__ __device__ unsigned long long int atomic_exchange(
+    volatile unsigned long long int* const dest,
+    const unsigned long long int val) {
+  // return __ullAtomicExch( (unsigned long long*) dest , val );
+  return atomicExch((unsigned long long*)dest, val);
+}
+
+/** \brief Atomic exchange for any type with compatible size
+ *
+ *  4-byte types are punned to int and handled with atomicExch. */
+template <typename T>
+__inline__ __device__ T atomic_exchange(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+  // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  int tmp = atomicExch(((int*)dest), *((int*)&val));
+  return *((T*)&tmp);
+}
+
+// 8-byte (non-int-sized) types are punned to unsigned long long int.
+template <typename T>
+__inline__ __device__ T atomic_exchange(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) != sizeof(int) &&
+                                sizeof(T) == sizeof(unsigned long long int),
+                            const T&>::type val) {
+  using type = unsigned long long int;
+
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
+  type tmp = atomicExch(((type*)dest), *((type*)&val));
+  return *((T*)&tmp);
+}
+
+// Odd-sized types: serialize through the CUDA-space lock array, iterating
+// until every active lane in the warp has acquired the lock once.
+template <typename T>
+__inline__ __device__ T
+atomic_exchange(volatile T* const dest,
+                typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8),
+                                        const T>::type& val) {
+  T return_val;
+  // This is a way to (hopefully) avoid deadlock in a warp
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  int done = 0;
+#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
+  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
+#else
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+#endif
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda_space((void*)dest)) {
+        Kokkos::memory_fence();
+        return_val = *dest;
+        *dest      = val;
+        Kokkos::memory_fence();
+        Impl::unlock_address_cuda_space((void*)dest);
+        done = 1;
+      }
+    }
+#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
+    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
+#else
+    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
+#endif
+  }
+  return return_val;
+}
+/** \brief Atomic exchange for any type with compatible size */
+template <typename T>
+__inline__ __device__ void atomic_assign(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+  // (void) __ullAtomicExch( (int*) dest , *((int*)&val) );
+  (void)atomicExch(((int*)dest), *((int*)&val));
+}
+
+template <typename T>
+__inline__ __device__ void atomic_assign(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) != sizeof(int) &&
+                                sizeof(T) == sizeof(unsigned long long int),
+                            const T&>::type val) {
+  using type = unsigned long long int;
+  // (void) __ullAtomicExch( (type*) dest , *((type*)&val) );
+  (void)atomicExch(((type*)dest), *((type*)&val));
+}
+
+template <typename T>
+__inline__ __device__ void atomic_assign(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) != sizeof(int) &&
+                                sizeof(T) != sizeof(unsigned long long int),
+                            const T&>::type val) {
+  (void)atomic_exchange(dest, val);
+}
+
+#endif
+#endif
+
+//----------------------------------------------------------------------------
+
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
+
+// Host path: the __sync builtin family has no plain exchange for arbitrary
+// types, so the exchange is emulated with a compare-and-swap retry loop,
+// type-punning T through an equally-sized integer.
+template <typename T>
+inline T atomic_exchange(volatile T* const dest,
+                         typename std::enable_if<sizeof(T) == sizeof(int) ||
+                                                     sizeof(T) == sizeof(long),
+                                                 const T&>::type val) {
+  using type = std::conditional_t<sizeof(T) == sizeof(int), int, long>;
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  const type v = *((type*)&val);  // Extract to be sure the value doesn't change
+
+  type assumed;
+
+  union U {
+    T val_T;
+    type val_type;
+    inline U() {}
+  } old;
+
+  old.val_T = *dest;
+
+  do {
+    assumed = old.val_type;
+    old.val_type =
+        __sync_val_compare_and_swap((volatile type*)dest, assumed, v);
+  } while (assumed != old.val_type);
+
+  return old.val_T;
+}
+
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
+// 16-byte types on x86-64: use the cmpxchg16b-based Impl::cas128 helper.
+template <typename T>
+inline T atomic_exchange(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) == sizeof(Impl::cas128_t), const T&>::type
+        val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  union U {
+    Impl::cas128_t i;
+    T t;
+    inline U() {}
+  } assume, oldval, newval;
+
+  oldval.t = *dest;
+  newval.t = val;
+
+  do {
+    assume.i = oldval.i;
+    oldval.i = Impl::cas128((volatile Impl::cas128_t*)dest, assume.i, newval.i);
+  } while (assume.i != oldval.i);
+
+  return oldval.t;
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+// Any other size: serialize through the host-space lock array (spin until
+// the per-address lock is acquired, fence, swap, fence, release).
+template <typename T>
+inline T atomic_exchange(
+    volatile T* const dest,
+    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8)
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
+                                && (sizeof(T) != 16)
+#endif
+                            ,
+                            const T>::type& val) {
+  while (!Impl::lock_address_host_space((void*)dest))
+    ;
+  Kokkos::memory_fence();
+  T return_val = *dest;
+  // Don't use the following line of code here:
+  //
+  // const T tmp = *dest = val;
+  //
+  // Instead, put each assignment in its own statement. This is
+  // because the overload of T::operator= for volatile *this should
+  // return void, not volatile T&. See Kokkos #177:
+  //
+  // https://github.com/kokkos/kokkos/issues/177
+  *dest       = val;
+  const T tmp = *dest;
+#ifndef KOKKOS_COMPILER_CLANG
+  (void)tmp;
+#endif
+  Kokkos::memory_fence();
+  Impl::unlock_address_host_space((void*)dest);
+  return return_val;
+}
+
+template <typename T>
+inline void atomic_assign(volatile T* const dest,
+                          typename std::enable_if<sizeof(T) == sizeof(int) ||
+                                                      sizeof(T) == sizeof(long),
+                                                  const T&>::type val) {
+  using type = std::conditional_t<sizeof(T) == sizeof(int), int, long>;
+
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  const type v = *((type*)&val);  // Extract to be sure the value doesn't change
+
+  type assumed;
+
+  union U {
+    T val_T;
+    type val_type;
+    inline U() {}
+  } old;
+
+  old.val_T = *dest;
+
+  do {
+    assumed = old.val_type;
+    old.val_type =
+        __sync_val_compare_and_swap((volatile type*)dest, assumed, v);
+  } while (assumed != old.val_type);
+}
+
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
+template <typename T>
+inline void atomic_assign(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) == sizeof(Impl::cas128_t), const T&>::type
+        val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  union U {
+    Impl::cas128_t i;
+    T t;
+    inline U() {}
+  } assume, oldval, newval;
+
+  oldval.t = *dest;
+  newval.t = val;
+  do {
+    assume.i = oldval.i;
+    oldval.i = Impl::cas128((volatile Impl::cas128_t*)dest, assume.i, newval.i);
+  } while (assume.i != oldval.i);
+}
+#endif
+
+template <typename T>
+inline void atomic_assign(
+    volatile T* const dest,
+    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8)
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
+                                && (sizeof(T) != 16)
+#endif
+                            ,
+                            const T>::type& val) {
+  while (!Impl::lock_address_host_space((void*)dest))
+    ;
+  Kokkos::memory_fence();
+  // This is likely an aggregate type with a defined
+  // 'volatile T & operator = ( const T & ) volatile' member.
+  // The volatile return value implicitly defines a
+  // dereference that some compilers (gcc 4.7.2) warn is being ignored.
+  // Suppress warning by casting return to void.
+  //(void)( *dest = val );
+  *dest = val;
+  Kokkos::memory_fence();
+  Impl::unlock_address_host_space((void*)dest);
+}
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
+
+// OpenMP backend: "omp atomic capture" cannot express a plain swap, so a
+// critical section is used instead.
+template <typename T>
+inline T atomic_exchange(volatile T* const dest, const T val) {
+  T retval;
+  //#pragma omp atomic capture
+#pragma omp critical
+  {
+    retval  = dest[0];
+    dest[0] = val;
+  }
+  return retval;
+}
+
+template <typename T>
+inline void atomic_assign(volatile T* const dest, const T val) {
+  //#pragma omp atomic
+#pragma omp critical
+  { dest[0] = val; }
+}
+
+#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
+
+// Serial backend: single-threaded by construction, plain load/store suffice.
+template <typename T>
+inline T atomic_exchange(volatile T* const dest_v, const T val) {
+  T* dest  = const_cast<T*>(dest_v);
+  T retval = *dest;
+  *dest    = val;
+  return retval;
+}
+
+template <typename T>
+inline void atomic_assign(volatile T* const dest_v, const T val) {
+  T* dest = const_cast<T*>(dest_v);
+  *dest   = val;
+}
+
+#endif
+#endif
+
+// dummy for non-CUDA Kokkos headers being processed by NVCC
+#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
+template <typename T>
+__inline__ __device__ T atomic_exchange(volatile T* const,
+                                        const Kokkos::Impl::identity_t<T>) {
+  return T();
+}
+
+template <typename T>
+__inline__ __device__ void atomic_assign(volatile T* const,
+                                         const Kokkos::Impl::identity_t<T>) {}
+#endif
+
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5c3f825ed100450bac57110829f64094b782011d
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@@ -0,0 +1,402 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 3.0
+// Copyright (2020) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_ADD_HPP)
+#define KOKKOS_ATOMIC_FETCH_ADD_HPP
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// atomic_fetch_add: atomically perform *dest += val and return the value
+// held *before* the addition (see the retval-then-add pattern in every
+// branch below).
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+
+// Support for int, unsigned int, unsigned long long int, and float
+
+__inline__ __device__ int atomic_fetch_add(volatile int* const dest,
+                                           const int val) {
+  return atomicAdd((int*)dest, val);
+}
+
+__inline__ __device__ unsigned int atomic_fetch_add(
+    volatile unsigned int* const dest, const unsigned int val) {
+  return atomicAdd((unsigned int*)dest, val);
+}
+
+__inline__ __device__ unsigned long long int atomic_fetch_add(
+    volatile unsigned long long int* const dest,
+    const unsigned long long int val) {
+  return atomicAdd((unsigned long long int*)dest, val);
+}
+
+__inline__ __device__ float atomic_fetch_add(volatile float* const dest,
+                                             const float val) {
+  return atomicAdd((float*)dest, val);
+}
+
+#if (600 <= __CUDA_ARCH__)
+// Native double-precision atomicAdd requires compute capability >= 6.0.
+__inline__ __device__ double atomic_fetch_add(volatile double* const dest,
+                                              const double val) {
+  return atomicAdd((double*)dest, val);
+}
+#endif
+
+// Other 4-byte types: emulate via an atomicCAS retry loop, punning T to int.
+template <typename T>
+__inline__ __device__ T atomic_fetch_add(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) {
+  // to work around a bug in the clang cuda compiler, the name here needs to be
+  // different from the one internal to the other overloads
+  union U1 {
+    int i;
+    T t;
+    KOKKOS_INLINE_FUNCTION U1() {}
+  } assume, oldval, newval;
+
+  oldval.t = *dest;
+
+  do {
+    assume.i = oldval.i;
+    newval.t = assume.t + val;
+    oldval.i = atomicCAS((int*)dest, assume.i, newval.i);
+  } while (assume.i != oldval.i);
+
+  return oldval.t;
+}
+
+// Other 8-byte types: same CAS loop, punning through unsigned long long int.
+template <typename T>
+__inline__ __device__ T atomic_fetch_add(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) != sizeof(int) &&
+                                sizeof(T) == sizeof(unsigned long long int),
+                            const T>::type val) {
+  // to work around a bug in the clang cuda compiler, the name here needs to be
+  // different from the one internal to the other overloads
+  union U2 {
+    unsigned long long int i;
+    T t;
+    KOKKOS_INLINE_FUNCTION U2() {}
+  } assume, oldval, newval;
+
+  oldval.t = *dest;
+
+  do {
+    assume.i = oldval.i;
+    newval.t = assume.t + val;
+    oldval.i = atomicCAS((unsigned long long int*)dest, assume.i, newval.i);
+  } while (assume.i != oldval.i);
+
+  return oldval.t;
+}
+
+//----------------------------------------------------------------------------
+
+// Odd-sized types: serialize through the CUDA-space lock array, looping
+// until every active lane of the warp has taken its turn.
+template <typename T>
+__inline__ __device__ T
+atomic_fetch_add(volatile T* const dest,
+                 typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8),
+                                         const T>::type& val) {
+  T return_val;
+  // This is a way to (hopefully) avoid deadlock in a warp
+  int done = 0;
+#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
+  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
+#else
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+#endif
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      bool locked = Impl::lock_address_cuda_space((void*)dest);
+      if (locked) {
+        Kokkos::memory_fence();
+        return_val = *dest;
+        *dest      = return_val + val;
+        Kokkos::memory_fence();
+        Impl::unlock_address_cuda_space((void*)dest);
+        done = 1;
+      }
+    }
+
+#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
+    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
+#else
+    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
+#endif
+  }
+  return return_val;
+}
+#endif
+#endif
+//----------------------------------------------------------------------------
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
+
+#if defined(KOKKOS_ENABLE_ASM) && (defined(KOKKOS_ENABLE_ISA_X86_64) || \
+                                   defined(KOKKOS_KNL_USE_ASM_WORKAROUND))
+inline int atomic_fetch_add(volatile int* dest, const int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  int original = val;
+
+  // x86 "lock xadd": atomically adds 'original' to *dest and leaves the
+  // previous value of *dest in 'original'.
+  __asm__ __volatile__("lock xadd %1, %0"
+                       : "+m"(*dest), "+r"(original)
+                       : "m"(*dest), "r"(original)
+                       : "memory");
+
+  return original;
+}
+#else
+inline int atomic_fetch_add(volatile int* const dest, const int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_add(dest, val);
+}
+#endif
+
+inline long int atomic_fetch_add(volatile long int* const dest,
+                                 const long int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_add(dest, val);
+}
+
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS)
+
+inline unsigned int atomic_fetch_add(volatile unsigned int* const dest,
+                                     const unsigned int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_add(dest, val);
+}
+
+inline unsigned long int atomic_fetch_add(
+    volatile unsigned long int* const dest, const unsigned long int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_add(dest, val);
+}
+
+inline unsigned long long int atomic_fetch_add(
+    volatile unsigned long long int* const dest,
+    const unsigned long long int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_add(dest, val);
+}
+
+#endif
+
+// Generic 4-byte types: __sync compare-and-swap retry loop, punning T to int.
+template <typename T>
+inline T atomic_fetch_add(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) {
+  union U {
+    int i;
+    T t;
+    inline U() {}
+  } assume, oldval, newval;
+
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  oldval.t = *dest;
+
+  do {
+    assume.i = oldval.i;
+    newval.t = assume.t + val;
+    oldval.i = __sync_val_compare_and_swap((int*)dest, assume.i, newval.i);
+  } while (assume.i != oldval.i);
+
+  return oldval.t;
+}
+
+// Generic long-sized types: same CAS loop, punning through long.
+template <typename T>
+inline T atomic_fetch_add(volatile T* const dest,
+                          typename std::enable_if<sizeof(T) != sizeof(int) &&
+                                                      sizeof(T) == sizeof(long),
+                                                  const T>::type val) {
+  union U {
+    long i;
+    T t;
+    inline U() {}
+  } assume, oldval, newval;
+
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  oldval.t = *dest;
+
+  do {
+    assume.i = oldval.i;
+    newval.t = assume.t + val;
+    oldval.i = __sync_val_compare_and_swap((long*)dest, assume.i, newval.i);
+  } while (assume.i != oldval.i);
+
+  return oldval.t;
+}
+
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
+// 16-byte types on x86-64: CAS loop over the cmpxchg16b-based Impl::cas128.
+template <typename T>
+inline T atomic_fetch_add(
+    volatile T* const dest,
+    typename std::enable_if<sizeof(T) != sizeof(int) &&
+                                sizeof(T) != sizeof(long) &&
+                                sizeof(T) == sizeof(Impl::cas128_t),
+                            const T>::type val) {
+  union U {
+    Impl::cas128_t i;
+    T t;
+    inline U() {}
+  } assume, oldval, newval;
+
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+
+  oldval.t = *dest;
+
+  do {
+    assume.i = oldval.i;
+    newval.t = assume.t + val;
+    oldval.i = Impl::cas128((volatile Impl::cas128_t*)dest, assume.i, newval.i);
+  } while (assume.i != oldval.i);
+
+  return oldval.t;
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+// Any other size: serialize through the host-space per-address lock array.
+template <typename T>
+inline T atomic_fetch_add(
+    volatile T* const dest,
+    typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8)
+#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
+                                && (sizeof(T) != 16)
+#endif
+                            ,
+                            const T>::type& val) {
+  while (!Impl::lock_address_host_space((void*)dest))
+    ;
+  Kokkos::memory_fence();
+  T return_val = *dest;
+
+  // Don't use the following line of code here:
+  //
+  // const T tmp = *dest = return_val + val;
+  //
+  // Instead, put each assignment in its own statement. This is
+  // because the overload of T::operator= for volatile *this should
+  // return void, not volatile T&. See Kokkos #177:
+  //
+  // https://github.com/kokkos/kokkos/issues/177
+  *dest       = return_val + val;
+  const T tmp = *dest;
+  (void)tmp;
+  Kokkos::memory_fence();
+  Impl::unlock_address_host_space((void*)dest);
+
+  return return_val;
+}
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
+
+template <typename T>
+T atomic_fetch_add(volatile T* const dest, const T val) {
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] += val;
+  }
+  return retval;
+}
+
+#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
+
+// Serial backend: single-threaded by construction, no synchronization needed.
+template <typename T>
+T atomic_fetch_add(volatile T* const dest_v,
+                   typename std::add_const<T>::type val) {
+  T* dest  = const_cast<T*>(dest_v);
+  T retval = *dest;
+  *dest += val;
+  return retval;
+}
+
+#endif
+#endif
+//----------------------------------------------------------------------------
+
+// dummy for non-CUDA Kokkos headers being processed by NVCC
+#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
+template <typename T>
+__inline__ __device__ T atomic_fetch_add(volatile T* const,
+                                         Kokkos::Impl::identity_t<T>) {
+  return T();
+}
+#endif
+
+}  // namespace Kokkos
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..50bd2b0fa146cd68c5ec655efef01b506cdef41d
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
@@ -0,0 +1,192 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 3.0
+// Copyright (2020) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_AND_HPP)
+#define KOKKOS_ATOMIC_FETCH_AND_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// atomic_fetch_and: atomically perform *dest &= val and return the value
+// held *before* the AND.  atomic_and (bottom of file) discards the result.
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+
+// Support for int, unsigned int, and unsigned long long int
+
+__inline__ __device__ int atomic_fetch_and(volatile int* const dest,
+                                           const int val) {
+  return atomicAnd((int*)dest, val);
+}
+
+__inline__ __device__ unsigned int atomic_fetch_and(
+    volatile unsigned int* const dest, const unsigned int val) {
+  return atomicAnd((unsigned int*)dest, val);
+}
+
+#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__)
+// 64-bit atomicAnd requires compute capability >= 3.5.
+__inline__ __device__ unsigned long long int atomic_fetch_and(
+    volatile unsigned long long int* const dest,
+    const unsigned long long int val) {
+  return atomicAnd((unsigned long long int*)dest, val);
+}
+#endif
+#endif
+#endif
+
+// 08/05/20 Overload to work around https://bugs.llvm.org/show_bug.cgi?id=46922
+
+#if (defined(KOKKOS_ENABLE_CUDA) &&                  \
+     (defined(__CUDA_ARCH__) ||                      \
+      defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND))) || \
+    (defined(KOKKOS_ENABLE_HIP))
+__inline__ __device__ unsigned long atomic_fetch_and(
+    volatile unsigned long* const dest, const unsigned long val) {
+  return atomic_fetch_and<unsigned long>(dest, val);
+}
+__inline__ __device__ long atomic_fetch_and(volatile long* const dest,
+                                            long val) {
+  return atomic_fetch_and<long>(dest, val);
+}
+#endif
+
+//----------------------------------------------------------------------------
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
+
+// Host path: delegate to the GCC-style __sync_fetch_and_and builtin.
+inline int atomic_fetch_and(volatile int* const dest, const int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_and(dest, val);
+}
+
+inline long int atomic_fetch_and(volatile long int* const dest,
+                                 const long int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_and(dest, val);
+}
+
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS)
+
+inline unsigned int atomic_fetch_and(volatile unsigned int* const dest,
+                                     const unsigned int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_and(dest, val);
+}
+
+inline unsigned long int atomic_fetch_and(
+    volatile unsigned long int* const dest, const unsigned long int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_and(dest, val);
+}
+
+inline unsigned long long int atomic_fetch_and(
+    volatile unsigned long long int* const dest,
+    const unsigned long long int val) {
+#if defined(KOKKOS_ENABLE_RFO_PREFETCH)
+  _mm_prefetch((const char*)dest, _MM_HINT_ET0);
+#endif
+  return __sync_fetch_and_and(dest, val);
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
+
+template <typename T>
+T atomic_fetch_and(volatile T* const dest, const T val) {
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] &= val;
+  }
+  return retval;
+}
+
+#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
+
+// Serial backend: single-threaded by construction, no synchronization needed.
+template <typename T>
+T atomic_fetch_and(volatile T* const dest_v, const T val) {
+  T* dest  = const_cast<T*>(dest_v);
+  T retval = *dest;
+  *dest &= val;
+  return retval;
+}
+
+#endif
+#endif
+//----------------------------------------------------------------------------
+
+// dummy for non-CUDA Kokkos headers being processed by NVCC
+#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA)
+template <typename T>
+__inline__ __device__ T atomic_fetch_and(volatile T* const,
+                                         Kokkos::Impl::identity_t<T>) {
+  return T();
+}
+#endif
+
+// Simpler version of atomic_fetch_and without the fetch
+template <typename T>
+KOKKOS_INLINE_FUNCTION void atomic_and(volatile T* const dest, const T src) {
+  (void)atomic_fetch_and(dest, src);
+}
+
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a04a8c7cab1889608c958fd2b0102f776ad17d8
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
@@ -0,0 +1,193 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 3.0
+// Copyright (2020) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) +#include <xmmintrin.h> +#endif + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_OR_HPP) +#define KOKKOS_ATOMIC_FETCH_OR_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) + +// Support for int, unsigned int, unsigned long long int, and float + +__inline__ __device__ int atomic_fetch_or(volatile int* const dest, + const int val) { + return atomicOr((int*)dest, val); +} + +__inline__ __device__ unsigned int atomic_fetch_or( + volatile unsigned int* const dest, const unsigned int val) { + return atomicOr((unsigned int*)dest, val); +} + +#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__) +__inline__ __device__ unsigned long long int atomic_fetch_or( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return atomicOr((unsigned long long int*)dest, val); +} +#endif +#endif 
+#endif + +// 08/05/20 Overload to work around https://bugs.llvm.org/show_bug.cgi?id=46922 + +#if (defined(KOKKOS_ENABLE_CUDA) && \ + (defined(__CUDA_ARCH__) || \ + defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND))) || \ + (defined(KOKKOS_ENABLE_HIP)) +__inline__ __device__ unsigned long atomic_fetch_or( + volatile unsigned long* const dest, const unsigned long val) { + return atomic_fetch_or<unsigned long>(dest, val); +} + +__inline__ __device__ long atomic_fetch_or(volatile long* const dest, + long val) { + return atomic_fetch_or<long>(dest, val); +} +#endif + +//---------------------------------------------------------------------------- +#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) + +inline int atomic_fetch_or(volatile int* const dest, const int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_fetch_and_or(dest, val); +} + +inline long int atomic_fetch_or(volatile long int* const dest, + const long int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_fetch_and_or(dest, val); +} + +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) + +inline unsigned int atomic_fetch_or(volatile unsigned int* const dest, + const unsigned int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_fetch_and_or(dest, val); +} + +inline unsigned long int atomic_fetch_or(volatile unsigned long int* const dest, + const unsigned long int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_fetch_and_or(dest, val); +} + +inline unsigned long long int atomic_fetch_or( + volatile unsigned long long int* const dest, + const unsigned long long int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, 
_MM_HINT_ET0); +#endif + return __sync_fetch_and_or(dest, val); +} + +#endif + +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) + +template <typename T> +T atomic_fetch_or(volatile T* const dest, const T val) { + T retval; +#pragma omp atomic capture + { + retval = dest[0]; + dest[0] |= val; + } + return retval; +} + +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + +template <typename T> +T atomic_fetch_or(volatile T* const dest_v, const T val) { + T* dest = const_cast<T*>(dest_v); + T retval = *dest; + *dest |= val; + return retval; +} + +#endif +#endif +//---------------------------------------------------------------------------- + +// dummy for non-CUDA Kokkos headers being processed by NVCC +#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) +template <typename T> +__inline__ __device__ T atomic_fetch_or(volatile T* const, + Kokkos::Impl::identity_t<T>) { + return T(); +} +#endif + +// Simpler version of atomic_fetch_or without the fetch +template <typename T> +KOKKOS_INLINE_FUNCTION void atomic_or(volatile T* const dest, const T src) { + (void)atomic_fetch_or(dest, src); +} + +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c3446ae6a3bda89fac094ab9693688c2be9f77a5 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp @@ -0,0 +1,335 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) +#include <xmmintrin.h> +#endif + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_SUB_HPP) +#define KOKKOS_ATOMIC_FETCH_SUB_HPP + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp> +#endif + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) + +// Support for int, unsigned int, unsigned long long int, and float + +__inline__ __device__ int atomic_fetch_sub(volatile int* const dest, + const int val) { + return atomicSub((int*)dest, val); +} + +__inline__ __device__ unsigned int atomic_fetch_sub( + volatile unsigned int* const dest, const unsigned int val) { + return atomicSub((unsigned int*)dest, val); +} + +__inline__ __device__ unsigned int atomic_fetch_sub( + volatile int64_t* const dest, const int64_t val) { + return atomic_fetch_add(dest, -val); +} + +__inline__ __device__ unsigned int atomic_fetch_sub(volatile float* const dest, + const float val) { + return atomicAdd((float*)dest, -val); +} + +#if (600 <= __CUDA_ARCH__) +__inline__ __device__ unsigned int atomic_fetch_sub(volatile double* const dest, + const double val) { + return atomicAdd((double*)dest, -val); +} +#endif + +template <typename T> +__inline__ __device__ T atomic_fetch_sub( + volatile T* const dest, + typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) { + union U { + int i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } oldval, assume, newval; + + oldval.t = *dest; + + do { + assume.i = oldval.i; + newval.t = assume.t - val; + oldval.i = atomicCAS((int*)dest, assume.i, newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +template <typename T> +__inline__ 
__device__ T atomic_fetch_sub( + volatile T* const dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int), + const T>::type val) { + union U { + unsigned long long int i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } oldval, assume, newval; + + oldval.t = *dest; + + do { + assume.i = oldval.i; + newval.t = assume.t - val; + oldval.i = atomicCAS((unsigned long long int*)dest, assume.i, newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +//---------------------------------------------------------------------------- + +template <typename T> +__inline__ __device__ T +atomic_fetch_sub(volatile T* const dest, + typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8), + const T>::type& val) { + T return_val; + // This is a way to (hopefully) avoid dead lock in a warp + int done = 0; +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + unsigned int done_active = 0; + while (active != done_active) { + if (!done) { + if (Impl::lock_address_cuda_space((void*)dest)) { + Kokkos::memory_fence(); + return_val = *dest; + *dest = return_val - val; + Kokkos::memory_fence(); + Impl::unlock_address_cuda_space((void*)dest); + done = 1; + } + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done); +#else + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); +#endif + } + return return_val; +} +#endif +#endif +//---------------------------------------------------------------------------- +#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS) + +inline int atomic_fetch_sub(volatile int* const dest, const int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); 
+#endif + return __sync_fetch_and_sub(dest, val); +} + +inline long int atomic_fetch_sub(volatile long int* const dest, + const long int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_fetch_and_sub(dest, val); +} + +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) + +inline unsigned int atomic_fetch_sub(volatile unsigned int* const dest, + const unsigned int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_fetch_and_sub(dest, val); +} + +inline unsigned long int atomic_fetch_sub( + volatile unsigned long int* const dest, const unsigned long int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_fetch_and_sub(dest, val); +} + +inline unsigned long long int atomic_fetch_sub( + volatile unsigned long long int* const dest, + const unsigned long long int val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + return __sync_fetch_and_sub(dest, val); +} + +#endif + +template <typename T> +inline T atomic_fetch_sub( + volatile T* const dest, + typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) { + union U { + int i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } oldval, assume, newval; + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + oldval.t = *dest; + + do { + assume.i = oldval.i; + newval.t = assume.t - val; + oldval.i = __sync_val_compare_and_swap((int*)dest, assume.i, newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +template <typename T> +inline T atomic_fetch_sub(volatile T* const dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(long), + const T>::type val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + union U { + long i; + T t; + 
KOKKOS_INLINE_FUNCTION U() {} + } oldval, assume, newval; + + oldval.t = *dest; + + do { + assume.i = oldval.i; + newval.t = assume.t - val; + oldval.i = __sync_val_compare_and_swap((long*)dest, assume.i, newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +//---------------------------------------------------------------------------- + +template <typename T> +inline T atomic_fetch_sub( + volatile T* const dest, + typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8), + const T>::type& val) { +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)dest, _MM_HINT_ET0); +#endif + + while (!Impl::lock_address_host_space((void*)dest)) + ; + Kokkos::memory_fence(); + T return_val = *dest; + *dest = return_val - val; + Kokkos::memory_fence(); + Impl::unlock_address_host_space((void*)dest); + return return_val; +} + +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) + +template <typename T> +T atomic_fetch_sub(volatile T* const dest, const T val) { + T retval; +#pragma omp atomic capture + { + retval = dest[0]; + dest[0] -= val; + } + return retval; +} + +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + +template <typename T> +T atomic_fetch_sub(volatile T* const dest_v, const T val) { + T* dest = const_cast<T*>(dest_v); + T retval = *dest; + *dest -= val; + return retval; +} + +#endif +#endif + +// dummy for non-CUDA Kokkos headers being processed by NVCC +#if defined(__CUDA_ARCH__) && !defined(KOKKOS_ENABLE_CUDA) +template <typename T> +__inline__ __device__ T atomic_fetch_sub(volatile T* const, + Kokkos::Impl::identity_t<T>) { + return T(); +} +#endif + +} // namespace Kokkos + +#include <impl/Kokkos_Atomic_Assembly.hpp> +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..28ac7a3bab9e748f9d315ca479f57db885ed75c4 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp @@ -0,0 +1,574 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_GENERIC_HPP) +#define KOKKOS_ATOMIC_GENERIC_HPP +#include <Kokkos_Macros.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp> +#endif + +// Combination operands to be used in an Compare and Exchange based atomic +// operation +namespace Kokkos { +namespace Impl { + +template <class Op, class Scalar1, class Scalar2, class Enable = bool> +struct _check_early_exit_impl { + KOKKOS_FORCEINLINE_FUNCTION + static constexpr bool check(Op const&, Scalar1 const&, + Scalar2 const&) noexcept { + return false; + } +}; + +template <class Op, class Scalar1, class Scalar2> +struct _check_early_exit_impl< + Op, Scalar1, Scalar2, + decltype(std::declval<Op const&>().check_early_exit( + std::declval<Scalar1 const&>(), std::declval<Scalar2 const&>()))> { + KOKKOS_FORCEINLINE_FUNCTION + static constexpr bool check(Op const& op, Scalar1 const& v1, + Scalar2 const& v2) { + return op.check_early_exit(v1, v2); + } +}; + +template <class Op, class Scalar1, class Scalar2> +KOKKOS_FORCEINLINE_FUNCTION constexpr bool check_early_exit( + Op const& op, Scalar1 const& v1, Scalar2 const& v2) noexcept { + return _check_early_exit_impl<Op, Scalar1, Scalar2>::check(op, v1, v2); +} + +template <class Scalar1, class Scalar2> +struct MaxOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return (val1 > val2 ? 
val1 : val2); + } + KOKKOS_FORCEINLINE_FUNCTION + static constexpr bool check_early_exit(Scalar1 const& val1, + Scalar2 const& val2) noexcept { + return (val1 > val2); + } +}; + +template <class Scalar1, class Scalar2> +struct MinOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return (val1 < val2 ? val1 : val2); + } + KOKKOS_FORCEINLINE_FUNCTION + static constexpr bool check_early_exit(Scalar1 const& val1, + Scalar2 const& val2) noexcept { + return (val1 < val2); + } +}; + +template <class Scalar1, class Scalar2> +struct AddOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1 + val2; + } +}; + +template <class Scalar1, class Scalar2> +struct SubOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1 - val2; + } +}; + +template <class Scalar1, class Scalar2> +struct MulOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1 * val2; + } +}; + +template <class Scalar1, class Scalar2> +struct DivOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1 / val2; + } +}; + +template <class Scalar1, class Scalar2> +struct ModOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1 % val2; + } +}; + +template <class Scalar1, class Scalar2> +struct AndOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1 & val2; + } +}; + +template <class Scalar1, class Scalar2> +struct OrOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1 | val2; + } +}; + +template <class Scalar1, class Scalar2> +struct XorOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& 
val2) { + return val1 ^ val2; + } +}; + +template <class Scalar1, class Scalar2> +struct LShiftOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1 << val2; + } +}; + +template <class Scalar1, class Scalar2> +struct RShiftOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + return val1 >> val2; + } +}; + +template <class Oper, typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_oper( + const Oper& op, volatile T* const dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int), + const T>::type val) { + union U { + unsigned long long int i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } oldval, assume, newval; + + oldval.t = *dest; + + do { + if (check_early_exit(op, oldval.t, val)) return oldval.t; + assume.i = oldval.i; + newval.t = op.apply(assume.t, val); + oldval.i = Kokkos::atomic_compare_exchange((unsigned long long int*)dest, + assume.i, newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +template <class Oper, typename T> +KOKKOS_INLINE_FUNCTION T atomic_oper_fetch( + const Oper& op, volatile T* const dest, + typename std::enable_if<sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int), + const T>::type val) { + union U { + unsigned long long int i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } oldval, assume, newval; + + oldval.t = *dest; + + do { + if (check_early_exit(op, oldval.t, val)) return oldval.t; + assume.i = oldval.i; + newval.t = op.apply(assume.t, val); + oldval.i = Kokkos::atomic_compare_exchange((unsigned long long int*)dest, + assume.i, newval.i); + } while (assume.i != oldval.i); + + return newval.t; +} + +template <class Oper, typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_oper( + const Oper& op, volatile T* const dest, + typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) { + union U { + int i; + T t; + 
KOKKOS_INLINE_FUNCTION U() {} + } oldval, assume, newval; + + oldval.t = *dest; + + do { + if (check_early_exit(op, oldval.t, val)) return oldval.t; + assume.i = oldval.i; + newval.t = op.apply(assume.t, val); + oldval.i = Kokkos::atomic_compare_exchange((int*)dest, assume.i, newval.i); + } while (assume.i != oldval.i); + + return oldval.t; +} + +template <class Oper, typename T> +KOKKOS_INLINE_FUNCTION T atomic_oper_fetch( + const Oper& op, volatile T* const dest, + typename std::enable_if<sizeof(T) == sizeof(int), const T>::type val) { + union U { + int i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } oldval, assume, newval; + + oldval.t = *dest; + + do { + if (check_early_exit(op, oldval.t, val)) return oldval.t; + assume.i = oldval.i; + newval.t = op.apply(assume.t, val); + oldval.i = Kokkos::atomic_compare_exchange((int*)dest, assume.i, newval.i); + } while (assume.i != oldval.i); + + return newval.t; +} + +template <class Oper, typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_oper( + const Oper& op, volatile T* const dest, + typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8), const T>::type + val) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + while (!Impl::lock_address_host_space((void*)dest)) + ; + Kokkos::memory_fence(); + T return_val = *dest; + *dest = op.apply(return_val, val); + Kokkos::memory_fence(); + Impl::unlock_address_host_space((void*)dest); + return return_val; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) + // This is a way to (hopefully) avoid dead lock in a warp + T return_val; + int done = 0; +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + unsigned int done_active = 0; + while (active != done_active) { + if (!done) { + if (Impl::lock_address_cuda_space((void*)dest)) { + Kokkos::memory_fence(); + return_val = *dest; + *dest = 
op.apply(return_val, val); + Kokkos::memory_fence(); + Impl::unlock_address_cuda_space((void*)dest); + done = 1; + } + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done); +#else + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); +#endif + } + return return_val; +#elif defined(__HIP_DEVICE_COMPILE__) + T return_val = *dest; + int done = 0; + unsigned int active = __ballot(1); + unsigned int done_active = 0; + while (active != done_active) { + if (!done) { + if (Impl::lock_address_hip_space((void*)dest)) { + return_val = *dest; + *dest = op.apply(return_val, val); + Impl::unlock_address_hip_space((void*)dest); + done = 1; + } + } + done_active = __ballot(done); + } + return return_val; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + // FIXME_SYCL + Kokkos::abort("Not implemented!"); + (void)op; + (void)dest; + (void)val; + return 0; +#endif +} + +template <class Oper, typename T> +KOKKOS_INLINE_FUNCTION T +atomic_oper_fetch(const Oper& op, volatile T* const dest, + typename std::enable_if<(sizeof(T) != 4) && (sizeof(T) != 8) +#if defined(KOKKOS_ENABLE_ASM) && \ + defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + && (sizeof(T) != 16) +#endif + , + const T>::type& val) { + +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + while (!Impl::lock_address_host_space((void*)dest)) + ; + Kokkos::memory_fence(); + T return_val = op.apply(*dest, val); + *dest = return_val; + Kokkos::memory_fence(); + Impl::unlock_address_host_space((void*)dest); + return return_val; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) + T return_val; + // This is a way to (hopefully) avoid dead lock in a warp + int done = 0; +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); +#else + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); +#endif + unsigned int done_active = 0; + while (active != done_active) { + if (!done) { 
+ if (Impl::lock_address_cuda_space((void*)dest)) { + Kokkos::memory_fence(); + return_val = op.apply(*dest, val); + *dest = return_val; + Kokkos::memory_fence(); + Impl::unlock_address_cuda_space((void*)dest); + done = 1; + } + } +#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK + done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done); +#else + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); +#endif + } + return return_val; +#elif defined(__HIP_DEVICE_COMPILE__) + T return_val; + int done = 0; + unsigned int active = __ballot(1); + unsigned int done_active = 0; + while (active != done_active) { + if (!done) { + if (Impl::lock_address_hip_space((void*)dest)) { + return_val = op.apply(*dest, val); + *dest = return_val; + Impl::unlock_address_hip_space((void*)dest); + done = 1; + } + } + done_active = __ballot(done); + } + return return_val; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + // FIXME_SYCL + std::abort(); + (void)op; + (void)dest; + (void)val; + return 0; +#endif +} + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +// Fetch_Oper atomics: return value before operation +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_max(volatile T* const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::MaxOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_min(volatile T* const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::MinOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_mul(volatile T* const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::MulOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_div(volatile T* const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::DivOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_mod(volatile T* const dest, const T val) { + return 
Impl::atomic_fetch_oper(Impl::ModOper<T, const T>(), dest, val); +} + +#if !defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_and(volatile T* const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::AndOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_or(volatile T* const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::OrOper<T, const T>(), dest, val); +} + +#endif + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_xor(volatile T* const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::XorOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_lshift(volatile T* const dest, + const unsigned int val) { + return Impl::atomic_fetch_oper(Impl::LShiftOper<T, const unsigned int>(), + dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_rshift(volatile T* const dest, + const unsigned int val) { + return Impl::atomic_fetch_oper(Impl::RShiftOper<T, const unsigned int>(), + dest, val); +} + +// Oper Fetch atomics: return value after operation +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_max_fetch(volatile T* const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::MaxOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_min_fetch(volatile T* const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::MinOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_mul_fetch(volatile T* const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::MulOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_div_fetch(volatile T* const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::DivOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_mod_fetch(volatile T* const dest, const T val) 
{ + return Impl::atomic_oper_fetch(Impl::ModOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_and_fetch(volatile T* const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::AndOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_or_fetch(volatile T* const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::OrOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_xor_fetch(volatile T* const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::XorOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_lshift_fetch(volatile T* const dest, + const unsigned int val) { + return Impl::atomic_oper_fetch(Impl::LShiftOper<T, const unsigned int>(), + dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_rshift_fetch(volatile T* const dest, + const unsigned int val) { + return Impl::atomic_oper_fetch(Impl::RShiftOper<T, const unsigned int>(), + dest, val); +} + +#ifdef _WIN32 +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_add_fetch(volatile T* const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::AddOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_sub_fetch(volatile T* const dest, const T val) { + return Impl::atomic_oper_fetch(Impl::SubOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_add(volatile T* const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::AddOper<T, const T>(), dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_fetch_sub(volatile T* const dest, const T val) { + return Impl::atomic_fetch_oper(Impl::SubOper<T, const T>(), dest, val); +} +#endif + +} // namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp 
new file mode 100644 index 0000000000000000000000000000000000000000..7ab6358434e1c0129672fcf04a7d5eba83e61e87 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic_Secondary.hpp @@ -0,0 +1,86 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_GENERIC_SECONDARY_HPP) +#define KOKKOS_ATOMIC_GENERIC_SECONDARY_HPP +#include <Kokkos_Macros.hpp> + +namespace Kokkos { + +#ifndef KOKKOS_ENABLE_SERIAL_ATOMICS +template <typename T> +KOKKOS_INLINE_FUNCTION T atomic_exchange(volatile T* const dest, const T val) { + T oldval = *dest; + T assume; + do { + assume = oldval; + oldval = atomic_compare_exchange(dest, assume, val); + } while (assume != oldval); + + return oldval; +} +#endif + +template <typename T> +KOKKOS_INLINE_FUNCTION void atomic_add(volatile T* const dest, const T val) { + (void)atomic_fetch_add(dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION void atomic_sub(volatile T* const dest, const T val) { + (void)atomic_fetch_sub(dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION void atomic_mul(volatile T* const dest, const T val) { + (void)atomic_fetch_mul(dest, val); +} + +template <typename T> +KOKKOS_INLINE_FUNCTION void atomic_div(volatile T* const dest, const T val) { + (void)atomic_fetch_div(dest, val); +} + +} // namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..65630aa84cdf9845afca761d56e68c6ffe3ef269 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp @@ -0,0 +1,147 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) +#include <xmmintrin.h> +#endif + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_INCREMENT_HPP) +#define KOKKOS_ATOMIC_INCREMENT_HPP + +namespace Kokkos { + +// Atomic increment +template <> +KOKKOS_INLINE_FUNCTION void atomic_increment<char>(volatile char* a) { +#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ + !defined(_WIN32) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)a, _MM_HINT_ET0); +#endif + __asm__ __volatile__("lock incb %0" + : /* no output registers */ + : "m"(a[0]) + : "memory"); +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + char* a_nv = const_cast<char*>(a); + ++(*a_nv); +#else + Kokkos::atomic_fetch_add(a, char(1)); +#endif +} + +template <> +KOKKOS_INLINE_FUNCTION void atomic_increment<short>(volatile short* a) { +#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ + !defined(_WIN32) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)a, _MM_HINT_ET0); +#endif + __asm__ __volatile__("lock incw %0" + : /* no output registers */ + : "m"(a[0]) + : "memory"); +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + short* a_nv = const_cast<short*>(a); + ++(*a_nv); +#else + Kokkos::atomic_fetch_add(a, short(1)); +#endif +} + +#ifndef _WIN32 +template <> +KOKKOS_INLINE_FUNCTION void atomic_increment<int>(volatile int* a) { +#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ + !defined(_WIN32) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)a, _MM_HINT_ET0); +#endif + __asm__ __volatile__("lock incl %0" + : /* no output registers */ + : "m"(a[0]) + : "memory"); +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + int* 
a_nv = const_cast<int*>(a); + ++(*a_nv); +#else + Kokkos::atomic_fetch_add(a, int(1)); +#endif +} +#endif + +template <> +KOKKOS_INLINE_FUNCTION void atomic_increment<long long int>( + volatile long long int* a) { +#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) && \ + !defined(_WIN32) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_ENABLE_RFO_PREFETCH) + _mm_prefetch((const char*)a, _MM_HINT_ET0); +#endif + __asm__ __volatile__("lock incq %0" + : /* no output registers */ + : "m"(a[0]) + : "memory"); +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + long long int* a_nv = const_cast<long long int*>(a); + ++(*a_nv); +#else + using T = long long int; + Kokkos::atomic_fetch_add(a, T(1)); +#endif +} + +template <typename T> +KOKKOS_INLINE_FUNCTION void atomic_increment(volatile T* a) { +#if defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + T* a_nv = const_cast<T*>(a); + ++(*a_nv); +#else + Kokkos::atomic_fetch_add(a, T(1)); +#endif +} + +} // End of namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f3b77a297629867a11ef25225b1302f1d68aa937 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Load.hpp @@ -0,0 +1,230 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_KOKKOS_ATOMIC_LOAD_HPP +#define KOKKOS_IMPL_KOKKOS_ATOMIC_LOAD_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ATOMIC_HPP) + +#include <impl/Kokkos_Atomic_Memory_Order.hpp> +#include <impl/Kokkos_Atomic_Generic.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp> +#endif + +namespace Kokkos { +namespace Impl { + +// Olivier's implementation helpfully binds to the same builtins as GNU, so +// we make this code common across multiple options +#if (defined(KOKKOS_ENABLE_GNU_ATOMICS) && !defined(__CUDA_ARCH__)) || \ + (defined(KOKKOS_ENABLE_INTEL_ATOMICS) && !defined(__CUDA_ARCH__)) || \ + defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) + +#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) +#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH __inline__ __device__ +#else +#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH inline +#endif + +template <class T, class MemoryOrder> +KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH T _atomic_load( + T* ptr, MemoryOrder, + typename std::enable_if< + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || + sizeof(T) == 8) && + std::is_same<typename MemoryOrder::memory_order, + typename std::remove_cv<MemoryOrder>::type>::value, + void const**>::type = nullptr) { + return __atomic_load_n(ptr, MemoryOrder::gnu_constant); +} + +template <class T, class MemoryOrder> +KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH T _atomic_load( + T* ptr, MemoryOrder, + typename std::enable_if< + !(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || + sizeof(T) == 8) && + std::is_default_constructible<T>::value && + std::is_same<typename MemoryOrder::memory_order, + typename std::remove_cv<MemoryOrder>::type>::value, + void const**>::type = nullptr) { + T rv{}; + __atomic_load(ptr, &rv, MemoryOrder::gnu_constant); + return rv; +} + +#undef 
KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH + +#elif defined(__CUDA_ARCH__) + +// Not compiling for Volta or later, or Cuda ASM atomics were manually disabled + +template <class T> +__device__ __inline__ T _relaxed_atomic_load_impl( + T* ptr, typename std::enable_if<(sizeof(T) == 1 || sizeof(T) == 2 || + sizeof(T) == 4 || sizeof(T) == 8), + void const**>::type = nullptr) { + return *ptr; +} + +template <class T> +struct NoOpOper { + __device__ __inline__ static constexpr T apply(T const& t, + T const&) noexcept { + return t; + } +}; + +template <class T> +__device__ __inline__ T _relaxed_atomic_load_impl( + T* ptr, typename std::enable_if<!(sizeof(T) == 1 || sizeof(T) == 2 || + sizeof(T) == 4 || sizeof(T) == 8), + void const**>::type = nullptr) { + T rv{}; + // TODO remove a copy operation here? + return Kokkos::Impl::atomic_oper_fetch(NoOpOper<T>{}, ptr, rv); +} + +template <class T> +__device__ __inline__ T _atomic_load(T* ptr, memory_order_seq_cst_t) { + Kokkos::memory_fence(); + T rv = Impl::_relaxed_atomic_load_impl(ptr); + Kokkos::memory_fence(); + return rv; +} + +template <class T> +__device__ __inline__ T _atomic_load(T* ptr, memory_order_acquire_t) { + T rv = Impl::_relaxed_atomic_load_impl(ptr); + Kokkos::memory_fence(); + return rv; +} + +template <class T> +__device__ __inline__ T _atomic_load(T* ptr, memory_order_relaxed_t) { + return _relaxed_atomic_load_impl(ptr); +} + +#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) + +template <class T, class MemoryOrder> +inline T _atomic_load(T* ptr, MemoryOrder) { + // AFAICT, all OpenMP atomics are sequentially consistent, so memory order + // doesn't matter + T retval{}; +#pragma omp atomic read + { retval = *ptr; } + return retval; +} + +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + +template <class T, class MemoryOrder> +inline T _atomic_load(T* ptr, MemoryOrder) { + return *ptr; +} + +#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) + +template <class T, class MemoryOrder> +inline T _atomic_load(T* ptr, 
MemoryOrder) { + atomic_compare_exchange(ptr, 0, 0); + return *ptr; +} + +#endif // end of all atomic implementations + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr, + Impl::memory_order_seq_cst_t) { + return _atomic_load(ptr, Impl::memory_order_seq_cst); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr, + Impl::memory_order_acquire_t) { + return _atomic_load(ptr, Impl::memory_order_acquire); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr, + Impl::memory_order_relaxed_t) { + return _atomic_load(ptr, Impl::memory_order_relaxed); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* /*ptr*/, + Impl::memory_order_release_t) { + static_assert( + sizeof(T) == 0, // just something that will always be false, but only on + // instantiation + "atomic_load with memory order release doesn't make any sense!"); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* /*ptr*/, + Impl::memory_order_acq_rel_t) { + static_assert( + sizeof(T) == 0, // just something that will always be false, but only on + // instantiation + "atomic_load with memory order acq_rel doesn't make any sense!"); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION T atomic_load(T* ptr) { + // relaxed by default! 
+ return _atomic_load(ptr, Impl::memory_order_relaxed); +} + +} // end namespace Impl +} // end namespace Kokkos + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp> +#endif + +#endif // defined(KOKKOS_ATOMIC_HPP) +#endif // KOKKOS_IMPL_KOKKOS_ATOMIC_LOAD_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Memory_Order.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Memory_Order.hpp new file mode 100644 index 0000000000000000000000000000000000000000..72a6dfa9a466e0161b4b98ec8069f99187c748a2 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Memory_Order.hpp @@ -0,0 +1,121 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_ATOMIC_MEMORY_ORDER_HPP +#define KOKKOS_KOKKOS_ATOMIC_MEMORY_ORDER_HPP + +#include <Kokkos_Macros.hpp> + +#include <atomic> + +namespace Kokkos { +namespace Impl { + +/** @file + * Provides strongly-typed analogs of the standard memory order enumerators. + * In addition to (very slightly) reducing the constant propagation burden on + * the compiler, this allows us to give compile-time errors for things that + * don't make sense, like atomic_load with memory order release. 
+ */ + +struct memory_order_seq_cst_t { + using memory_order = memory_order_seq_cst_t; +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ + defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ + defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) + static constexpr auto gnu_constant = __ATOMIC_SEQ_CST; +#endif + static constexpr auto std_constant = std::memory_order_seq_cst; +}; +constexpr memory_order_seq_cst_t memory_order_seq_cst = {}; + +struct memory_order_relaxed_t { + using memory_order = memory_order_relaxed_t; +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ + defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ + defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) + static constexpr auto gnu_constant = __ATOMIC_RELAXED; +#endif + static constexpr auto std_constant = std::memory_order_relaxed; +}; +constexpr memory_order_relaxed_t memory_order_relaxed = {}; + +struct memory_order_acquire_t { + using memory_order = memory_order_acquire_t; +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ + defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ + defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) + static constexpr auto gnu_constant = __ATOMIC_ACQUIRE; +#endif + static constexpr auto std_constant = std::memory_order_acquire; +}; +constexpr memory_order_acquire_t memory_order_acquire = {}; + +struct memory_order_release_t { + using memory_order = memory_order_release_t; +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ + defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ + defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) + static constexpr auto gnu_constant = __ATOMIC_RELEASE; +#endif + static constexpr auto std_constant = std::memory_order_release; +}; +constexpr memory_order_release_t memory_order_release = {}; + +struct memory_order_acq_rel_t { + using memory_order = memory_order_acq_rel_t; +#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || \ + defined(KOKKOS_ENABLE_INTEL_ATOMICS) || \ + defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) + static constexpr auto gnu_constant = __ATOMIC_ACQ_REL; +#endif + static constexpr auto std_constant = std::memory_order_acq_rel; +}; +constexpr 
memory_order_acq_rel_t memory_order_acq_rel = {}; + +// Intentionally omit consume (for now) + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_ATOMIC_MEMORY_ORDER_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7338a5c545f25f58662c2e05c4a20bda4992e203 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_MinMax.hpp @@ -0,0 +1,319 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_MINMAX_HPP) +#define KOKKOS_ATOMIC_MINMAX_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) + +// Support for int, unsigned int, unsigned long long int, and float + +// Atomic_fetch_{min,max} + +#ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND + +// Host implementations for CLANG compiler + +inline __host__ int atomic_fetch_min(volatile int* const dest, const int val) { + return Impl::atomic_fetch_oper(Impl::MinOper<const int, const int>(), dest, + val); +} + +inline __host__ unsigned int atomic_fetch_min(volatile unsigned int* const dest, + const unsigned int val) { + return Impl::atomic_fetch_oper( + Impl::MinOper<const unsigned int, const unsigned int>(), dest, val); +} + +inline __host__ unsigned long long int atomic_fetch_min( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return Impl::atomic_fetch_oper(Impl::MinOper<const unsigned long long int, + const unsigned long long int>(), + dest, val); +} + +inline __host__ int atomic_fetch_max(volatile int* const dest, const int val) { + return 
Impl::atomic_fetch_oper(Impl::MaxOper<const int, const int>(), dest, + val); +} + +inline __host__ unsigned int atomic_fetch_max(volatile unsigned int* const dest, + const unsigned int val) { + return Impl::atomic_fetch_oper( + Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val); +} + +inline __host__ unsigned long long int atomic_fetch_max( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return Impl::atomic_fetch_oper(Impl::MaxOper<const unsigned long long int, + const unsigned long long int>(), + dest, val); +} + +#endif + +#if (350 > __CUDA_ARCH__) + +// Fallback for atomic{Min,Max} for Kepler + +inline __device__ int atomic_fetch_min(volatile int* const dest, + const int val) { + return Impl::atomic_fetch_oper(Impl::MinOper<const int, const int>(), dest, + val); +} + +inline __device__ unsigned int atomic_fetch_min( + volatile unsigned int* const dest, const unsigned int val) { + return Impl::atomic_fetch_oper( + Impl::MinOper<const unsigned int, const unsigned int>(), dest, val); +} + +inline __device__ unsigned long long int atomic_fetch_min( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return Impl::atomic_fetch_oper(Impl::MinOper<const unsigned long long int, + const unsigned long long int>(), + dest, val); +} + +inline __device__ int atomic_fetch_max(volatile int* const dest, + const int val) { + return Impl::atomic_fetch_oper(Impl::MaxOper<const int, const int>(), dest, + val); +} + +inline __device__ unsigned int atomic_fetch_max( + volatile unsigned int* const dest, const unsigned int val) { + return Impl::atomic_fetch_oper( + Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val); +} + +inline __device__ unsigned long long int atomic_fetch_max( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return Impl::atomic_fetch_oper(Impl::MaxOper<const unsigned long long int, + const unsigned long long int>(), + dest, 
val); +} + +#else // Supported by devices of compute capability 3.5 and higher + +inline __device__ int atomic_fetch_min(volatile int* const dest, + const int val) { + return atomicMin((int*)dest, val); +} + +inline __device__ unsigned int atomic_fetch_min( + volatile unsigned int* const dest, const unsigned int val) { + return atomicMin((unsigned int*)dest, val); +} + +inline __device__ unsigned long long int atomic_fetch_min( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return atomicMin((unsigned long long int*)dest, val); +} + +inline __device__ int atomic_fetch_max(volatile int* const dest, + const int val) { + return atomicMax((int*)dest, val); +} + +inline __device__ unsigned int atomic_fetch_max( + volatile unsigned int* const dest, const unsigned int val) { + return atomicMax((unsigned int*)dest, val); +} + +inline __device__ unsigned long long int atomic_fetch_max( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return atomicMax((unsigned long long int*)dest, val); +} + +#endif + +// Atomic_{min,max}_fetch + +#ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND + +// Host implementations for CLANG compiler + +inline __host__ int atomic_min_fetch(volatile int* const dest, const int val) { + return Impl::atomic_oper_fetch(Impl::MinOper<const int, const int>(), dest, + val); +} + +inline __host__ unsigned int atomic_min_fetch(volatile unsigned int* const dest, + const unsigned int val) { + return Impl::atomic_oper_fetch( + Impl::MinOper<const unsigned int, const unsigned int>(), dest, val); +} + +inline __host__ unsigned long long int atomic_min_fetch( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return Impl::atomic_oper_fetch(Impl::MinOper<const unsigned long long int, + const unsigned long long int>(), + dest, val); +} + +inline __host__ int atomic_max_fetch(volatile int* const dest, const int val) { + return 
Impl::atomic_oper_fetch(Impl::MaxOper<const int, const int>(), dest, + val); +} + +inline __host__ unsigned int atomic_max_fetch(volatile unsigned int* const dest, + const unsigned int val) { + return Impl::atomic_oper_fetch( + Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val); +} + +inline __host__ unsigned long long int atomic_max_fetch( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return Impl::atomic_oper_fetch(Impl::MaxOper<const unsigned long long int, + const unsigned long long int>(), + dest, val); +} +#endif + +#if (350 > __CUDA_ARCH__) + +// Fallback for atomic{Min,Max} for Kepler + +inline __device__ int atomic_min_fetch(volatile int* const dest, + const int val) { + return Impl::atomic_oper_fetch(Impl::MinOper<const int, const int>(), dest, + val); +} + +inline __device__ unsigned int atomic_min_fetch( + volatile unsigned int* const dest, const unsigned int val) { + return Impl::atomic_oper_fetch( + Impl::MinOper<const unsigned int, const unsigned int>(), dest, val); +} + +inline __device__ unsigned long long int atomic_min_fetch( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return Impl::atomic_oper_fetch(Impl::MinOper<const unsigned long long int, + const unsigned long long int>(), + dest, val); +} + +inline __device__ int atomic_max_fetch(volatile int* const dest, + const int val) { + return Impl::atomic_oper_fetch(Impl::MaxOper<const int, const int>(), dest, + val); +} + +inline __device__ unsigned int atomic_max_fetch( + volatile unsigned int* const dest, const unsigned int val) { + return Impl::atomic_oper_fetch( + Impl::MaxOper<const unsigned int, const unsigned int>(), dest, val); +} + +inline __device__ unsigned long long int atomic_max_fetch( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + return Impl::atomic_oper_fetch(Impl::MaxOper<const unsigned long long int, + const unsigned long long int>(), + dest, 
val); +} + +#else // Supported by devices of compute capability 3.5 and higher + +inline __device__ int atomic_min_fetch(volatile int* const dest, + const int val) { + const int old = atomicMin((int*)dest, val); + return old < val ? old : val; +} + +inline __device__ unsigned int atomic_min_fetch( + volatile unsigned int* const dest, const unsigned int val) { + const unsigned int old = atomicMin((unsigned int*)dest, val); + return old < val ? old : val; +} + +inline __device__ unsigned long long int atomic_min_fetch( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + const unsigned long long old = atomicMin((unsigned long long*)dest, val); + return old < val ? old : val; +} + +inline __device__ int atomic_max_fetch(volatile int* const dest, + const int val) { + const int old = atomicMax((int*)dest, val); + return old >= val ? old : val; +} + +inline __device__ unsigned int atomic_max_fetch( + volatile unsigned int* const dest, const unsigned int val) { + const unsigned int old = atomicMax((unsigned int*)dest, val); + return old >= val ? old : val; +} + +inline __device__ unsigned long long int atomic_max_fetch( + volatile unsigned long long int* const dest, + const unsigned long long int val) { + const unsigned long long old = atomicMax((unsigned long long*)dest, val); + return old >= val ? old : val; +} + +#endif + +#endif +#endif +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp new file mode 100644 index 0000000000000000000000000000000000000000..264d6beaf5d8a9f3741deafca4c67820c0649b90 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Store.hpp @@ -0,0 +1,226 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_KOKKOS_ATOMIC_STORE_HPP +#define KOKKOS_IMPL_KOKKOS_ATOMIC_STORE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ATOMIC_HPP) + +#include <impl/Kokkos_Atomic_Memory_Order.hpp> +#include <impl/Kokkos_Atomic_Generic.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics.hpp> +#endif + +namespace Kokkos { +namespace Impl { + +// Olivier's implementation helpfully binds to the same builtins as GNU, so +// we make this code common across multiple options +#if (defined(KOKKOS_ENABLE_GNU_ATOMICS) && !defined(__CUDA_ARCH__)) || \ + (defined(KOKKOS_ENABLE_INTEL_ATOMICS) && !defined(__CUDA_ARCH__)) || \ + defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) + +#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA_ASM_ATOMICS) +#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH __inline__ __device__ +#else +#define KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH inline +#endif + +template <class T, class MemoryOrder> +KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH void _atomic_store( + T* ptr, T val, MemoryOrder, + typename std::enable_if< + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || + sizeof(T) == 8) && + std::is_same<typename MemoryOrder::memory_order, + typename std::remove_cv<MemoryOrder>::type>::value, + void const**>::type = nullptr) { + __atomic_store_n(ptr, val, MemoryOrder::gnu_constant); +} + +template <class T, class MemoryOrder> +KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH void _atomic_store( + T* ptr, T val, MemoryOrder, + typename std::enable_if< + !(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || + sizeof(T) == 8) && + std::is_default_constructible<T>::value && + std::is_same<typename MemoryOrder::memory_order, + typename std::remove_cv<MemoryOrder>::type>::value, + void const**>::type = nullptr) { + __atomic_store(ptr, &val, MemoryOrder::gnu_constant); +} + +#undef 
KOKKOS_INTERNAL_INLINE_DEVICE_IF_CUDA_ARCH + +#elif defined(__CUDA_ARCH__) + +// Not compiling for Volta or later, or Cuda ASM atomics were manually disabled + +template <class T> +__device__ __inline__ void _relaxed_atomic_store_impl( + T* ptr, T val, + typename std::enable_if<(sizeof(T) == 1 || sizeof(T) == 2 || + sizeof(T) == 4 || sizeof(T) == 8), + void const**>::type = nullptr) { + *ptr = val; +} + +template <class T> +struct StoreOper { + __device__ __inline__ static constexpr T apply(T const&, + T const& val) noexcept { + return val; + } +}; + +template <class T> +__device__ __inline__ void _relaxed_atomic_store_impl( + T* ptr, T val, + typename std::enable_if<!(sizeof(T) == 1 || sizeof(T) == 2 || + sizeof(T) == 4 || sizeof(T) == 8), + void const**>::type = nullptr) { + Kokkos::Impl::atomic_oper_fetch(StoreOper<T>{}, ptr, (T &&) val); +} + +template <class T> +__device__ __inline__ void _atomic_store(T* ptr, T val, + memory_order_seq_cst_t) { + Kokkos::memory_fence(); + Impl::_relaxed_atomic_store_impl(ptr, val); + Kokkos::memory_fence(); +} + +template <class T> +__device__ __inline__ void _atomic_store(T* ptr, T val, + memory_order_release_t) { + Kokkos::memory_fence(); + _relaxed_atomic_store_impl(ptr, val); +} + +template <class T> +__device__ __inline__ void _atomic_store(T* ptr, T val, + memory_order_relaxed_t) { + _relaxed_atomic_store_impl(ptr, val); +} + +#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS) + +template <class T, class MemoryOrder> +inline void _atomic_store(T* ptr, T val, MemoryOrder) { + // AFAICT, all OpenMP atomics are sequentially consistent, so memory order + // doesn't matter +#pragma omp atomic write + { *ptr = val; } +} + +#elif defined(KOKKOS_ENABLE_SERIAL_ATOMICS) + +template <class T, class MemoryOrder> +inline void _atomic_store(T* ptr, T val, MemoryOrder) { + *ptr = val; +} + +#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS) + +template <class T, class MemoryOrder> +inline void _atomic_store(T* ptr, T val, MemoryOrder) { + 
atomic_exchange(ptr, val); +} + +#endif // end of all atomic implementations + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val, + Impl::memory_order_seq_cst_t) { + _atomic_store(ptr, val, Impl::memory_order_seq_cst); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val, + Impl::memory_order_release_t) { + _atomic_store(ptr, val, Impl::memory_order_release); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val, + Impl::memory_order_relaxed_t) { + _atomic_store(ptr, val, Impl::memory_order_relaxed); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* /*ptr*/, T /*val*/, + Impl::memory_order_acquire_t) { + static_assert( + sizeof(T) == 0, // just something that will always be false, but only on + // instantiation + "atomic_store with memory order acquire doesn't make any sense!"); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* /*ptr*/, T /*val*/, + Impl::memory_order_acq_rel_t) { + static_assert( + sizeof(T) == 0, // just something that will always be false, but only on + // instantiation + "atomic_store with memory order acq_rel doesn't make any sense!"); +} + +template <class T> +KOKKOS_FORCEINLINE_FUNCTION void atomic_store(T* ptr, T val) { + // relaxed by default! 
+ _atomic_store(ptr, val, Impl::memory_order_relaxed); +} + +} // end namespace Impl +} // end namespace Kokkos + +#if defined(KOKKOS_ENABLE_CUDA) +#include <Cuda/Kokkos_Cuda_Atomic_Intrinsics_Restore_Builtins.hpp> +#endif + +#endif // defined(KOKKOS_ATOMIC_HPP) +#endif // KOKKOS_IMPL_KOKKOS_ATOMIC_STORE_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp new file mode 100644 index 0000000000000000000000000000000000000000..975318b7dde67a1d1569c3cf657060c3ae18215d --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp @@ -0,0 +1,385 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_ATOMIC_VIEW_HPP +#define KOKKOS_ATOMIC_VIEW_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Atomic.hpp> + +namespace Kokkos { +namespace Impl { + +// The following tag is used to prevent an implicit call of the constructor when +// trying to assign a literal 0 int ( = 0 ); +struct AtomicViewConstTag {}; + +template <class ViewTraits> +class AtomicDataElement { + public: + using value_type = typename ViewTraits::value_type; + using const_value_type = typename ViewTraits::const_value_type; + using non_const_value_type = typename ViewTraits::non_const_value_type; + volatile value_type* const ptr; + + KOKKOS_INLINE_FUNCTION + AtomicDataElement(value_type* ptr_, AtomicViewConstTag) : ptr(ptr_) {} + + KOKKOS_INLINE_FUNCTION + const_value_type operator=(const_value_type& val) const { + *ptr = val; + return val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator=(volatile const_value_type& val) const { + *ptr = val; + return val; + } + + KOKKOS_INLINE_FUNCTION + void inc() const { Kokkos::atomic_increment(ptr); } + + KOKKOS_INLINE_FUNCTION + void dec() const { Kokkos::atomic_decrement(ptr); } + + KOKKOS_INLINE_FUNCTION + const_value_type operator++() const { + const_value_type tmp = + Kokkos::atomic_fetch_add(ptr, non_const_value_type(1)); + return tmp + 1; + } + 
+ KOKKOS_INLINE_FUNCTION + const_value_type operator--() const { + const_value_type tmp = + Kokkos::atomic_fetch_sub(ptr, non_const_value_type(1)); + return tmp - 1; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator++(int) const { + return Kokkos::atomic_fetch_add(ptr, non_const_value_type(1)); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator--(int) const { + return Kokkos::atomic_fetch_sub(ptr, non_const_value_type(1)); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator+=(const_value_type& val) const { + const_value_type tmp = Kokkos::atomic_fetch_add(ptr, val); + return tmp + val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator+=(volatile const_value_type& val) const { + const_value_type tmp = Kokkos::atomic_fetch_add(ptr, val); + return tmp + val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator-=(const_value_type& val) const { + const_value_type tmp = Kokkos::atomic_fetch_sub(ptr, val); + return tmp - val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator-=(volatile const_value_type& val) const { + const_value_type tmp = Kokkos::atomic_fetch_sub(ptr, val); + return tmp - val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator*=(const_value_type& val) const { + return Kokkos::atomic_mul_fetch(ptr, val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator*=(volatile const_value_type& val) const { + return Kokkos::atomic_mul_fetch(ptr, val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator/=(const_value_type& val) const { + return Kokkos::atomic_div_fetch(ptr, val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator/=(volatile const_value_type& val) const { + return Kokkos::atomic_div_fetch(ptr, val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator%=(const_value_type& val) const { + return Kokkos::atomic_mod_fetch(ptr, val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator%=(volatile const_value_type& val) const { + return Kokkos::atomic_mod_fetch(ptr, val); + } + 
+ KOKKOS_INLINE_FUNCTION + const_value_type operator&=(const_value_type& val) const { + return Kokkos::atomic_and_fetch(ptr, val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator&=(volatile const_value_type& val) const { + return Kokkos::atomic_and_fetch(ptr, val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator^=(const_value_type& val) const { + return Kokkos::atomic_xor_fetch(ptr, val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator^=(volatile const_value_type& val) const { + return Kokkos::atomic_xor_fetch(ptr, val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator|=(const_value_type& val) const { + return Kokkos::atomic_or_fetch(ptr, val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator|=(volatile const_value_type& val) const { + return Kokkos::atomic_or_fetch(ptr, val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator<<=(const_value_type& val) const { + return Kokkos::atomic_lshift_fetch(ptr, val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator<<=(volatile const_value_type& val) const { + return Kokkos::atomic_lshift_fetch(ptr, val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator>>=(const_value_type& val) const { + return Kokkos::atomic_rshift_fetch(ptr, val); + } + KOKKOS_INLINE_FUNCTION + const_value_type operator>>=(volatile const_value_type& val) const { + return Kokkos::atomic_rshift_fetch(ptr, val); + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator+(const_value_type& val) const { return *ptr + val; } + KOKKOS_INLINE_FUNCTION + const_value_type operator+(volatile const_value_type& val) const { + return *ptr + val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator-(const_value_type& val) const { return *ptr - val; } + KOKKOS_INLINE_FUNCTION + const_value_type operator-(volatile const_value_type& val) const { + return *ptr - val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator*(const_value_type& val) const { return *ptr * val; } + 
KOKKOS_INLINE_FUNCTION + const_value_type operator*(volatile const_value_type& val) const { + return *ptr * val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator/(const_value_type& val) const { return *ptr / val; } + KOKKOS_INLINE_FUNCTION + const_value_type operator/(volatile const_value_type& val) const { + return *ptr / val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator%(const_value_type& val) const { return *ptr ^ val; } + KOKKOS_INLINE_FUNCTION + const_value_type operator%(volatile const_value_type& val) const { + return *ptr ^ val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator!() const { return !*ptr; } + + KOKKOS_INLINE_FUNCTION + const_value_type operator&&(const_value_type& val) const { + return *ptr && val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator&&(volatile const_value_type& val) const { + return *ptr && val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator||(const_value_type& val) const { + return *ptr | val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator||(volatile const_value_type& val) const { + return *ptr | val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator&(const_value_type& val) const { return *ptr & val; } + KOKKOS_INLINE_FUNCTION + const_value_type operator&(volatile const_value_type& val) const { + return *ptr & val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator|(const_value_type& val) const { return *ptr | val; } + KOKKOS_INLINE_FUNCTION + const_value_type operator|(volatile const_value_type& val) const { + return *ptr | val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator^(const_value_type& val) const { return *ptr ^ val; } + KOKKOS_INLINE_FUNCTION + const_value_type operator^(volatile const_value_type& val) const { + return *ptr ^ val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator~() const { return ~*ptr; } + + KOKKOS_INLINE_FUNCTION + const_value_type operator<<(const unsigned int& val) const { + return *ptr << val; + } 
+ KOKKOS_INLINE_FUNCTION + const_value_type operator<<(volatile const unsigned int& val) const { + return *ptr << val; + } + + KOKKOS_INLINE_FUNCTION + const_value_type operator>>(const unsigned int& val) const { + return *ptr >> val; + } + KOKKOS_INLINE_FUNCTION + const_value_type operator>>(volatile const unsigned int& val) const { + return *ptr >> val; + } + + KOKKOS_INLINE_FUNCTION + bool operator==(const AtomicDataElement& val) const { return *ptr == val; } + KOKKOS_INLINE_FUNCTION + bool operator==(volatile const AtomicDataElement& val) const { + return *ptr == val; + } + + KOKKOS_INLINE_FUNCTION + bool operator!=(const AtomicDataElement& val) const { return *ptr != val; } + KOKKOS_INLINE_FUNCTION + bool operator!=(volatile const AtomicDataElement& val) const { + return *ptr != val; + } + + KOKKOS_INLINE_FUNCTION + bool operator>=(const_value_type& val) const { return *ptr >= val; } + KOKKOS_INLINE_FUNCTION + bool operator>=(volatile const_value_type& val) const { return *ptr >= val; } + + KOKKOS_INLINE_FUNCTION + bool operator<=(const_value_type& val) const { return *ptr <= val; } + KOKKOS_INLINE_FUNCTION + bool operator<=(volatile const_value_type& val) const { return *ptr <= val; } + + KOKKOS_INLINE_FUNCTION + bool operator<(const_value_type& val) const { return *ptr < val; } + KOKKOS_INLINE_FUNCTION + bool operator<(volatile const_value_type& val) const { return *ptr < val; } + + KOKKOS_INLINE_FUNCTION + bool operator>(const_value_type& val) const { return *ptr > val; } + KOKKOS_INLINE_FUNCTION + bool operator>(volatile const_value_type& val) const { return *ptr > val; } + + KOKKOS_INLINE_FUNCTION + operator const_value_type() const { + // return Kokkos::atomic_load(ptr); + return *ptr; + } + + KOKKOS_INLINE_FUNCTION + operator volatile non_const_value_type() volatile const { + // return Kokkos::atomic_load(ptr); + return *ptr; + } +}; + +template <class ViewTraits> +class AtomicViewDataHandle { + public: + typename ViewTraits::value_type* ptr; + + 
KOKKOS_INLINE_FUNCTION + AtomicViewDataHandle() : ptr(nullptr) {} + + KOKKOS_INLINE_FUNCTION + AtomicViewDataHandle(typename ViewTraits::value_type* ptr_) : ptr(ptr_) {} + + template <class iType> + KOKKOS_INLINE_FUNCTION AtomicDataElement<ViewTraits> operator[]( + const iType& i) const { + return AtomicDataElement<ViewTraits>(ptr + i, AtomicViewConstTag()); + } + + KOKKOS_INLINE_FUNCTION + operator typename ViewTraits::value_type*() const { return ptr; } +}; + +template <unsigned Size> +struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars; + +template <> +struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<4> { + using type = int; +}; + +template <> +struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> { + using type = int64_t; +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3f2e8914ea9347f74cad8d84dde87bc8f5764f19 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp @@ -0,0 +1,165 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOS_ATOMIC_WINDOWS_HPP +#define KOKKOS_ATOMIC_WINDOWS_HPP + +#ifdef _WIN32 + +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include <winsock2.h> +#include <windows.h> + +#undef VOID + +namespace Kokkos { +namespace Impl { +#ifdef _MSC_VER +_declspec(align(16)) +#endif + struct cas128_t { + LONGLONG lower; + LONGLONG upper; + KOKKOS_INLINE_FUNCTION + bool operator!=(const cas128_t& a) const { + return (lower != a.lower) || upper != a.upper; + } +} +#if defined(__GNUC__) || defined(__clang__) +__attribute__((aligned(16))) +#endif +; +} // namespace Impl + +#if !defined(__CUDA_ARCH__) || defined(__clang__) +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) == sizeof(CHAR), const T&>::type val) { + union U { + CHAR i; + T t; + KOKKOS_INLINE_FUNCTION U(){}; + } tmp; + + tmp.i = _InterlockedCompareExchange8((CHAR*)dest, *((CHAR*)&val), + *((CHAR*)&compare)); + return tmp.t; +} + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) == sizeof(SHORT), const T&>::type val) { + union U { + SHORT i; + T t; + KOKKOS_INLINE_FUNCTION U(){}; + } tmp; + + tmp.i = _InterlockedCompareExchange16((SHORT*)dest, *((SHORT*)&val), + *((SHORT*)&compare)); + return tmp.t; +} + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) == sizeof(LONG), const T&>::type val) { + union U { + LONG i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } tmp; + + tmp.i = _InterlockedCompareExchange((LONG*)dest, *((LONG*)&val), + *((LONG*)&compare)); + return tmp.t; +} + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) == sizeof(LONGLONG), 
const T&>::type + val) { + union U { + LONGLONG i; + T t; + KOKKOS_INLINE_FUNCTION U() {} + } tmp; + + tmp.i = _InterlockedCompareExchange64((LONGLONG*)dest, *((LONGLONG*)&val), + *((LONGLONG*)&compare)); + return tmp.t; +} + +template <typename T> +inline T atomic_compare_exchange( + volatile T* const dest, const T& compare, + typename std::enable_if<sizeof(T) == sizeof(Impl::cas128_t), const T&>::type + val) { + T compare_and_result(compare); + union U { + Impl::cas128_t i; + T t; + KOKKOS_INLINE_FUNCTION U(){}; + } newval; + newval.t = val; + _InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, + newval.i.lower, + ((LONGLONG*)&compare_and_result)); + return compare_and_result; +} + +template <typename T> +inline T atomic_compare_exchange_strong(volatile T* const dest, + const T& compare, const T& val) { + return atomic_compare_exchange(dest, compare, val); +} +#endif + +} // namespace Kokkos +#endif +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp b/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp new file mode 100644 index 0000000000000000000000000000000000000000..eb0f07557fe5f22569797e17c50cb7a82bfac431 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp @@ -0,0 +1,158 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BITOPS_HPP +#define KOKKOS_BITOPS_HPP + +#include <Kokkos_Macros.hpp> +#include <cstdint> +#include <climits> + +#ifdef KOKKOS_COMPILER_INTEL +#include <immintrin.h> +#endif + +namespace Kokkos { + +KOKKOS_FORCEINLINE_FUNCTION +int log2(unsigned i) { + enum : int { shift = sizeof(unsigned) * CHAR_BIT - 1 }; +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return shift - __clz(i); +#elif defined(KOKKOS_COMPILER_INTEL) + return _bit_scan_reverse(i); +#elif defined(KOKKOS_COMPILER_CRAYC) + return i ? 
shift - _leadz32(i) : 0; +#elif defined(__GNUC__) || defined(__GNUG__) + return shift - __builtin_clz(i); +#else + int offset = 0; + if (i) { + for (offset = shift; (i & (1 << offset)) == 0; --offset) + ; + } + return offset; +#endif +} + +namespace Impl { + +/**\brief Find first zero bit. + * + * If none then return -1 ; + */ +KOKKOS_FORCEINLINE_FUNCTION +int bit_first_zero(unsigned i) noexcept { + enum : unsigned { full = ~0u }; + +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return full != i ? __ffs(~i) - 1 : -1; +#elif defined(KOKKOS_COMPILER_INTEL) + return full != i ? _bit_scan_forward(~i) : -1; +#elif defined(KOKKOS_COMPILER_CRAYC) + return full != i ? _popcnt(i ^ (i + 1)) - 1 : -1; +#elif defined(KOKKOS_COMPILER_GNU) || defined(__GNUC__) || defined(__GNUG__) + return full != i ? __builtin_ffs(~i) - 1 : -1; +#else + int offset = -1; + if (full != i) { + for (offset = 0; i & (1 << offset); ++offset) + ; + } + return offset; +#endif +} + +KOKKOS_FORCEINLINE_FUNCTION +int bit_scan_forward(unsigned i) { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __ffs(i) - 1; +#elif defined(KOKKOS_COMPILER_INTEL) + return _bit_scan_forward(i); +#elif defined(KOKKOS_COMPILER_CRAYC) + return i ? _popcnt(~i & (i - 1)) : -1; +#elif defined(KOKKOS_COMPILER_GNU) || defined(__GNUC__) || defined(__GNUG__) + return __builtin_ffs(i) - 1; +#else + int offset = -1; + if (i) { + for (offset = 0; (i & (1 << offset)) == 0; ++offset) + ; + } + return offset; +#endif +} + +/// Count the number of bits set. 
+KOKKOS_FORCEINLINE_FUNCTION +int bit_count(unsigned i) { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popc(i); +#elif defined(__INTEL_COMPILER) + return _popcnt32(i); +#elif defined(KOKKOS_COMPILER_CRAYC) + return _popcnt(i); +#elif defined(__GNUC__) || defined(__GNUG__) + return __builtin_popcount(i); +#else + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive + i = i - ((i >> 1) & ~0u / 3u); // temp + i = (i & ~0u / 15u * 3u) + ((i >> 2) & ~0u / 15u * 3u); // temp + i = (i + (i >> 4)) & ~0u / 255u * 15u; // temp + + // count + return (int)((i * (~0u / 255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); +#endif +} + +KOKKOS_INLINE_FUNCTION +unsigned integral_power_of_two_that_contains(const unsigned N) { + const unsigned i = Kokkos::log2(N); + return ((1u << i) < N) ? i + 1 : i; +} + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_BITOPS_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3251cb0f5c8c5daa3e6693d8bda536a9a3de8d0f --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp @@ -0,0 +1,120 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#elif defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
#else
#include <unistd.h>
#endif
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cerrno>
#include <string>

namespace Kokkos {
namespace Impl {

namespace {
// Read an integer environment variable used for an MPI rank-per-node count.
// Returns `fallback` when `var` is unset; values <= 0 are clamped to 1.
// NOTE: like the original inline code, a set-but-non-numeric value makes
// std::stoi throw.
int read_positive_env_or(const char* var, int fallback) {
  if (const char* text = std::getenv(var)) {
    const int parsed = std::stoi(text);
    return (parsed <= 0) ? 1 : parsed;
  }
  return fallback;
}

// Read an integer environment variable, returning `fallback` when unset.
// No clamping is applied (a local rank of 0 is valid).
int read_env_int_or(const char* var, int fallback) {
  const char* text = std::getenv(var);
  return text ? std::stoi(text) : fallback;
}
}  // namespace

// Number of online logical processors on this node, or -1 when the
// information cannot be determined on this platform.
int processors_per_node() {
#ifdef _SC_NPROCESSORS_ONLN
  int const online     = sysconf(_SC_NPROCESSORS_ONLN);
  int const configured = sysconf(_SC_NPROCESSORS_CONF);
  return (online < 1 || configured < 1) ? -1 : online;
#elif defined(__APPLE__)
  int ncpu;
  int activecpu;
  size_t size = sizeof(int);
  sysctlbyname("hw.ncpu", &ncpu, &size, nullptr, 0);
  sysctlbyname("hw.activecpu", &activecpu, &size, nullptr, 0);
  return (ncpu < 1 || activecpu < 1) ? -1 : activecpu;
#else
  return -1;
#endif
}

// MPI ranks running on this node, detected from the launcher's environment
// (MVAPICH2, then Open MPI — a later variable overrides an earlier one).
// Defaults to 1 when no launcher variable is set.
// SLURM_TASKS_PER_NODE is intentionally not consulted (disabled upstream).
int mpi_ranks_per_node() {
  int ppn = 1;
  ppn     = read_positive_env_or("MV2_COMM_WORLD_LOCAL_SIZE", ppn);
  ppn     = read_positive_env_or("OMPI_COMM_WORLD_LOCAL_SIZE", ppn);
  return ppn;
}

// Local (on-node) rank of this MPI process, detected the same way as
// mpi_ranks_per_node(). Defaults to 0 when no launcher variable is set.
// SLURM_LOCALID is intentionally not consulted (disabled upstream).
int mpi_local_rank_on_node() {
  int local_rank = 0;
  local_rank     = read_env_int_or("MV2_COMM_WORLD_LOCAL_RANK", local_rank);
  local_rank     = read_env_int_or("OMPI_COMM_WORLD_LOCAL_RANK", local_rank);
  return local_rank;
}

}  // namespace Impl
}  // namespace Kokkos
0000000000000000000000000000000000000000..6ba14c85b1c8d6bbaf2e8afc97780df1cace4b32 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Defect fixed: this header had no include guard, unlike every other header
// in this change set (e.g. KOKKOS_BITOPS_HPP, KOKKOS_CLOCKTIC_HPP); multiple
// inclusion would previously redeclare these functions in every TU.
#ifndef KOKKOS_CPUDISCOVERY_HPP
#define KOKKOS_CPUDISCOVERY_HPP

namespace Kokkos {
namespace Impl {

/// Number of online logical processors on this node; -1 if undetermined.
int processors_per_node();
/// MPI ranks on this node, read from the MPI launcher's environment
/// variables (MVAPICH2 / Open MPI); defaults to 1 when none is set.
int mpi_ranks_per_node();
/// On-node (local) rank of this MPI process, read the same way;
/// defaults to 0 when no launcher variable is set.
int mpi_local_rank_on_node();

}  // namespace Impl
}  // namespace Kokkos

#endif  // KOKKOS_CPUDISCOVERY_HPP
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_LOCKFREEDEQUE_HPP +#define KOKKOS_IMPL_LOCKFREEDEQUE_HPP + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_TASKDAG + +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_PointerOwnership.hpp> +#include <impl/Kokkos_OptionalRef.hpp> +#include <impl/Kokkos_Error.hpp> // KOKKOS_EXPECTS +#include <impl/Kokkos_LinkedListNode.hpp> // KOKKOS_EXPECTS + +#include <Kokkos_Atomic.hpp> // atomic_compare_exchange, atomic_fence +#include "Kokkos_LIFO.hpp" + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class NodeType, size_t CircularBufferSize, class SizeType = size_t> +struct fixed_size_circular_buffer { + public: + using node_type = NodeType; + using size_type = SizeType; + + private: + node_type* m_buffer[CircularBufferSize] = {nullptr}; + + public: + fixed_size_circular_buffer() = default; + fixed_size_circular_buffer(fixed_size_circular_buffer const&) = delete; + 
fixed_size_circular_buffer(fixed_size_circular_buffer&&) = default; + fixed_size_circular_buffer& operator=(fixed_size_circular_buffer const&) = + delete; + fixed_size_circular_buffer& operator=(fixed_size_circular_buffer&&) = default; + ~fixed_size_circular_buffer() = default; + + KOKKOS_FORCEINLINE_FUNCTION + static constexpr size_type size() noexcept { + return size_type(CircularBufferSize); + } + + KOKKOS_FORCEINLINE_FUNCTION + node_type* operator[](size_type idx) const noexcept { + return m_buffer[idx % size()]; + } + + KOKKOS_FORCEINLINE_FUNCTION + node_type*& operator[](size_type idx) noexcept { + return m_buffer[idx % size()]; + } +}; + +template <class NodeType, class SizeType = size_t> +struct non_owning_variable_size_circular_buffer { + public: + using node_type = NodeType; + using size_type = SizeType; + + private: + ObservingRawPtr<node_type*> m_buffer = nullptr; + size_type m_size = 0; + + public: + KOKKOS_INLINE_FUNCTION + non_owning_variable_size_circular_buffer(ObservingRawPtr<node_type*> buffer, + size_type arg_size) noexcept + : m_buffer(buffer), m_size(arg_size) {} + + non_owning_variable_size_circular_buffer() = default; + non_owning_variable_size_circular_buffer( + non_owning_variable_size_circular_buffer const&) = delete; + non_owning_variable_size_circular_buffer( + non_owning_variable_size_circular_buffer&&) = default; + non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer const&) = delete; + non_owning_variable_size_circular_buffer& operator =( + non_owning_variable_size_circular_buffer&&) = default; + ~non_owning_variable_size_circular_buffer() = default; + + KOKKOS_FORCEINLINE_FUNCTION + constexpr size_type size() const noexcept { return m_size; } + + KOKKOS_FORCEINLINE_FUNCTION + node_type* operator[](size_type idx) const noexcept { + return m_buffer[idx % size()]; + } + + KOKKOS_FORCEINLINE_FUNCTION + node_type*& operator[](size_type idx) noexcept { + return m_buffer[idx % size()]; + } +}; + 
/** Based on "Correct and Efficient Work-Stealing for Weak Memory Models,"
 * PPoPP '13, https://www.di.ens.fr/~zappa/readings/ppopp13.pdf
 *
 * Work-stealing deque: the owning thread pushes and pops at the bottom
 * (m_bottom); thief threads steal from the top (m_top) via CAS. Many of the
 * atomic annotations from the paper are approximated here with fences and
 * plain accesses — see the inline memory-order TODOs.
 */
template <class T, class CircularBufferT, class SizeType = int32_t>
struct ChaseLevDeque {
 public:
  using size_type  = SizeType;
  using value_type = T;
  // Still using intrusive linked list for waiting queue
  using node_type = SimpleSinglyLinkedListNode<>;

 private:
  // TODO @tasking @new_feature DSH variable size circular buffer?

  CircularBufferT m_array;  // ring of node pointers; indices wrap internally
  size_type m_top    = 0;   // next index thieves will try to steal from
  size_type m_bottom = 0;   // next index the owner will push to

 public:
  // Default-construct only when the buffer type itself is
  // default-constructible (e.g. the fixed-size buffer).
  template <class _ignore = void,
            class = typename std::enable_if<
                std::is_default_constructible<CircularBufferT>::value>::type>
  ChaseLevDeque() : m_array() {}

  explicit ChaseLevDeque(CircularBufferT buffer) : m_array(std::move(buffer)) {}

  // NOTE(review): under concurrent push/steal this is only a snapshot; the
  // plain loads carry no ordering guarantees (see TODO below).
  KOKKOS_INLINE_FUNCTION
  bool empty() const {
    // TODO @tasking @memory_order DSH memory order
    return m_top > m_bottom - 1;
  }

  // Owner-only: take the most recently pushed item from the bottom.
  // Returns an empty OptionalRef when the deque is empty or when a thief
  // wins the race for the single remaining element.
  KOKKOS_INLINE_FUNCTION
  OptionalRef<T> pop() {
    auto b = m_bottom - 1;   // atomic load relaxed
    auto& a = m_array;       // atomic load relaxed
    m_bottom = b;            // atomic store relaxed
    Kokkos::memory_fence();  // memory order seq_cst
    auto t = m_top;          // atomic load relaxed
    OptionalRef<T> return_value;
    if (t <= b) {
      /* non-empty queue */
      return_value = *static_cast<T*>(a[b]);  // relaxed load
      if (t == b) {
        /* single last element in the queue: race the thieves for it by
         * CAS-advancing top; whoever wins owns the element. */
#ifdef _WIN32
        Kokkos::memory_fence();
        bool const success =
            Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1);
        Kokkos::memory_fence();
        if (!success) {
          return_value = nullptr;
        }
#else
        if (!Impl::atomic_compare_exchange_strong(
                &m_top, t, t + 1, memory_order_seq_cst, memory_order_relaxed)) {
          /* failed race, someone else stole it */
          return_value = nullptr;
        }
#endif
        // Restore bottom: the deque is now empty either way.
        m_bottom = b + 1;  // memory order relaxed
      }
    } else {
      /* empty queue: undo the speculative decrement of bottom */
      m_bottom = b + 1;  // memory order relaxed
    }
    return return_value;
  }

  // rvalue overload; the node is intrusively linked, so binding it to the
  // lvalue overload is safe.
  KOKKOS_INLINE_FUNCTION
  bool push(node_type&& node) {
    // Just forward to the lvalue version
    return push(node);
  }

  // Owner-only: append a node at the bottom. Returns false when the
  // circular buffer is full (growth is not implemented — see the
  // commented-out resize sketch).
  KOKKOS_INLINE_FUNCTION
  bool push(node_type& node) {
    auto b = m_bottom;  // memory order relaxed
    auto t = Impl::atomic_load(&m_top, memory_order_acquire);
    auto& a = m_array;
    if (b - t > a.size() - 1) {
      /* queue is full, resize */
      // m_array = a->grow();
      // a = m_array;
      return false;
    }
    a[b] = &node;  // relaxed
    // Release store publishes the slot write above to stealing threads.
    Impl::atomic_store(&m_bottom, b + 1, memory_order_release);
    return true;
  }

  // Thief-side: attempt to steal the oldest item from the top. Returns an
  // empty OptionalRef when the deque looks empty or another thread wins the
  // CAS on top.
  KOKKOS_INLINE_FUNCTION
  OptionalRef<T> steal() {
    auto t = m_top;  // TODO @tasking @memory_order DSH: atomic load acquire
    Kokkos::memory_fence();  // seq_cst fence, so why does the above need to be
                             // acquire?
    auto b = Impl::atomic_load(&m_bottom, memory_order_acquire);
    OptionalRef<T> return_value;
    if (t < b) {
      /* Non-empty queue */
      auto& a = m_array;  // TODO @tasking @memory_order DSH: technically
                          // consume ordered, but acquire should be fine
      Kokkos::load_fence();  // TODO @tasking @memory_order DSH memory order
                             // instead of fence
      return_value = *static_cast<T*>(a[t]);  // relaxed
#ifdef _WIN32
      Kokkos::memory_fence();
      bool const success =
          Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1);
      Kokkos::memory_fence();
      if (!success) {
        return_value = nullptr;
      }
#else
      if (!Impl::atomic_compare_exchange_strong(
              &m_top, t, t + 1, memory_order_seq_cst, memory_order_relaxed)) {
        return_value = nullptr;
      }
#endif
    }
    return return_value;
  }
};

/*
  // The atomicity of this load was more important in the paper's version
  // because that version had a circular buffer that could grow. We're
  // essentially using the memory order in this version as a fence, which
  // may be unnecessary
  auto buffer_ptr = (node_type***)&m_array.buffer;
  auto a = Impl::atomic_load(buffer_ptr, memory_order_acquire); //
  technically consume ordered, but acquire should be fine return_value =
  *static_cast<T*>(a[t % m_array->size]); // relaxed; we'd have to replace the
  m_array->size if we ever allow growth
*/

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

// Task-queue traits that select the Chase-Lev deque (with a fixed-size
// buffer) as the ready queue and a single-consume LIFO as the waiting queue.
template <size_t CircularBufferSize>
struct TaskQueueTraitsChaseLev {
  template <class Task>
  using ready_queue_type =
      ChaseLevDeque<Task,
                    fixed_size_circular_buffer<SimpleSinglyLinkedListNode<>,
                                               CircularBufferSize, int32_t>,
                    int32_t>;

  template <class Task>
  using waiting_queue_type = SingleConsumeOperationLIFO<Task>;

  template <class Task>
  using intrusive_task_base_type = typename ready_queue_type<Task>::node_type;

  // push() can return false (buffer full) — schedulers must handle that.
  static constexpr auto ready_queue_insertion_may_fail = true;
};

}  // end namespace Impl
}  // end namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* defined KOKKOS_ENABLE_TASKDAG */
#endif /* #ifndef KOKKOS_IMPL_LOCKFREEDEQUE_HPP */
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CLOCKTIC_HPP +#define KOKKOS_CLOCKTIC_HPP + +#include <Kokkos_Macros.hpp> +#include <stdint.h> +#include <chrono> +#ifdef KOKKOS_ENABLE_OPENMPTARGET +#include <omp.h> +#endif + +// To use OpenCL(TM) built-in intrinsics inside kernels, we have to +// forward-declare their prototype, also see +// https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \ + defined(__SYCL_DEVICE_ONLY__) +extern SYCL_EXTERNAL unsigned long __attribute__((overloadable)) +intel_get_cycle_counter(); +#endif + +namespace Kokkos { +namespace Impl { + +/**\brief Quick query of clock register tics + * + * Primary use case is to, with low overhead, + * obtain a integral value that consistently varies + * across concurrent threads of execution within + * a parallel algorithm. + * This value is often used to "randomly" seed an + * attempt to acquire an indexed resource (e.g., bit) + * from an array of resources (e.g., bitset) such that + * concurrent threads will have high likelihood of + * having different index-seed values. 
/**\brief Quick query of clock register tics
 *
 * Primary use case is to, with low overhead,
 * obtain a integral value that consistently varies
 * across concurrent threads of execution within
 * a parallel algorithm.
 * This value is often used to "randomly" seed an
 * attempt to acquire an indexed resource (e.g., bit)
 * from an array of resources (e.g., bitset) such that
 * concurrent threads will have high likelihood of
 * having different index-seed values.
 *
 * Note: the value is a raw cycle/tic count, not wall time in any fixed
 * unit; only use it as a varying seed, not for timing.
 */

KOKKOS_FORCEINLINE_FUNCTION
uint64_t clock_tic() noexcept {
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)

  // CUDA/HIP device code: return value of 64-bit hi-res clock register.

  return clock64();

#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \
    defined(__SYCL_DEVICE_ONLY__)
  // SYCL on Intel GPUs: OpenCL built-in declared at the top of this file.
  return intel_get_cycle_counter();
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
  // OpenMP target: derive tics from wall time (seconds -> nanoseconds).
  return uint64_t(omp_get_wtime() * 1.e9);
#elif defined(__i386__) || defined(__x86_64)

  // x86: read the 64-bit time-stamp counter (TSC) via rdtsc; the low and
  // high halves come back in EAX/EDX and are recombined below.

  unsigned a = 0, d = 0;

  __asm__ volatile("rdtsc" : "=a"(a), "=d"(d));

  return ((uint64_t)a) | (((uint64_t)d) << 32);

#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \
    defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__)

  // PowerPC: read the time-base register. NOTE(review): `cycles` is a
  // 32-bit unsigned, so only the low 32 bits of the time base are captured
  // here — the returned value wraps accordingly.
  unsigned int cycles = 0;

  asm volatile("mftb %0" : "=r"(cycles));

  return (uint64_t)cycles;

#else

  // Portable fallback: nanosecond-ish count from the standard clock.
  return (uint64_t)std::chrono::high_resolution_clock::now()
      .time_since_epoch()
      .count();

#endif
}

}  // namespace Impl
}  // namespace Kokkos

#endif  // KOKKOS_CLOCKTIC_HPP
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
//==============================================================================
// <editor-fold desc="CombinedReducer reducer and value storage helpers"> {{{1

// Holds one component value of a combined reduction. The index template
// parameter exists only so that repeated value types produce distinct base
// classes when CombinedReducerValueImpl inherits from all items.
// Note: the index is only to avoid repeating the same base class multiple
// times
template <size_t /*Idx*/, class ValueType>
struct CombinedReducerValueItemImpl {
 public:
  using value_type = ValueType;

 private:
  value_type m_value;

 public:
  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl() = default;
  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl(
      CombinedReducerValueItemImpl const&) = default;
  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl(
      CombinedReducerValueItemImpl&&) = default;
  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerValueItemImpl&
  operator=(CombinedReducerValueItemImpl const&) = default;
  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerValueItemImpl&
  operator=(CombinedReducerValueItemImpl&&) = default;
  KOKKOS_DEFAULTED_FUNCTION
  ~CombinedReducerValueItemImpl() = default;
  explicit KOKKOS_FUNCTION CombinedReducerValueItemImpl(value_type arg_value)
      : m_value(std::move(arg_value)) {}

  // Accessors for every cv/ref qualification the reduction machinery needs
  // (volatile overloads support the legacy volatile join path).
  KOKKOS_FORCEINLINE_FUNCTION
  KOKKOS_CONSTEXPR_14 value_type& ref() & noexcept { return m_value; }
  KOKKOS_FORCEINLINE_FUNCTION
  constexpr value_type const& ref() const& noexcept { return m_value; }
  KOKKOS_FORCEINLINE_FUNCTION
  value_type volatile& ref() volatile& noexcept { return m_value; }
  KOKKOS_FORCEINLINE_FUNCTION
  value_type const volatile& ref() const volatile& noexcept { return m_value; }
};

//==============================================================================

template <class IdxSeq, class... ValueTypes>
struct CombinedReducerValueImpl;

// Aggregate value for a combined reduction: inherits one
// CombinedReducerValueItemImpl per component so all values live contiguously
// in a single object, addressed by (index, type) via get<>().
template <size_t... Idxs, class... ValueTypes>
struct CombinedReducerValueImpl<std::integer_sequence<size_t, Idxs...>,
                                ValueTypes...>
    : CombinedReducerValueItemImpl<Idxs, ValueTypes>... {
 public:
  KOKKOS_DEFAULTED_FUNCTION
  constexpr CombinedReducerValueImpl() = default;
  KOKKOS_DEFAULTED_FUNCTION
  constexpr CombinedReducerValueImpl(CombinedReducerValueImpl const&) = default;
  KOKKOS_DEFAULTED_FUNCTION
  constexpr CombinedReducerValueImpl(CombinedReducerValueImpl&&) = default;
  KOKKOS_DEFAULTED_FUNCTION
  KOKKOS_CONSTEXPR_14 CombinedReducerValueImpl& operator=(
      CombinedReducerValueImpl const&) = default;
  KOKKOS_DEFAULTED_FUNCTION
  KOKKOS_CONSTEXPR_14 CombinedReducerValueImpl& operator=(
      CombinedReducerValueImpl&&) = default;
  KOKKOS_DEFAULTED_FUNCTION
  ~CombinedReducerValueImpl() = default;

  // Construct each component item from its corresponding argument.
  KOKKOS_FUNCTION
  explicit CombinedReducerValueImpl(ValueTypes... arg_values)
      : CombinedReducerValueItemImpl<Idxs, ValueTypes>(
            std::move(arg_values))... {}

  // get<Idx, ValueType>() selects the matching item base class; the
  // qualifier overloads mirror CombinedReducerValueItemImpl::ref().
  template <size_t Idx, class ValueType>
  KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept {
    return this->CombinedReducerValueItemImpl<Idx, ValueType>::ref();
  }
  template <size_t Idx, class ValueType>
  KOKKOS_INLINE_FUNCTION ValueType const& get() const& noexcept {
    return this->CombinedReducerValueItemImpl<Idx, ValueType>::ref();
  }
  template <size_t Idx, class ValueType>
  KOKKOS_INLINE_FUNCTION ValueType volatile& get() volatile& noexcept {
    return this->CombinedReducerValueItemImpl<Idx, ValueType>::ref();
  }
  template <size_t Idx, class ValueType>
  KOKKOS_INLINE_FUNCTION ValueType const volatile& get() const
      volatile& noexcept {
    return this->CombinedReducerValueItemImpl<Idx, ValueType>::ref();
  }
};

//==============================================================================

// Stores one component reducer and exposes its init/join with underscore
// names so this class does not itself look like a Reducer. Each method
// returns the fold-emulation token so calls can be expanded in a pack.
// TODO Empty base optmization?
template <size_t /*Idx*/, class Reducer>
// requires Kokkos::is_reducer<Reducer>
struct CombinedReducerStorageImpl {
 public:
  using value_type = typename Reducer::value_type;

 private:
  Reducer m_reducer;

 public:
  KOKKOS_INLINE_FUNCTION
  explicit constexpr CombinedReducerStorageImpl(Reducer arg_reducer)
      : m_reducer(std::move(arg_reducer)) {}

  // Leading underscores to make it clear that this class is not intended to
  // model Reducer

  KOKKOS_INLINE_FUNCTION
  KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
  _init(value_type& val) const {
    m_reducer.init(val);
    return _fold_comma_emulation_return{};
  }

  KOKKOS_INLINE_FUNCTION KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
  _join(value_type& dest, value_type const& src) const {
    m_reducer.join(dest, src);
    return _fold_comma_emulation_return{};
  }

  KOKKOS_INLINE_FUNCTION KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
  _join(value_type volatile& dest, value_type const volatile& src) const {
    m_reducer.join(dest, src);
    return _fold_comma_emulation_return{};
  }
};

// </editor-fold> end CombinedReducerStorage }}}1
//==============================================================================

//==============================================================================
// <editor-fold desc="CombinedReducer"> {{{1

// Tag type used to disambiguate construction from a raw argument pack.
struct _construct_combined_reducer_from_args_tag {};

// Normalize one parallel_reduce output argument to a plain value:
// plain scalars pass through unchanged (this overload), ...
template <class T>
KOKKOS_INLINE_FUNCTION auto _get_value_from_combined_reducer_ctor_arg(
    T&& arg) noexcept ->
    typename std::enable_if<
        !is_view<typename std::decay<T>::type>::value &&
            !is_reducer<typename std::decay<T>::type>::value,
        typename std::decay<T>::type>::type {
  return arg;
}

// ... rank-0 Views are dereferenced to their element, ...
template <class T>
KOKKOS_INLINE_FUNCTION auto _get_value_from_combined_reducer_ctor_arg(
    T&& arg) noexcept ->
    typename std::enable_if<is_view<typename std::decay<T>::type>::value,
                            typename std::decay<T>::type>::type::value_type {
  return arg();
}

// ... and Reducers yield their referenced result value.
template <class T>
KOKKOS_INLINE_FUNCTION auto _get_value_from_combined_reducer_ctor_arg(
    T&& arg) noexcept ->
    typename std::enable_if<is_reducer<typename std::decay<T>::type>::value,
                            typename std::decay<T>::type>::type::value_type {
  return arg.reference();
}

template <class IdxSeq, class Space, class...>
struct CombinedReducerImpl;

// A Reducer that combines several component reducers into one: init/join
// fan out to every component via the comma-fold emulation, and the combined
// value is a CombinedReducerValueImpl holding all component values.
template <size_t... Idxs, class Space, class... Reducers>
struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
                           Reducers...>
    : private CombinedReducerStorageImpl<Idxs, Reducers>... {
 public:
  using reducer = CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>,
                                      Space, Reducers...>;
  using value_type =
      CombinedReducerValueImpl<std::integer_sequence<size_t, Idxs...>,
                               typename Reducers::value_type...>;
  using result_view_type =
      Kokkos::View<value_type, Space, Kokkos::MemoryUnmanaged>;

 private:
  // Unmanaged view of the caller's combined result value.
  result_view_type m_value_view;

 public:
  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl() = default;
  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl(
      CombinedReducerImpl const&) = default;
  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl(
      CombinedReducerImpl&&) = default;
  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerImpl& operator=(
      CombinedReducerImpl const&) = default;
  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerImpl& operator=(
      CombinedReducerImpl&&) = default;

  KOKKOS_DEFAULTED_FUNCTION ~CombinedReducerImpl() = default;

  // Forward one deduced reducer to each storage base; the combined result
  // is written through `value`.
  template <class... ReducersDeduced>
  KOKKOS_FUNCTION constexpr explicit CombinedReducerImpl(
      value_type& value, ReducersDeduced&&... reducers) noexcept
      : CombinedReducerStorageImpl<Idxs, Reducers>((ReducersDeduced &&)
                                                       reducers)...,
        m_value_view(&value) {}

  // Join each component of src into dest using that component's reducer.
  KOKKOS_FUNCTION KOKKOS_CONSTEXPR_14 void join(value_type& dest,
                                                value_type const& src) const
      noexcept {
    emulate_fold_comma_operator(
        this->CombinedReducerStorageImpl<Idxs, Reducers>::_join(
            dest.template get<Idxs, typename Reducers::value_type>(),
            src.template get<Idxs, typename Reducers::value_type>())...);
  }

  // Volatile overload for backends that still join through volatile refs.
  KOKKOS_FUNCTION void join(value_type volatile& dest,
                            value_type const volatile& src) const noexcept {
    emulate_fold_comma_operator(
        this->CombinedReducerStorageImpl<Idxs, Reducers>::_join(
            dest.template get<Idxs, typename Reducers::value_type>(),
            src.template get<Idxs, typename Reducers::value_type>())...);
  }

  // Initialize each component of dest with its reducer's identity.
  KOKKOS_FUNCTION KOKKOS_CONSTEXPR_14 void init(value_type& dest) const
      noexcept {
    emulate_fold_comma_operator(
        this->CombinedReducerStorageImpl<Idxs, Reducers>::_init(
            dest.template get<Idxs, typename Reducers::value_type>())...);
  }

  // TODO figure out if we also need to call through to final

  KOKKOS_FUNCTION
  constexpr bool references_scalar() const noexcept {
    // For now, always pretend that we reference a scalar since we need to
    // block to do the write-back because the references may not be contiguous
    // in memory and the backends currently assume this and just do a single
    // deep copy back to a chunk of memory associated with the output argument
    return true;
  }

  KOKKOS_FUNCTION
  constexpr result_view_type const& view() const noexcept {
    return m_value_view;
  }

  // Scatter the combined value back into the individual reducers' result
  // views (each component writes through its own reducer's view()()).
  KOKKOS_FUNCTION
  KOKKOS_CONSTEXPR_14 static void write_value_back_to_original_references(
      value_type const& value,
      Reducers const&... reducers_that_reference_original_values) noexcept {
    emulate_fold_comma_operator(
        (reducers_that_reference_original_values.view()() =
             value.template get<Idxs, typename Reducers::value_type>())...);
  }
};

// Apparently this can't be an alias template because of a bug/unimplemented
// feature in GCC's name mangler. But in this case, this amounts to the same
// thing.
template <class Space, class... Reducers>
struct CombinedReducer
    : CombinedReducerImpl<std::make_index_sequence<sizeof...(Reducers)>, Space,
                          Reducers...> {
  using base_t =
      CombinedReducerImpl<std::make_index_sequence<sizeof...(Reducers)>, Space,
                          Reducers...>;
  using base_t::base_t;
  using reducer = CombinedReducer<Space, Reducers...>;
};

// </editor-fold> end CombinedReducer }}}1
//==============================================================================

//==============================================================================
// <editor-fold desc="CombinedReductionFunctorWrapper"> {{{1

// Forward declaration; the partial specialization is defined below.
template <class IdxSeq, class Functor, class Space, class... Reducers>
struct CombinedReductionFunctorWrapperImpl;
Reducers> +struct CombinedReductionFunctorWrapperImpl< + std::integer_sequence<size_t, Idxs...>, Functor, Space, Reducers...> { + private: + Functor m_functor; + + public: + //------------------------------------------------------------------------------ + // <editor-fold desc="type aliases"> {{{2 + + using reducer_type = CombinedReducer<Space, Reducers...>; + + // Prevent Kokkos from attempting to deduce value_type + using value_type = typename reducer_type::value_type; + + // </editor-fold> end type aliases }}}2 + //------------------------------------------------------------------------------ + + //---------------------------------------------------------------------------- + // <editor-fold desc="Ctors, destructor, and assignment"> {{{2 + + KOKKOS_DEFAULTED_FUNCTION + constexpr CombinedReductionFunctorWrapperImpl() noexcept = default; + KOKKOS_DEFAULTED_FUNCTION + constexpr CombinedReductionFunctorWrapperImpl( + CombinedReductionFunctorWrapperImpl const&) = default; + KOKKOS_DEFAULTED_FUNCTION + constexpr CombinedReductionFunctorWrapperImpl( + CombinedReductionFunctorWrapperImpl&&) = default; + KOKKOS_DEFAULTED_FUNCTION + KOKKOS_CONSTEXPR_14 CombinedReductionFunctorWrapperImpl& operator=( + CombinedReductionFunctorWrapperImpl const&) = default; + KOKKOS_DEFAULTED_FUNCTION + KOKKOS_CONSTEXPR_14 CombinedReductionFunctorWrapperImpl& operator=( + CombinedReductionFunctorWrapperImpl&&) = default; + KOKKOS_DEFAULTED_FUNCTION + ~CombinedReductionFunctorWrapperImpl() = default; + + KOKKOS_INLINE_FUNCTION + constexpr explicit CombinedReductionFunctorWrapperImpl(Functor arg_functor) + : m_functor(std::move(arg_functor)) {} + + // </editor-fold> end Ctors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="call operator"> {{{2 + + // Variadic version for MDRangePolicy + // There are a number of ways to do 
this, but most of them that involve + // not assuming an implementation of tuple is available are gross. + // Unfortunately, that's what we have to do here + template <class IndexOrMemberOrTagType1, + class... IndexOrMemberTypesThenValueType> + KOKKOS_FUNCTION void operator()( + IndexOrMemberOrTagType1&& arg_first, + IndexOrMemberTypesThenValueType&&... args) const { + this->template _call_op_impl<IndexOrMemberOrTagType1&&>( + (IndexOrMemberOrTagType1 &&) arg_first, + (IndexOrMemberTypesThenValueType &&) args...); + } + + // </editor-fold> end call operator }}}2 + //---------------------------------------------------------------------------- + + // These are things that need to be done if we decide to ever support + // functor-customized join/init/final hooks with combined reducers. For now, + // they are explicitly not supported. + // TODO: forward join() function to user functor hook, or just ignore it? + // TODO: forward init() function to user functor hook, or just ignore it? + // TODO: forward final() function to user functor hook, or just ignore it? + + private: + // variadic forwarding for MDRangePolicy + // see comment above for why this has to be so gross + // recursive case + template <class... IdxOrMemberTypes, class IdxOrMemberType1, + class... IdxOrMemberTypesThenValueType> + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + !std::is_same<remove_cvref_t<IdxOrMemberType1>, value_type>::value> + _call_op_impl(IdxOrMemberTypes&&... idxs, IdxOrMemberType1&& idx, + IdxOrMemberTypesThenValueType&&... args) const { + this->template _call_op_impl<IdxOrMemberTypes&&..., IdxOrMemberType1&&>( + (IdxOrMemberTypes &&) idxs..., (IdxOrMemberType1 &&) idx, + (IdxOrMemberTypesThenValueType &&) args...); + } + + // base case + template <class... IdxOrMemberTypes> + KOKKOS_FORCEINLINE_FUNCTION void _call_op_impl(IdxOrMemberTypes&&... 
idxs, + value_type& out) const { + m_functor((IdxOrMemberTypes &&) idxs..., + out.template get<Idxs, typename Reducers::value_type>()...); + } +}; + +template <class Functor, class Space, class... Reducers> +struct CombinedReductionFunctorWrapper + : CombinedReductionFunctorWrapperImpl< + std::make_index_sequence<sizeof...(Reducers)>, Functor, Space, + Reducers...> { + using base_t = CombinedReductionFunctorWrapperImpl< + std::make_index_sequence<sizeof...(Reducers)>, Functor, Space, + Reducers...>; + using base_t::base_t; +}; + +// </editor-fold> end CombinedReductionFunctorWrapper }}}1 +//============================================================================== + +//------------------------------------------------------------------------------ +// <editor-fold desc="_make_reducer_from_arg"> {{{2 + +template <class Space, class Reducer> +KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if< + Kokkos::is_reducer<typename std::decay<Reducer>::type>::value, + typename std::decay<Reducer>::type>::type +_make_reducer_from_arg(Reducer&& arg_reducer) noexcept { + return arg_reducer; +} + +// Two purposes: SFINAE-safety for the `View` case and laziness for the return +// value otherwise to prevent extra instantiations of the Kokkos::Sum template +template <class Space, class T, class Enable = void> +struct _wrap_with_kokkos_sum { + using type = Kokkos::Sum<T, Space>; +}; + +template <class Space, class T> +struct _wrap_with_kokkos_sum< + Space, T, typename std::enable_if<Kokkos::is_view<T>::value>::type> { + using type = Kokkos::Sum<typename T::value_type, Space>; +}; + +// TODO better error message for the case when a const& to a scalar is passed in +// (this is needed in general, though) +template <class Space, class T> +KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if< + !Kokkos::is_reducer<typename std::decay<T>::type>::value, + _wrap_with_kokkos_sum<Space, typename std::decay<T>::type>>::type::type +_make_reducer_from_arg(T&& arg_scalar) noexcept { + 
return + typename _wrap_with_kokkos_sum<Space, typename std::decay<T>::type>::type{ + arg_scalar}; +} + +// This can't be an alias template because GCC doesn't know how to mangle +// decltype expressions in return statements (and, even though every compiler +// is supposed to, GCC is the only one that does dependent alias template +// substitution correctly and tries to do the mangling, aparently). +template <class Space, class ReferenceOrViewOrReducer, class = void> +struct _reducer_from_arg { + using type = decltype(Impl::_make_reducer_from_arg<Space>( + std::declval<ReferenceOrViewOrReducer&&>())); +}; +template <class Space, class ReferenceOrViewOrReducer> +using _reducer_from_arg_t = + typename _reducer_from_arg<Space, ReferenceOrViewOrReducer>::type; + +// </editor-fold> end _make_reducer_from_arg }}}2 +//------------------------------------------------------------------------------ + +template <class Space, class... ReferencesOrViewsOrReducers> +KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer_value( + ReferencesOrViewsOrReducers&&... args) { + //---------------------------------------- + // This is a bit round-about and we should make sure it doesn't have + // any performance implications. Basically, we make a reducer out of anything + // just to get the value back out here (for the sake of uniformity). Most + // compilers should figure out what's going on, but we should double-check + // that. + return CombinedReducerValueImpl< + std::make_index_sequence<sizeof...(ReferencesOrViewsOrReducers)>, + typename _reducer_from_arg_t<Space, + ReferencesOrViewsOrReducers>::value_type...>{ + // This helper function is now poorly named after refactoring. + _get_value_from_combined_reducer_ctor_arg((ReferencesOrViewsOrReducers &&) + args)...}; + //---------------------------------------- +} + +template <class Space, class ValueType, class... 
ReferencesOrViewsOrReducers> +KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer( + ValueType& value, ReferencesOrViewsOrReducers&&... args) { + //---------------------------------------- + // This is doing more or less the same thing of making every argument into + // a reducer, just in a different place than in `make_combined_reducer_value`, + // so we should probably eventually make this read a little more similarly + using reducer_type = CombinedReducer< + Space, _reducer_from_arg_t<Space, ReferencesOrViewsOrReducers>...>; + return reducer_type(value, + _reducer_from_arg_t<Space, ReferencesOrViewsOrReducers>{ + (ReferencesOrViewsOrReducers &&) args}...); + //---------------------------------------- +} + +template <class Functor, class Space, class... ReferencesOrViewsOrReducers> +KOKKOS_INLINE_FUNCTION constexpr auto make_wrapped_combined_functor( + Functor const& functor, Space, ReferencesOrViewsOrReducers&&...) { + //---------------------------------------- + return CombinedReductionFunctorWrapper< + Functor, Space, + _reducer_from_arg_t<Space, ReferencesOrViewsOrReducers>...>(functor); + //---------------------------------------- +} + +} // end namespace Impl + +//============================================================================== +// <editor-fold desc="Overloads of parallel_reduce for multiple outputs"> {{{1 + +// These need to be forwarding references so that we can deduce const-ness, +// but none of them should be forwarded (and, indeed, none of them should be +// rvalue references) +template <class PolicyType, class Functor, class ReturnType1, class ReturnType2, + class... ReturnTypes> +auto parallel_reduce(std::string const& label, PolicyType const& policy, + Functor const& functor, ReturnType1&& returnType1, + ReturnType2&& returnType2, + ReturnTypes&&... 
returnTypes) noexcept -> + typename std::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value>::type { + //---------------------------------------- + // Since we don't support asynchronous combined reducers yet for various + // reasons, we actually just want to work with the pointers and references + // directly + using space_type = Kokkos::DefaultHostExecutionSpace::memory_space; + + auto value = Impl::make_combined_reducer_value<space_type>( + returnType1, returnType2, returnTypes...); + + using combined_reducer_type = Impl::CombinedReducer< + space_type, Impl::_reducer_from_arg_t<space_type, ReturnType1>, + Impl::_reducer_from_arg_t<space_type, ReturnType2>, + Impl::_reducer_from_arg_t<space_type, ReturnTypes>...>; + auto combined_reducer = Impl::make_combined_reducer<space_type>( + value, returnType1, returnType2, returnTypes...); + + auto combined_functor = Impl::make_wrapped_combined_functor( + functor, space_type{}, returnType1, returnType2, returnTypes...); + + using combined_functor_type = decltype(combined_functor); + static_assert( + Impl::FunctorDeclaresValueType<combined_functor_type, void>::value, + "value_type not properly detected"); + using reduce_adaptor_t = + Impl::ParallelReduceAdaptor<PolicyType, combined_functor_type, + combined_reducer_type>; + + reduce_adaptor_t::execute(label, policy, combined_functor, combined_reducer); + Impl::ParallelReduceFence<typename PolicyType::execution_space, + combined_reducer_type>::fence(policy.space(), + combined_reducer); + combined_reducer.write_value_back_to_original_references( + value, Impl::_make_reducer_from_arg<space_type>(returnType1), + Impl::_make_reducer_from_arg<space_type>(returnType2), + Impl::_make_reducer_from_arg<space_type>(returnTypes)...); + //---------------------------------------- +} + +template <class PolicyType, class Functor, class ReturnType1, class ReturnType2, + class... 
ReturnTypes> +auto parallel_reduce(PolicyType const& policy, Functor const& functor, + ReturnType1&& returnType1, ReturnType2&& returnType2, + ReturnTypes&&... returnTypes) noexcept -> + typename std::enable_if< + Kokkos::Impl::is_execution_policy<PolicyType>::value>::type { + //---------------------------------------- + Kokkos::parallel_reduce("", policy, functor, + std::forward<ReturnType1>(returnType1), + std::forward<ReturnType2>(returnType2), + std::forward<ReturnTypes>(returnTypes)...); + //---------------------------------------- +} + +template <class Functor, class ReturnType1, class ReturnType2, + class... ReturnTypes> +void parallel_reduce(std::string const& label, size_t n, Functor const& functor, + ReturnType1&& returnType1, ReturnType2&& returnType2, + ReturnTypes&&... returnTypes) noexcept { + Kokkos::parallel_reduce(label, + RangePolicy<Kokkos::DefaultExecutionSpace>(0, n), + functor, std::forward<ReturnType1>(returnType1), + std::forward<ReturnType2>(returnType2), + std::forward<ReturnTypes>(returnTypes)...); +} + +template <class Functor, class ReturnType1, class ReturnType2, + class... ReturnTypes> +void parallel_reduce(size_t n, Functor const& functor, + ReturnType1&& returnType1, ReturnType2&& returnType2, + ReturnTypes&&... returnTypes) noexcept { + Kokkos::parallel_reduce("", n, functor, + std::forward<ReturnType1>(returnType1), + std::forward<ReturnType2>(returnType2), + std::forward<ReturnTypes>(returnTypes)...); +} + +//------------------------------------------------------------------------------ +// <editor-fold desc="Team overloads"> {{{2 + +// Copied three times because that's the best way we have right now to match +// Impl::TeamThreadRangeBoundariesStruct, +// Impl::ThreadVectorRangeBoundariesStruct, and +// Impl::TeamVectorRangeBoundariesStruct. +// TODO make these work after restructuring + +// template <class iType, class MemberType, class Functor, class ReturnType1, +// class ReturnType2, class... 
ReturnTypes> +// KOKKOS_INLINE_FUNCTION void parallel_reduce( +// std::string const& label, +// Impl::TeamThreadRangeBoundariesStruct<iType, MemberType> const& +// boundaries, Functor const& functor, ReturnType1&& returnType1, +// ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept { +// const auto combined_reducer = +// Impl::make_combined_reducer<Kokkos::AnonymousSpace>( +// returnType1, returnType2, returnTypes...); +// +// auto combined_functor = Impl::make_wrapped_combined_functor( +// functor, Kokkos::AnonymousSpace{}, returnType1, returnType2, +// returnTypes...); +// +// parallel_reduce(label, boundaries, combined_functor, combined_reducer); +//} +// +// template <class iType, class MemberType, class Functor, class ReturnType1, +// class ReturnType2, class... ReturnTypes> +// KOKKOS_INLINE_FUNCTION void parallel_reduce( +// std::string const& label, +// Impl::ThreadVectorRangeBoundariesStruct<iType, MemberType> const& +// boundaries, +// Functor const& functor, ReturnType1&& returnType1, +// ReturnType2&& returnType2, ReturnTypes&&... returnTypes) noexcept { +// const auto combined_reducer = +// Impl::make_combined_reducer<Kokkos::AnonymousSpace>( +// returnType1, returnType2, returnTypes...); +// +// auto combined_functor = Impl::make_wrapped_combined_functor( +// functor, Kokkos::AnonymousSpace{}, returnType1, returnType2, +// returnTypes...); +// +// parallel_reduce(label, boundaries, combined_functor, combined_reducer); +//} + +// template <class iType, class MemberType, class Functor, class ReturnType1, +// class ReturnType2, class... ReturnTypes> +// KOKKOS_INLINE_FUNCTION void parallel_reduce( +// std::string const& label, +// Impl::TeamVectorRangeBoundariesStruct<iType, MemberType> const& +// boundaries, Functor const& functor, ReturnType1&& returnType1, +// ReturnType2&& returnType2, ReturnTypes&&... 
returnTypes) noexcept { +// const auto combined_reducer = +// Impl::make_combined_reducer<Kokkos::AnonymousSpace>( +// returnType1, returnType2, returnTypes...); +// +// auto combined_functor = Impl::make_wrapped_combined_functor( +// functor, Kokkos::AnonymousSpace{}, returnType1, returnType2, +// returnTypes...); +// +// parallel_reduce(label, boundaries, combined_functor, combined_reducer); +//} + +// </editor-fold> end Team overloads }}}2 +//------------------------------------------------------------------------------ + +// </editor-fold> end Overloads of parallel_reduce for multiple outputs }}}1 +//============================================================================== + +} // namespace Kokkos + +#endif // KOKKOS_COMBINED_REDUCER_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c02f4acddacb41f5fb01c50536f6a426738fac99 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp @@ -0,0 +1,348 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CONCURRENTBITSET_HPP +#define KOKKOS_CONCURRENTBITSET_HPP + +#include <stdint.h> +#include <Kokkos_Atomic.hpp> +#include <impl/Kokkos_BitOps.hpp> +#include <impl/Kokkos_ClockTic.hpp> + +namespace Kokkos { +namespace Impl { + +struct concurrent_bitset { + public: + // 32 bits per integer value + + enum : uint32_t { bits_per_int_lg2 = 5 }; + enum : uint32_t { bits_per_int_mask = (1 << bits_per_int_lg2) - 1 }; + + // Buffer is uint32_t[ buffer_bound ] + // [ uint32_t { state_header | used_count } , uint32_t bits[*] ] + // + // Maximum bit count is 33 million (1u<<25): + // + // - Maximum bit set size occupies 1 Mbyte + // + // - State header can occupy bits [30-26] + // which can be the bit_count_lg2 + // + // - Accept at least 33 million concurrent calls to 'acquire' + // before risking an overflow race condition on a full bitset. 
+ + enum : uint32_t { max_bit_count_lg2 = 25 }; + enum : uint32_t { max_bit_count = 1u << max_bit_count_lg2 }; + enum : uint32_t { state_shift = 26 }; + enum : uint32_t { state_used_mask = (1 << state_shift) - 1 }; + enum : uint32_t { state_header_mask = uint32_t(0x001f) << state_shift }; + + KOKKOS_INLINE_FUNCTION static constexpr uint32_t buffer_bound_lg2( + uint32_t const bit_bound_lg2) noexcept { + return bit_bound_lg2 <= max_bit_count_lg2 + ? 1 + (1u << (bit_bound_lg2 > bits_per_int_lg2 + ? bit_bound_lg2 - bits_per_int_lg2 + : 0)) + : 0; + } + + /**\brief Initialize bitset buffer */ + KOKKOS_INLINE_FUNCTION static constexpr uint32_t buffer_bound( + uint32_t const bit_bound) noexcept { + return bit_bound <= max_bit_count + ? 1 + (bit_bound >> bits_per_int_lg2) + + (bit_bound & bits_per_int_mask ? 1 : 0) + : 0; + } + + /**\brief Claim any bit within the bitset bound. + * + * Return : ( which_bit , bit_count ) + * + * if success then + * bit_count is the atomic-count of claimed > 0 + * which_bit is the claimed bit >= 0 + * else if attempt failed due to filled buffer + * bit_count == which_bit == -1 + * else if attempt failed due to non-matching state_header + * bit_count == which_bit == -2 + * else if attempt failed due to max_bit_count_lg2 < bit_bound_lg2 + * or invalid state_header + * or (1u << bit_bound_lg2) <= bit + * bit_count == which_bit == -3 + * endif + * + * Recommended to have hint + * bit = Kokkos::Impl::clock_tic() & ((1u<<bit_bound_lg2) - 1) + */ + KOKKOS_INLINE_FUNCTION static Kokkos::pair<int, int> acquire_bounded_lg2( + uint32_t volatile *const buffer, uint32_t const bit_bound_lg2, + uint32_t bit = 0 /* optional hint */ + , + uint32_t const state_header = 0 /* optional header */ + ) noexcept { + using type = Kokkos::pair<int, int>; + + const uint32_t bit_bound = 1 << bit_bound_lg2; + const uint32_t word_count = bit_bound >> bits_per_int_lg2; + + if ((max_bit_count_lg2 < bit_bound_lg2) || + (state_header & ~state_header_mask) || (bit_bound < 
bit)) { + return type(-3, -3); + } + + // Use potentially two fetch_add to avoid CAS loop. + // Could generate "racing" failure-to-acquire + // when is full at the atomic_fetch_add(+1) + // then a release occurs before the atomic_fetch_add(-1). + + const uint32_t state = + (uint32_t)Kokkos::atomic_fetch_add((volatile int *)buffer, 1); + + const uint32_t state_error = state_header != (state & state_header_mask); + + const uint32_t state_bit_used = state & state_used_mask; + + if (state_error || (bit_bound <= state_bit_used)) { + Kokkos::atomic_fetch_add((volatile int *)buffer, -1); + return state_error ? type(-2, -2) : type(-1, -1); + } + + // Do not update bit until count is visible: + + Kokkos::memory_fence(); + + // There is a zero bit available somewhere, + // now find the (first) available bit and set it. + + while (1) { + const uint32_t word = bit >> bits_per_int_lg2; + const uint32_t mask = 1u << (bit & bits_per_int_mask); + const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + + if (!(prev & mask)) { + // Successfully claimed 'result.first' by + // atomically setting that bit. + return type(bit, state_bit_used + 1); + } + + // Failed race to set the selected bit + // Find a new bit to try. + + const int j = Kokkos::Impl::bit_first_zero(prev); + + if (0 <= j) { + bit = (word << bits_per_int_lg2) | uint32_t(j); + } else { + bit = ((word + 1) < word_count ? ((word + 1) << bits_per_int_lg2) : 0) | + (bit & bits_per_int_mask); + } + } + } + + /**\brief Claim any bit within the bitset bound. 
+ * + * Return : ( which_bit , bit_count ) + * + * if success then + * bit_count is the atomic-count of claimed > 0 + * which_bit is the claimed bit >= 0 + * else if attempt failed due to filled buffer + * bit_count == which_bit == -1 + * else if attempt failed due to non-matching state_header + * bit_count == which_bit == -2 + * else if attempt failed due to max_bit_count_lg2 < bit_bound_lg2 + * or invalid state_header + * or bit_bound <= bit + * bit_count == which_bit == -3 + * endif + * + * Recommended to have hint + * bit = Kokkos::Impl::clock_tic() % bit_bound + */ + KOKKOS_INLINE_FUNCTION static Kokkos::pair<int, int> acquire_bounded( + uint32_t volatile *const buffer, uint32_t const bit_bound, + uint32_t bit = 0 /* optional hint */ + , + uint32_t const state_header = 0 /* optional header */ + ) noexcept { + using type = Kokkos::pair<int, int>; + + if ((max_bit_count < bit_bound) || (state_header & ~state_header_mask) || + (bit_bound <= bit)) { + return type(-3, -3); + } + + const uint32_t word_count = bit_bound >> bits_per_int_lg2; + + // Use potentially two fetch_add to avoid CAS loop. + // Could generate "racing" failure-to-acquire + // when is full at the atomic_fetch_add(+1) + // then a release occurs before the atomic_fetch_add(-1). + + const uint32_t state = + (uint32_t)Kokkos::atomic_fetch_add((volatile int *)buffer, 1); + + const uint32_t state_error = state_header != (state & state_header_mask); + + const uint32_t state_bit_used = state & state_used_mask; + + if (state_error || (bit_bound <= state_bit_used)) { + Kokkos::atomic_fetch_add((volatile int *)buffer, -1); + return state_error ? type(-2, -2) : type(-1, -1); + } + + // Do not update bit until count is visible: + + Kokkos::memory_fence(); + + // There is a zero bit available somewhere, + // now find the (first) available bit and set it. 
+ + while (1) { + const uint32_t word = bit >> bits_per_int_lg2; + const uint32_t mask = 1u << (bit & bits_per_int_mask); + const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + + if (!(prev & mask)) { + // Successfully claimed 'result.first' by + // atomically setting that bit. + // Flush the set operation. Technically this only needs to be acquire/ + // release semantics and not sequentially consistent, but for now + // we'll just do this. + Kokkos::memory_fence(); + return type(bit, state_bit_used + 1); + } + + // Failed race to set the selected bit + // Find a new bit to try. + + const int j = Kokkos::Impl::bit_first_zero(prev); + + if (0 <= j) { + bit = (word << bits_per_int_lg2) | uint32_t(j); + } + + if ((j < 0) || (bit_bound <= bit)) { + bit = ((word + 1) < word_count ? ((word + 1) << bits_per_int_lg2) : 0) | + (bit & bits_per_int_mask); + } + } + } + + /**\brief + * + * Requires: 'bit' previously acquired and has not yet been released. + * + * Returns: + * 0 <= used count after successful release + * -1 bit was already released + * -2 state_header error + */ + KOKKOS_INLINE_FUNCTION static int release( + uint32_t volatile *const buffer, uint32_t const bit, + uint32_t const state_header = 0 /* optional header */ + ) noexcept { + if (state_header != (state_header_mask & *buffer)) { + return -2; + } + + const uint32_t mask = 1u << (bit & bits_per_int_mask); + const uint32_t prev = + Kokkos::atomic_fetch_and(buffer + (bit >> bits_per_int_lg2) + 1, ~mask); + + if (!(prev & mask)) { + return -1; + } + + // Do not update count until bit clear is visible + Kokkos::memory_fence(); + + const int count = Kokkos::atomic_fetch_add((volatile int *)buffer, -1); + + // Flush the store-release + Kokkos::memory_fence(); + + return (count & state_used_mask) - 1; + } + + /**\brief + * + * Requires: Bit within bounds and not already set. 
+ * + * Returns: + * 0 <= used count after successful release + * -1 bit was already released + * -2 bit or state_header error + */ + KOKKOS_INLINE_FUNCTION static int set( + uint32_t volatile *const buffer, uint32_t const bit, + uint32_t const state_header = 0 /* optional header */ + ) noexcept { + if (state_header != (state_header_mask & *buffer)) { + return -2; + } + + const uint32_t mask = 1u << (bit & bits_per_int_mask); + const uint32_t prev = + Kokkos::atomic_fetch_or(buffer + (bit >> bits_per_int_lg2) + 1, mask); + + if (!(prev & mask)) { + return -1; + } + + // Do not update count until bit clear is visible + Kokkos::memory_fence(); + + const int count = Kokkos::atomic_fetch_add((volatile int *)buffer, -1); + + return (count & state_used_mask) - 1; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_CONCURRENTBITSET_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b4769fbeaa53be8353df315ede634708da1b297d --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp @@ -0,0 +1,1146 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_ExecSpaceInitializer.hpp> +#include <cctype> +#include <cstring> +#include <iostream> +#include <sstream> +#include <cstdlib> +#include <stack> +#include <functional> +#include <list> +#include <cerrno> +#include <regex> +#ifndef _WIN32 +#include <unistd.h> +#else +#include <Windows.h> +#endif + +//---------------------------------------------------------------------------- +namespace { +bool g_is_initialized = false; +bool g_show_warnings = true; +bool g_tune_internals = false; +// When compiling with clang/LLVM and using the GNU (GCC) C++ Standard Library +// (any recent version between GCC 7.3 and GCC 9.2), std::deque SEGV's during +// the unwinding of the atexit(3C) handlers at program termination. However, +// this bug is not observable when building with GCC. +// As an added bonus, std::list<T> provides constant insertion and +// deletion time complexity, which translates to better run-time performance. As +// opposed to std::deque<T> which does not provide the same constant time +// complexity for inserts/removals, since std::deque<T> is implemented as a +// segmented array. 
+using hook_function_type = std::function<void()>; +std::stack<hook_function_type, std::list<hook_function_type>> finalize_hooks; +} // namespace + +namespace Kokkos { +namespace Impl { +/** + * The category is only used in printing, tools + * get all metadata free of category + */ +using metadata_category_type = std::string; +using metadata_key_type = std::string; +using metadata_value_type = std::string; + +std::map<metadata_category_type, + std::map<metadata_key_type, metadata_value_type>> + metadata_map; + +void declare_configuration_metadata(const std::string& category, + const std::string& key, + const std::string& value) { + metadata_map[category][key] = value; +} + +ExecSpaceManager& ExecSpaceManager::get_instance() { + static ExecSpaceManager space_initializer = {}; + return space_initializer; +} + +void ExecSpaceManager::register_space_factory( + const std::string name, std::unique_ptr<ExecSpaceInitializerBase> space) { + exec_space_factory_list[name] = std::move(space); +} + +void ExecSpaceManager::initialize_spaces(const Kokkos::InitArguments& args) { + // Note: the names of the execution spaces, used as keys in the map, encode + // the ordering of the initialization code from the old initializtion stuff. + // Eventually, we may want to do something less brittle than this, but for now + // we're just preserving compatibility with the old implementation. 
+ for (auto& to_init : exec_space_factory_list) { + to_init.second->initialize(args); + } +} + +void ExecSpaceManager::finalize_spaces(const bool all_spaces) { + for (auto& to_finalize : exec_space_factory_list) { + to_finalize.second->finalize(all_spaces); + } +} + +void ExecSpaceManager::static_fence() { + for (auto& to_fence : exec_space_factory_list) { + to_fence.second->fence(); + } +} +void ExecSpaceManager::print_configuration(std::ostream& msg, + const bool detail) { + for (auto& to_print : exec_space_factory_list) { + to_print.second->print_configuration(msg, detail); + } +} + +int get_ctest_gpu(const char* local_rank_str) { + auto const* ctest_kokkos_device_type = + std::getenv("CTEST_KOKKOS_DEVICE_TYPE"); + if (!ctest_kokkos_device_type) { + return 0; + } + + auto const* ctest_resource_group_count_str = + std::getenv("CTEST_RESOURCE_GROUP_COUNT"); + if (!ctest_resource_group_count_str) { + return 0; + } + + // Make sure rank is within bounds of resource groups specified by CTest + auto resource_group_count = std::stoi(ctest_resource_group_count_str); + auto local_rank = std::stoi(local_rank_str); + if (local_rank >= resource_group_count) { + std::ostringstream ss; + ss << "Error: local rank " << local_rank + << " is outside the bounds of resource groups provided by CTest. Raised" + << " by Kokkos::Impl::get_ctest_gpu()."; + throw_runtime_exception(ss.str()); + } + + // Get the resource types allocated to this resource group + std::ostringstream ctest_resource_group; + ctest_resource_group << "CTEST_RESOURCE_GROUP_" << local_rank; + std::string ctest_resource_group_name = ctest_resource_group.str(); + auto const* ctest_resource_group_str = + std::getenv(ctest_resource_group_name.c_str()); + if (!ctest_resource_group_str) { + std::ostringstream ss; + ss << "Error: " << ctest_resource_group_name << " is not specified. 
Raised" + << " by Kokkos::Impl::get_ctest_gpu()."; + throw_runtime_exception(ss.str()); + } + + // Look for the device type specified in CTEST_KOKKOS_DEVICE_TYPE + bool found_device = false; + std::string ctest_resource_group_cxx_str = ctest_resource_group_str; + std::istringstream instream(ctest_resource_group_cxx_str); + while (true) { + std::string devName; + std::getline(instream, devName, ','); + if (devName == ctest_kokkos_device_type) { + found_device = true; + break; + } + if (instream.eof() || devName.length() == 0) { + break; + } + } + + if (!found_device) { + std::ostringstream ss; + ss << "Error: device type '" << ctest_kokkos_device_type + << "' not included in " << ctest_resource_group_name + << ". Raised by Kokkos::Impl::get_ctest_gpu()."; + throw_runtime_exception(ss.str()); + } + + // Get the device ID + std::string ctest_device_type_upper = ctest_kokkos_device_type; + for (auto& c : ctest_device_type_upper) { + c = std::toupper(c); + } + ctest_resource_group << "_" << ctest_device_type_upper; + + std::string ctest_resource_group_id_name = ctest_resource_group.str(); + auto resource_str = std::getenv(ctest_resource_group_id_name.c_str()); + if (!resource_str) { + std::ostringstream ss; + ss << "Error: " << ctest_resource_group_id_name + << " is not specified. Raised by Kokkos::Impl::get_ctest_gpu()."; + throw_runtime_exception(ss.str()); + } + + auto const* comma = std::strchr(resource_str, ','); + if (!comma || strncmp(resource_str, "id:", 3)) { + std::ostringstream ss; + ss << "Error: invalid value of " << ctest_resource_group_id_name << ": '" + << resource_str << "'. 
Raised by Kokkos::Impl::get_ctest_gpu()."; + throw_runtime_exception(ss.str()); + } + + std::string id(resource_str + 3, comma - resource_str - 3); + return std::stoi(id.c_str()); +} + +// function to extract gpu # from args +int get_gpu(const InitArguments& args) { + int use_gpu = args.device_id; + const int ndevices = [](int num_devices) -> int { + if (num_devices > 0) return num_devices; +#if defined(KOKKOS_ENABLE_CUDA) + return Cuda::detect_device_count(); +#elif defined(KOKKOS_ENABLE_HIP) + return Experimental::HIP::detect_device_count(); +#elif defined(KOKKOS_ENABLE_SYCL) + return sycl::device::get_devices(sycl::info::device_type::gpu).size(); +#else + return num_devices; +#endif + }(args.ndevices); + const int skip_device = args.skip_device; + + // if the exact device is not set, but ndevices was given, assign round-robin + // using on-node MPI rank + if (use_gpu < 0) { + auto const* local_rank_str = + std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); // OpenMPI + if (!local_rank_str) + local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); // MVAPICH2 + if (!local_rank_str) + local_rank_str = std::getenv("SLURM_LOCALID"); // SLURM + + auto const* ctest_kokkos_device_type = + std::getenv("CTEST_KOKKOS_DEVICE_TYPE"); // CTest + auto const* ctest_resource_group_count_str = + std::getenv("CTEST_RESOURCE_GROUP_COUNT"); // CTest + if (ctest_kokkos_device_type && ctest_resource_group_count_str && + local_rank_str) { + // Use the device assigned by CTest + use_gpu = get_ctest_gpu(local_rank_str); + } else if (ndevices > 0) { + // Use the device assigned by the rank + if (local_rank_str) { + auto local_rank = std::stoi(local_rank_str); + use_gpu = local_rank % ndevices; + } else { + // user only gave use ndevices, but the MPI environment variable wasn't + // set. 
start with GPU 0 at this point + use_gpu = 0; + } + } + // shift assignments over by one so no one is assigned to "skip_device" + if (use_gpu >= skip_device) ++use_gpu; + } + return use_gpu; +} +namespace { +bool is_unsigned_int(const char* str) { + const size_t len = strlen(str); + for (size_t i = 0; i < len; ++i) { + if (!isdigit(str[i])) { + return false; + } + } + return true; +} + +void initialize_backends(const InitArguments& args) { +// This is an experimental setting +// For KNL in Flat mode this variable should be set, so that +// memkind allocates high bandwidth memory correctly. +#ifdef KOKKOS_ENABLE_HBWSPACE + setenv("MEMKIND_HBW_NODES", "1", 0); +#endif + + Impl::ExecSpaceManager::get_instance().initialize_spaces(args); +} + +void initialize_profiling(const InitArguments& args) { + Kokkos::Profiling::initialize(args.tool_lib); + if (args.tool_help) { + if (!Kokkos::Tools::printHelp(args.tool_args)) { + std::cerr << "Tool has not provided a help message" << std::endl; + } + g_is_initialized = true; + ::Kokkos::finalize(); + std::exit(EXIT_SUCCESS); + } + Kokkos::Tools::parseArgs(args.tool_args); + for (const auto& category_value : Kokkos::Impl::metadata_map) { + for (const auto& key_value : category_value.second) { + Kokkos::Tools::declareMetadata(key_value.first, key_value.second); + } + } +} + +std::string version_string_from_int(int version_number) { + std::stringstream str_builder; + str_builder << version_number / 10000 << "." << (version_number % 10000) / 100 + << "." 
<< version_number % 100; + return str_builder.str(); +} +void pre_initialize_internal(const InitArguments& args) { + if (args.disable_warnings) g_show_warnings = false; + if (args.tune_internals) g_tune_internals = true; + declare_configuration_metadata("version_info", "Kokkos Version", + version_string_from_int(KOKKOS_VERSION)); +#ifdef KOKKOS_COMPILER_APPLECC + declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_APPLECC", + std::to_string(KOKKOS_COMPILER_APPLECC)); + declare_configuration_metadata("tools_only", "compiler_family", "apple"); +#endif +#ifdef KOKKOS_COMPILER_CLANG + declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_CLANG", + std::to_string(KOKKOS_COMPILER_CLANG)); + declare_configuration_metadata("tools_only", "compiler_family", "clang"); +#endif +#ifdef KOKKOS_COMPILER_CRAYC + declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_CRAYC", + std::to_string(KOKKOS_COMPILER_CRAYC)); + declare_configuration_metadata("tools_only", "compiler_family", "cray"); +#endif +#ifdef KOKKOS_COMPILER_GNU + declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_GNU", + std::to_string(KOKKOS_COMPILER_GNU)); + declare_configuration_metadata("tools_only", "compiler_family", "gnu"); +#endif +#ifdef KOKKOS_COMPILER_IBM + declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_IBM", + std::to_string(KOKKOS_COMPILER_IBM)); + declare_configuration_metadata("tools_only", "compiler_family", "ibm"); +#endif +#ifdef KOKKOS_COMPILER_INTEL + declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_INTEL", + std::to_string(KOKKOS_COMPILER_INTEL)); + declare_configuration_metadata("tools_only", "compiler_family", "intel"); +#endif +#ifdef KOKKOS_COMPILER_NVCC + declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_NVCC", + std::to_string(KOKKOS_COMPILER_NVCC)); + declare_configuration_metadata("tools_only", "compiler_family", "nvcc"); +#endif +#ifdef KOKKOS_COMPILER_PGI + 
declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_PGI", + std::to_string(KOKKOS_COMPILER_PGI)); + declare_configuration_metadata("tools_only", "compiler_family", "pgi"); +#endif +#ifdef KOKKOS_COMPILER_MSVC + declare_configuration_metadata("compiler_version", "KOKKOS_COMPILER_MSVC", + std::to_string(KOKKOS_COMPILER_MSVC)); + declare_configuration_metadata("tools_only", "compiler_family", "msvc"); +#endif +#ifdef KOKKOS_ENABLE_ISA_KNC + declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_KNC", + "yes"); +#else + declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_KNC", "no"); +#endif +#ifdef KOKKOS_ENABLE_ISA_POWERPCLE + declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_POWERPCLE", + "yes"); +#else + declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_POWERPCLE", + "no"); +#endif +#ifdef KOKKOS_ENABLE_ISA_X86_64 + declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_X86_64", + "yes"); +#else + declare_configuration_metadata("architecture", "KOKKOS_ENABLE_ISA_X86_64", + "no"); +#endif + +#ifdef KOKKOS_ENABLE_GNU_ATOMICS + declare_configuration_metadata("atomics", "KOKKOS_ENABLE_GNU_ATOMICS", "yes"); +#else + declare_configuration_metadata("atomics", "KOKKOS_ENABLE_GNU_ATOMICS", "no"); +#endif +#ifdef KOKKOS_ENABLE_INTEL_ATOMICS + declare_configuration_metadata("atomics", "KOKKOS_ENABLE_INTEL_ATOMICS", + "yes"); +#else + declare_configuration_metadata("atomics", "KOKKOS_ENABLE_INTEL_ATOMICS", + "no"); +#endif +#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS + declare_configuration_metadata("atomics", "KOKKOS_ENABLE_WINDOWS_ATOMICS", + "yes"); +#else + declare_configuration_metadata("atomics", "KOKKOS_ENABLE_WINDOWS_ATOMICS", + "no"); +#endif + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP + declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_IVDEP", + "yes"); +#else + declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_IVDEP", + "no"); +#endif +#ifdef 
KOKKOS_ENABLE_PRAGMA_LOOPCOUNT + declare_configuration_metadata("vectorization", + "KOKKOS_ENABLE_PRAGMA_LOOPCOUNT", "yes"); +#else + declare_configuration_metadata("vectorization", + "KOKKOS_ENABLE_PRAGMA_LOOPCOUNT", "no"); +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_SIMD + declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_SIMD", + "yes"); +#else + declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_SIMD", + "no"); +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_UNROLL", + "yes"); +#else + declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_UNROLL", + "no"); +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR + declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_VECTOR", + "yes"); +#else + declare_configuration_metadata("vectorization", "KOKKOS_ENABLE_PRAGMA_VECTOR", + "no"); +#endif + +#ifdef KOKKOS_ENABLE_HBWSPACE + declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "yes"); +#else + declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "no"); +#endif +#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC + declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", + "yes"); +#else + declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", + "no"); +#endif +#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN + declare_configuration_metadata("memory", "KOKKOS_ENABLE_POSIX_MEMALIGN", + "yes"); +#else + declare_configuration_metadata("memory", "KOKKOS_ENABLE_POSIX_MEMALIGN", + "no"); +#endif + +#ifdef KOKKOS_ENABLE_ASM + declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "no"); +#endif +#ifdef KOKKOS_ENABLE_CXX14 + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX14", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX14", "no"); +#endif +#ifdef KOKKOS_ENABLE_CXX17 + 
declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX17", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX17", "no"); +#endif +#ifdef KOKKOS_ENABLE_CXX20 + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX20", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX20", "no"); +#endif +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK", + "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK", + "no"); +#endif +#ifdef KOKKOS_ENABLE_HWLOC + declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "no"); +#endif +#ifdef KOKKOS_ENABLE_LIBRT + declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no"); +#endif +#ifdef KOKKOS_ENABLE_MPI + declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "no"); +#endif + declare_configuration_metadata("architecture", "Default Device", + typeid(Kokkos::DefaultExecutionSpace).name()); +} + +void post_initialize_internal(const InitArguments& args) { + initialize_profiling(args); + g_is_initialized = true; +} + +void initialize_internal(const InitArguments& args) { + pre_initialize_internal(args); + initialize_backends(args); + post_initialize_internal(args); +} + +void finalize_internal(const bool all_spaces = false) { + typename decltype(finalize_hooks)::size_type numSuccessfulCalls = 0; + while (!finalize_hooks.empty()) { + auto f = finalize_hooks.top(); + try { + f(); + } catch (...) { + std::cerr << "Kokkos::finalize: A finalize hook (set via " + "Kokkos::push_finalize_hook) threw an exception that it did " + "not catch." + " Per std::atexit rules, this results in std::terminate. 
" + "This is " + "finalize hook number " + << numSuccessfulCalls + << " (1-based indexing) " + "out of " + << finalize_hooks.size() + << " to call. Remember that " + "Kokkos::finalize calls finalize hooks in reverse order " + "from how they " + "were pushed." + << std::endl; + std::terminate(); + } + finalize_hooks.pop(); + ++numSuccessfulCalls; + } + + Kokkos::Profiling::finalize(); + + Impl::ExecSpaceManager::get_instance().finalize_spaces(all_spaces); + + g_is_initialized = false; + g_show_warnings = true; + g_tune_internals = false; +} + +void fence_internal() { Impl::ExecSpaceManager::get_instance().static_fence(); } + +bool check_arg(char const* arg, char const* expected) { + std::size_t arg_len = std::strlen(arg); + std::size_t exp_len = std::strlen(expected); + if (arg_len < exp_len) return false; + if (std::strncmp(arg, expected, exp_len) != 0) return false; + if (arg_len == exp_len) return true; + /* if expected is "--threads", ignore "--threads-for-application" + by checking this character ---------^ + to see if it continues to make a longer name */ + if (std::isalnum(arg[exp_len]) || arg[exp_len] == '-' || + arg[exp_len] == '_') { + return false; + } + return true; +} + +bool check_int_arg(char const* arg, char const* expected, int* value) { + if (!check_arg(arg, expected)) return false; + std::size_t arg_len = std::strlen(arg); + std::size_t exp_len = std::strlen(expected); + bool okay = true; + if (arg_len == exp_len || arg[exp_len] != '=') okay = false; + char const* number = arg + exp_len + 1; + if (!Impl::is_unsigned_int(number) || strlen(number) == 0) okay = false; + *value = std::stoi(number); + if (!okay) { + std::ostringstream ss; + ss << "Error: expecting an '=INT' after command line argument '" << expected + << "'"; + ss << ". 
Raised by Kokkos::initialize(int narg, char* argc[])."; + Impl::throw_runtime_exception(ss.str()); + } + return true; +} + +bool check_str_arg(char const* arg, char const* expected, std::string& value) { + if (!check_arg(arg, expected)) return false; + std::size_t arg_len = std::strlen(arg); + std::size_t exp_len = std::strlen(expected); + bool okay = true; + if (arg_len == exp_len || arg[exp_len] != '=') okay = false; + char const* remain = arg + exp_len + 1; + value = remain; + if (!okay) { + std::ostringstream ss; + ss << "Error: expecting an '=STRING' after command line argument '" + << expected << "'"; + ss << ". Raised by Kokkos::initialize(int narg, char* argc[])."; + Impl::throw_runtime_exception(ss.str()); + } + return true; +} + +void warn_deprecated_command_line_argument(std::string deprecated, + std::string valid) { + std::cerr + << "Warning: command line argument '" << deprecated + << "' is deprecated. Use '" << valid + << "' instead. Raised by Kokkos::initialize(int narg, char* argc[])." 
+ << std::endl; +} + +unsigned get_process_id() { +#ifdef _WIN32 + return unsigned(GetCurrentProcessId()); +#else + return unsigned(getpid()); +#endif +} + +void parse_command_line_arguments(int& narg, char* arg[], + InitArguments& arguments) { + auto& num_threads = arguments.num_threads; + auto& numa = arguments.num_numa; + auto& device = arguments.device_id; + auto& ndevices = arguments.ndevices; + auto& skip_device = arguments.skip_device; + auto& disable_warnings = arguments.disable_warnings; + auto& tune_internals = arguments.tune_internals; + auto& tool_help = arguments.tool_help; + auto& tool_args = arguments.tool_args; + auto& tool_lib = arguments.tool_lib; + + bool kokkos_threads_found = false; + bool kokkos_numa_found = false; + bool kokkos_device_found = false; + bool kokkos_ndevices_found = false; + + int iarg = 0; + + while (iarg < narg) { + if (check_int_arg(arg[iarg], "--kokkos-threads", &num_threads)) { + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + kokkos_threads_found = true; + narg--; + } else if (!kokkos_threads_found && + check_int_arg(arg[iarg], "--threads", &num_threads)) { + iarg++; + } else if (check_int_arg(arg[iarg], "--kokkos-numa", &numa)) { + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + kokkos_numa_found = true; + narg--; + } else if (!kokkos_numa_found && + check_int_arg(arg[iarg], "--numa", &numa)) { + iarg++; + } else if (check_int_arg(arg[iarg], "--kokkos-device-id", &device) || + check_int_arg(arg[iarg], "--kokkos-device", &device)) { + if (check_arg(arg[iarg], "--kokkos-device")) { + warn_deprecated_command_line_argument("--kokkos-device", + "--kokkos-device-id"); + } + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + kokkos_device_found = true; + narg--; + } else if (!kokkos_device_found && + (check_int_arg(arg[iarg], "--device-id", &device) || + check_int_arg(arg[iarg], "--device", &device))) { + if (check_arg(arg[iarg], "--device")) { + 
warn_deprecated_command_line_argument("--device", "--device-id"); + } + iarg++; + } else if (check_arg(arg[iarg], "--kokkos-num-devices") || + check_arg(arg[iarg], "--num-devices") || + check_arg(arg[iarg], "--kokkos-ndevices") || + check_arg(arg[iarg], "--ndevices")) { + if (check_arg(arg[iarg], "--ndevices")) { + warn_deprecated_command_line_argument("--ndevices", "--num-devices"); + } + if (check_arg(arg[iarg], "--kokkos-ndevices")) { + warn_deprecated_command_line_argument("--kokkos-ndevices", + "--kokkos-num-devices"); + } + // Find the number of device (expecting --device=XX) + if (!((strncmp(arg[iarg], "--kokkos-num-devices=", 21) == 0) || + (strncmp(arg[iarg], "--num-ndevices=", 14) == 0) || + (strncmp(arg[iarg], "--kokkos-ndevices=", 18) == 0) || + (strncmp(arg[iarg], "--ndevices=", 11) == 0))) + throw_runtime_exception( + "Error: expecting an '=INT[,INT]' after command line argument " + "'--num-devices/--kokkos-num-devices'. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + + char* num1 = strchr(arg[iarg], '=') + 1; + char* num2 = strpbrk(num1, ","); + int num1_len = num2 == nullptr ? strlen(num1) : num2 - num1; + char* num1_only = new char[num1_len + 1]; + strncpy(num1_only, num1, num1_len); + num1_only[num1_len] = '\0'; + + if (!is_unsigned_int(num1_only) || (strlen(num1_only) == 0)) { + throw_runtime_exception( + "Error: expecting an integer number after command line argument " + "'--kokkos-numdevices'. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + } + if (check_arg(arg[iarg], "--kokkos-num-devices") || + check_arg(arg[iarg], "--kokkos-ndevices") || !kokkos_ndevices_found) + ndevices = std::stoi(num1_only); + delete[] num1_only; + + if (num2 != nullptr) { + if ((!is_unsigned_int(num2 + 1)) || (strlen(num2) == 1)) + throw_runtime_exception( + "Error: expecting an integer number after command line argument " + "'--kokkos-num-devices=XX,'. 
Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + + if (check_arg(arg[iarg], "--kokkos-num-devices") || + check_arg(arg[iarg], "--kokkos-ndevices") || !kokkos_ndevices_found) + skip_device = std::stoi(num2 + 1); + } + + // Remove the --kokkos-num-devices argument from the list but leave + // --num-devices + if (check_arg(arg[iarg], "--kokkos-num-devices") || + check_arg(arg[iarg], "--kokkos-ndevices")) { + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + kokkos_ndevices_found = true; + narg--; + } else { + iarg++; + } + } else if (check_arg(arg[iarg], "--kokkos-disable-warnings")) { + disable_warnings = true; + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + narg--; + } else if (check_arg(arg[iarg], "--kokkos-tune-internals")) { + tune_internals = true; + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + narg--; + } else if (check_str_arg(arg[iarg], "--kokkos-tools-library", tool_lib)) { + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + narg--; + } else if (check_str_arg(arg[iarg], "--kokkos-tools-args", tool_args)) { + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + narg--; + // strip any leading and/or trailing quotes if they were retained in the + // string because this will very likely cause parsing issues for tools. 
+ // If the quotes are retained (via bypassing the shell): + // <EXE> --kokkos-tools-args="-c my example" + // would be tokenized as: + // "<EXE>" "\"-c" "my" "example\"" + // instead of: + // "<EXE>" "-c" "my" "example" + if (!tool_args.empty()) { + if (tool_args.front() == '"') tool_args = tool_args.substr(1); + if (tool_args.back() == '"') + tool_args = tool_args.substr(0, tool_args.length() - 1); + } + // add the name of the executable to the beginning + if (narg > 0) tool_args = std::string(arg[0]) + " " + tool_args; + } else if (check_arg(arg[iarg], "--kokkos-tools-help")) { + tool_help = true; + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + narg--; + } else if (check_arg(arg[iarg], "--kokkos-help") || + check_arg(arg[iarg], "--help")) { + auto const help_message = R"( + -------------------------------------------------------------------------------- + -------------Kokkos command line arguments-------------------------------------- + -------------------------------------------------------------------------------- + The following arguments exist also without prefix 'kokkos' (e.g. --help). + The prefixed arguments will be removed from the list by Kokkos::initialize(), + the non-prefixed ones are not removed. Prefixed versions take precedence over + non prefixed ones, and the last occurrence of an argument overwrites prior + settings. + + --kokkos-help : print this message + --kokkos-disable-warnings : disable kokkos warning messages + --kokkos-tune-internals : allow Kokkos to autotune policies and declare + tuning features through the tuning system. If + left off, Kokkos uses heuristics + --kokkos-threads=INT : specify total number of threads or + number of threads per NUMA region if + used in conjunction with '--numa' option. + --kokkos-numa=INT : specify number of NUMA regions used by process. + --kokkos-device-id=INT : specify device id to be used by Kokkos. + --kokkos-num-devices=INT[,INT] : used when running MPI jobs. 
Specify number of + devices per node to be used. Process to device + mapping happens by obtaining the local MPI rank + and assigning devices round-robin. The optional + second argument allows for an existing device + to be ignored. This is most useful on workstations + with multiple GPUs of which one is used to drive + screen output. + --kokkos-tools-library : Equivalent to KOKKOS_PROFILE_LIBRARY environment + variable. Must either be full path to library or + name of library if the path is present in the + runtime library search path (e.g. LD_LIBRARY_PATH) + --kokkos-tools-help : Query the (loaded) kokkos-tool for its command-line + option support (which should then be passed via + --kokkos-tools-args="...") + --kokkos-tools-args=STR : A single (quoted) string of options which will be + whitespace delimited and passed to the loaded + kokkos-tool as command-line arguments. E.g. + `<EXE> --kokkos-tools-args="-c input.txt"` will + pass `<EXE> -c input.txt` as argc/argv to tool + -------------------------------------------------------------------------------- +)"; + std::cout << help_message << std::endl; + + // Remove the --kokkos-help argument from the list but leave --help + if (check_arg(arg[iarg], "--kokkos-help")) { + for (int k = iarg; k < narg - 1; k++) { + arg[k] = arg[k + 1]; + } + narg--; + } else { + iarg++; + } + } else + iarg++; + } + if (tool_args.empty() && narg > 0) tool_args = arg[0]; +} + +void parse_environment_variables(InitArguments& arguments) { + auto& num_threads = arguments.num_threads; + auto& numa = arguments.num_numa; + auto& device = arguments.device_id; + auto& ndevices = arguments.ndevices; + auto& skip_device = arguments.skip_device; + auto& disable_warnings = arguments.disable_warnings; + auto& tune_internals = arguments.tune_internals; + auto& tool_lib = arguments.tool_lib; + char* endptr; + auto env_num_threads_str = std::getenv("KOKKOS_NUM_THREADS"); + if (env_num_threads_str != nullptr) { + errno = 0; + auto env_num_threads = 
std::strtol(env_num_threads_str, &endptr, 10); + if (endptr == env_num_threads_str) + Impl::throw_runtime_exception( + "Error: cannot convert KOKKOS_NUM_THREADS to an integer. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception( + "Error: KOKKOS_NUM_THREADS out of range of representable values by " + "an integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if ((num_threads != -1) && (env_num_threads != num_threads)) + Impl::throw_runtime_exception( + "Error: expecting a match between --kokkos-threads and " + "KOKKOS_NUM_THREADS if both are set. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + else + num_threads = env_num_threads; + } + auto env_numa_str = std::getenv("KOKKOS_NUMA"); + if (env_numa_str != nullptr) { + errno = 0; + auto env_numa = std::strtol(env_numa_str, &endptr, 10); + if (endptr == env_numa_str) + Impl::throw_runtime_exception( + "Error: cannot convert KOKKOS_NUMA to an integer. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception( + "Error: KOKKOS_NUMA out of range of representable values by an " + "integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if ((numa != -1) && (env_numa != numa)) + Impl::throw_runtime_exception( + "Error: expecting a match between --kokkos-numa and KOKKOS_NUMA if " + "both are set. Raised by Kokkos::initialize(int narg, char* " + "argc[])."); + else + numa = env_numa; + } + auto env_device_str = std::getenv("KOKKOS_DEVICE_ID"); + if (env_device_str != nullptr) { + errno = 0; + auto env_device = std::strtol(env_device_str, &endptr, 10); + if (endptr == env_device_str) + Impl::throw_runtime_exception( + "Error: cannot convert KOKKOS_DEVICE_ID to an integer. 
Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception( + "Error: KOKKOS_DEVICE_ID out of range of representable values by an " + "integer. Raised by Kokkos::initialize(int narg, char* argc[])."); + if ((device != -1) && (env_device != device)) + Impl::throw_runtime_exception( + "Error: expecting a match between --kokkos-device and " + "KOKKOS_DEVICE_ID if both are set. Raised by Kokkos::initialize(int " + "narg, char* argc[])."); + else + device = env_device; + } + auto env_rdevices_str = std::getenv("KOKKOS_RAND_DEVICES"); + auto env_ndevices_str = std::getenv("KOKKOS_NUM_DEVICES"); + if (env_ndevices_str != nullptr || env_rdevices_str != nullptr) { + errno = 0; + if (env_ndevices_str != nullptr && env_rdevices_str != nullptr) { + Impl::throw_runtime_exception( + "Error: cannot specify both KOKKOS_NUM_DEVICES and " + "KOKKOS_RAND_DEVICES. " + "Raised by Kokkos::initialize(int narg, char* argc[])."); + } + int rdevices = -1; + if (env_ndevices_str != nullptr) { + auto env_ndevices = std::strtol(env_ndevices_str, &endptr, 10); + if (endptr == env_ndevices_str) + Impl::throw_runtime_exception( + "Error: cannot convert KOKKOS_NUM_DEVICES to an integer. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception( + "Error: KOKKOS_NUM_DEVICES out of range of representable values by " + "an integer. Raised by Kokkos::initialize(int narg, char* " + "argc[])."); + if ((ndevices != -1) && (env_ndevices != ndevices)) + Impl::throw_runtime_exception( + "Error: expecting a match between --kokkos-ndevices and " + "KOKKOS_NUM_DEVICES if both are set. 
Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + else + ndevices = env_ndevices; + } else { // you set KOKKOS_RAND_DEVICES + auto env_rdevices = std::strtol(env_rdevices_str, &endptr, 10); + if (endptr == env_ndevices_str) + Impl::throw_runtime_exception( + "Error: cannot convert KOKKOS_RAND_DEVICES to an integer. Raised " + "by Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception( + "Error: KOKKOS_RAND_DEVICES out of range of representable values " + "by an integer. Raised by Kokkos::initialize(int narg, char* " + "argc[])."); + else + rdevices = env_rdevices; + } + // Skip device + auto env_skip_device_str = std::getenv("KOKKOS_SKIP_DEVICE"); + if (env_skip_device_str != nullptr) { + errno = 0; + auto env_skip_device = std::strtol(env_skip_device_str, &endptr, 10); + if (endptr == env_skip_device_str) + Impl::throw_runtime_exception( + "Error: cannot convert KOKKOS_SKIP_DEVICE to an integer. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + if (errno == ERANGE) + Impl::throw_runtime_exception( + "Error: KOKKOS_SKIP_DEVICE out of range of representable values by " + "an integer. Raised by Kokkos::initialize(int narg, char* " + "argc[])."); + if ((skip_device != 9999) && (env_skip_device != skip_device)) + Impl::throw_runtime_exception( + "Error: expecting a match between --kokkos-ndevices and " + "KOKKOS_SKIP_DEVICE if both are set. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + else + skip_device = env_skip_device; + } + if (rdevices > 0) { + if (skip_device > 0 && rdevices == 1) + Impl::throw_runtime_exception( + "Error: cannot KOKKOS_SKIP_DEVICE the only KOKKOS_RAND_DEVICE. 
" + "Raised by Kokkos::initialize(int narg, char* argc[])."); + + std::srand(get_process_id()); + while (device < 0) { + int test_device = std::rand() % rdevices; + if (test_device != skip_device) device = test_device; + } + } + } + char* env_disablewarnings_str = std::getenv("KOKKOS_DISABLE_WARNINGS"); + if (env_disablewarnings_str != nullptr) { + std::string env_str(env_disablewarnings_str); // deep-copies string + for (char& c : env_str) { + c = toupper(c); + } + const auto _rc = std::regex_constants::icase | std::regex_constants::egrep; + const auto _re = std::regex("^(true|on|yes|[1-9])$", _rc); + if (std::regex_match(env_str, _re)) + disable_warnings = true; + else if (disable_warnings) + Impl::throw_runtime_exception( + "Error: expecting a match between --kokkos-disable-warnings and " + "KOKKOS_DISABLE_WARNINGS if both are set. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + } + char* env_tuneinternals_str = std::getenv("KOKKOS_TUNE_INTERNALS"); + if (env_tuneinternals_str != nullptr) { + std::string env_str(env_tuneinternals_str); // deep-copies string + for (char& c : env_str) { + c = toupper(c); + } + if ((env_str == "TRUE") || (env_str == "ON") || (env_str == "1")) + tune_internals = true; + else if (tune_internals) + Impl::throw_runtime_exception( + "Error: expecting a match between --kokkos-tune-internals and " + "KOKKOS_TUNE_INTERNALS if both are set. Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + } + auto env_tool_lib = std::getenv("KOKKOS_PROFILE_LIBRARY"); + if (env_tool_lib != nullptr) { + if (!tool_lib.empty() && std::string(env_tool_lib) != tool_lib) + Impl::throw_runtime_exception( + "Error: expecting a match between --kokkos-tools-library and " + "KOKKOS_PROFILE_LIBRARY if both are set. 
Raised by " + "Kokkos::initialize(int narg, char* argc[])."); + else + tool_lib = env_tool_lib; + } +} + +} // namespace + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +void initialize(int& narg, char* arg[]) { + InitArguments arguments; + Impl::parse_command_line_arguments(narg, arg, arguments); + Impl::parse_environment_variables(arguments); + Impl::initialize_internal(arguments); +} + +void initialize(InitArguments arguments) { + Impl::parse_environment_variables(arguments); + Impl::initialize_internal(arguments); +} + +namespace Impl { + +void pre_initialize(const InitArguments& args) { + pre_initialize_internal(args); +} + +void post_initialize(const InitArguments& args) { + post_initialize_internal(args); +} + +} // namespace Impl + +void push_finalize_hook(std::function<void()> f) { finalize_hooks.push(f); } + +void finalize() { Impl::finalize_internal(); } + +void finalize_all() { + enum : bool { all_spaces = true }; + Impl::finalize_internal(all_spaces); +} + +void fence() { Impl::fence_internal(); } + +void print_helper(std::ostringstream& out, + const std::map<std::string, std::string>& print_me) { + for (const auto& kv : print_me) { + out << kv.first << ": " << kv.second << '\n'; + } +} + +void print_configuration(std::ostream& out, const bool detail) { + std::ostringstream msg; + + print_helper(msg, Kokkos::Impl::metadata_map["version_info"]); + + msg << "Compiler:" << std::endl; + print_helper(msg, Kokkos::Impl::metadata_map["compiler_version"]); + + msg << "Architecture:" << std::endl; + print_helper(msg, Kokkos::Impl::metadata_map["architecture"]); + + msg << "Atomics:" << std::endl; + print_helper(msg, Kokkos::Impl::metadata_map["atomics"]); + + msg << "Vectorization:" << std::endl; + print_helper(msg, Kokkos::Impl::metadata_map["vectorization"]); + + msg << "Memory:" << std::endl; + print_helper(msg, Kokkos::Impl::metadata_map["memory"]); + + 
msg << "Options:" << std::endl; + print_helper(msg, Kokkos::Impl::metadata_map["options"]); + + Impl::ExecSpaceManager::get_instance().print_configuration(msg, detail); + + out << msg.str() << std::endl; +} + +bool is_initialized() noexcept { return g_is_initialized; } + +bool show_warnings() noexcept { return g_show_warnings; } +bool tune_internals() noexcept { return g_tune_internals; } + +#ifdef KOKKOS_COMPILER_PGI +namespace Impl { +// Bizzarely, an extra jump instruction forces the PGI compiler to not have a +// bug related to (probably?) empty base optimization and/or aggregate +// construction. +void _kokkos_pgi_compiler_bug_workaround() {} +} // end namespace Impl +#endif + +} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1edcca8f5e45dd2de4724fff42f9d1d7be4d57af --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp @@ -0,0 +1,125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_HOST_GRAPHNODEKERNEL_HPP +#define KOKKOS_KOKKOS_HOST_GRAPHNODEKERNEL_HPP + +#include <Kokkos_Macros.hpp> + +#include <impl/Kokkos_Default_Graph_fwd.hpp> + +#include <Kokkos_Graph.hpp> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Parallel_Reduce.hpp> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="GraphNodeKernelImpl"> {{{1 + +template <class ExecutionSpace> +struct GraphNodeKernelDefaultImpl { + // TODO @graphs decide if this should use vtable or intrusive erasure via + // function pointers like in the rest of the graph interface + virtual void execute_kernel() = 0; +}; + +// TODO Indicate that this kernel specialization is only for the Host somehow? +template <class ExecutionSpace, class PolicyType, class Functor, + class PatternTag, class... 
Args> +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag<PatternTag, Functor, PolicyType, + Args..., ExecutionSpace>::type, + public GraphNodeKernelDefaultImpl<ExecutionSpace> { + public: + using base_t = + typename PatternImplSpecializationFromTag<PatternTag, Functor, PolicyType, + Args..., ExecutionSpace>::type; + using execute_kernel_vtable_base_t = + GraphNodeKernelDefaultImpl<ExecutionSpace>; + // We have to use this name here because that's how it was done way back when + // then implementations of Impl::Parallel*<> were written + using Policy = PolicyType; + using graph_kernel = GraphNodeKernelImpl; + + // TODO @graph kernel name info propagation + template <class PolicyDeduced, class... ArgsDeduced> + GraphNodeKernelImpl(std::string const&, ExecutionSpace const&, + Functor arg_functor, PolicyDeduced&& arg_policy, + ArgsDeduced&&... args) + : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, + (ArgsDeduced &&) args...), + execute_kernel_vtable_base_t() {} + + // FIXME @graph Forward through the instance once that works in the backends + template <class PolicyDeduced, class... ArgsDeduced> + GraphNodeKernelImpl(ExecutionSpace const& ex, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + : GraphNodeKernelImpl("", ex, std::move(arg_functor), + (PolicyDeduced &&) arg_policy, + (ArgsDeduced &&) args...) 
{} + + void execute_kernel() final { this->base_t::execute(); } +}; + +// </editor-fold> end GraphNodeKernelImpl }}}1 +//============================================================================== + +template <class ExecutionSpace> +struct GraphNodeAggregateKernelDefaultImpl + : GraphNodeKernelDefaultImpl<ExecutionSpace> { + // Aggregates don't need a policy, but for the purposes of checking the static + // assertions about graph kernels, + struct Policy { + using is_graph_kernel = std::true_type; + }; + using graph_kernel = GraphNodeAggregateKernelDefaultImpl; + void execute_kernel() final {} +}; + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_HOST_GRAPHNODEKERNEL_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..37041534718984b98e42f767a5bbde58f0490277 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp @@ -0,0 +1,170 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_HOST_GRAPHNODE_IMPL_HPP +#define KOKKOS_KOKKOS_HOST_GRAPHNODE_IMPL_HPP + +#include <Kokkos_Macros.hpp> + +#include <impl/Kokkos_Default_Graph_fwd.hpp> + +#include <Kokkos_Graph.hpp> + +#include <vector> +#include <memory> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="GraphNodeBackendSpecificDetails"> {{{1 + +template <class ExecutionSpace> +struct GraphNodeBackendSpecificDetails { + private: + using execution_space_instance_storage_t = + ExecutionSpaceInstanceStorage<ExecutionSpace>; + using default_kernel_impl_t = GraphNodeKernelDefaultImpl<ExecutionSpace>; + using default_aggregate_kernel_impl_t = + GraphNodeAggregateKernelDefaultImpl<ExecutionSpace>; + + std::vector<std::shared_ptr<GraphNodeBackendSpecificDetails<ExecutionSpace>>> + 
m_predecessors = {}; + + Kokkos::ObservingRawPtr<default_kernel_impl_t> m_kernel_ptr = nullptr; + + bool m_has_executed = false; + bool m_is_aggregate = false; + bool m_is_root = false; + + template <class> + friend struct HostGraphImpl; + + protected: + //---------------------------------------------------------------------------- + // <editor-fold desc="Ctors, destructor, and assignment"> {{{2 + + explicit GraphNodeBackendSpecificDetails() = default; + + explicit GraphNodeBackendSpecificDetails( + _graph_node_is_root_ctor_tag) noexcept + : m_has_executed(true), m_is_root(true) {} + + GraphNodeBackendSpecificDetails(GraphNodeBackendSpecificDetails const&) = + delete; + + GraphNodeBackendSpecificDetails(GraphNodeBackendSpecificDetails&&) noexcept = + delete; + + GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails const&) = delete; + + GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails&&) noexcept = delete; + + ~GraphNodeBackendSpecificDetails() = default; + + // </editor-fold> end Ctors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- + + public: + void set_kernel(default_kernel_impl_t& arg_kernel) { + KOKKOS_EXPECTS(m_kernel_ptr == nullptr) + m_kernel_ptr = &arg_kernel; + } + + void set_kernel(default_aggregate_kernel_impl_t& arg_kernel) { + KOKKOS_EXPECTS(m_kernel_ptr == nullptr) + m_kernel_ptr = &arg_kernel; + m_is_aggregate = true; + } + + void set_predecessor( + std::shared_ptr<GraphNodeBackendSpecificDetails<ExecutionSpace>> + arg_pred_impl) { + // This method delegates responsibility for executing the predecessor to + // this node. Each node can have at most one predecessor (which may be an + // aggregate). 
+ KOKKOS_EXPECTS(m_predecessors.empty() || m_is_aggregate) + KOKKOS_EXPECTS(bool(arg_pred_impl)) + KOKKOS_EXPECTS(!m_has_executed) + m_predecessors.push_back(std::move(arg_pred_impl)); + } + + void execute_node() { + // This node could have already been executed as the predecessor of some + // other + KOKKOS_EXPECTS(bool(m_kernel_ptr) || m_has_executed) + // Just execute the predecessor here, since calling set_predecessor() + // delegates the responsibility for running it to us. + if (!m_has_executed) { + // I'm pretty sure this doesn't need to be atomic under our current + // supported semantics, but instinct I have feels like it should be... + m_has_executed = true; + for (auto const& predecessor : m_predecessors) { + predecessor->execute_node(); + } + m_kernel_ptr->execute_kernel(); + } + KOKKOS_ENSURES(m_has_executed) + } + + // This is gross, but for the purposes of our simple default implementation... + void reset_has_executed() { + for (auto const& predecessor : m_predecessors) { + predecessor->reset_has_executed(); + } + // more readable, probably: + // if(!m_is_root) m_has_executed = false; + m_has_executed = m_is_root; + } +}; + +// </editor-fold> end GraphNodeBackendSpecificDetails }}}1 +//============================================================================== + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_HOST_GRAPHNODE_IMPL_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e4667ca0121849511a44f1ca49fe0ca9646b24d9 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -0,0 +1,197 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HOST_GRAPH_IMPL_HPP +#define KOKKOS_HOST_GRAPH_IMPL_HPP + +#include <Kokkos_ExecPolicy.hpp> +#include <Kokkos_Graph.hpp> + +#include <impl/Kokkos_GraphImpl_fwd.hpp> +#include <impl/Kokkos_Default_Graph_fwd.hpp> + +#include <Kokkos_Serial.hpp> +#include <Kokkos_OpenMP.hpp> +// FIXME @graph other backends? + +#include <impl/Kokkos_OptionalRef.hpp> +#include <impl/Kokkos_EBO.hpp> + +#include <set> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="GraphImpl default implementation"> {{{1 + +template <class ExecutionSpace> +struct GraphImpl : private ExecutionSpaceInstanceStorage<ExecutionSpace> { + public: + using root_node_impl_t = + GraphNodeImpl<ExecutionSpace, Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag>; + + private: + using execution_space_instance_storage_base_t = + ExecutionSpaceInstanceStorage<ExecutionSpace>; + + using node_details_t = GraphNodeBackendSpecificDetails<ExecutionSpace>; + std::set<std::shared_ptr<node_details_t>> m_sinks; + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="Constructors, destructor, and assignment"> {{{2 + + // Not moveable or copyable; it spends its whole live as a shared_ptr in the + // Graph object + GraphImpl() = default; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl const&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() = default; + + explicit GraphImpl(ExecutionSpace arg_space) + : execution_space_instance_storage_base_t(std::move(arg_space)) {} + + // </editor-fold> end Constructors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- + + 
ExecutionSpace const& get_execution_space() const { + return this + ->execution_space_instance_storage_base_t::execution_space_instance(); + } + + //---------------------------------------------------------------------------- + // <editor-fold desc="required customizations"> {{{2 + + template <class NodeImpl> + // requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl + void add_node(std::shared_ptr<NodeImpl> const& arg_node_ptr) { + static_assert( + NodeImpl::kernel_type::Policy::is_graph_kernel::value, + "Something has gone horribly wrong, but it's too complicated to " + "explain here. Buy Daisy a coffee and she'll explain it to you."); + // Since this is always called before any calls to add_predecessor involving + // it, we can treat this node as a sink until we discover otherwise. + arg_node_ptr->node_details_t::set_kernel(arg_node_ptr->get_kernel()); + auto spot = m_sinks.find(arg_node_ptr); + KOKKOS_ASSERT(spot == m_sinks.end()) + m_sinks.insert(std::move(spot), std::move(arg_node_ptr)); + } + + template <class NodeImplPtr, class PredecessorRef> + // requires PredecessorRef is a specialization of GraphNodeRef that has + // already been added to this graph and NodeImpl is a specialization of + // GraphNodeImpl that has already been added to this graph. 
+ void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { + auto node_ptr_spot = m_sinks.find(arg_node_ptr); + auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); + auto pred_ref_spot = m_sinks.find(pred_ptr); + KOKKOS_ASSERT(node_ptr_spot != m_sinks.end()) + if (pred_ref_spot != m_sinks.end()) { + // delegate responsibility for executing the predecessor to arg_node + // and then remove the predecessor from the set of sinks + (*node_ptr_spot)->set_predecessor(std::move(*pred_ref_spot)); + m_sinks.erase(pred_ref_spot); + } else { + // We still want to check that it's executed, even though someone else + // should have executed it before us + (*node_ptr_spot)->set_predecessor(std::move(pred_ptr)); + } + } + + template <class... PredecessorRefs> + // See requirements/expectations in GraphBuilder + auto create_aggregate_ptr(PredecessorRefs&&...) { + // The attachment to predecessors, which is all we really need, happens + // in the generic layer, which calls through to add_predecessor for + // each predecessor ref, so all we need to do here is create the (trivial) + // aggregate node. + using aggregate_kernel_impl_t = + GraphNodeAggregateKernelDefaultImpl<ExecutionSpace>; + using aggregate_node_impl_t = + GraphNodeImpl<ExecutionSpace, aggregate_kernel_impl_t, + Kokkos::Experimental::TypeErasedTag>; + return GraphAccess::make_node_shared_ptr<aggregate_node_impl_t>( + this->execution_space_instance(), _graph_node_kernel_ctor_tag{}, + aggregate_kernel_impl_t{}); + } + + auto create_root_node_ptr() { + auto rv = Kokkos::Impl::GraphAccess::make_node_shared_ptr<root_node_impl_t>( + get_execution_space(), _graph_node_is_root_ctor_tag{}); + m_sinks.insert(rv); + return rv; + } + + void submit() { + // This reset is gross, but for the purposes of our simple host + // implementation... 
+ for (auto& sink : m_sinks) { + sink->reset_has_executed(); + } + for (auto& sink : m_sinks) { + sink->execute_node(); + } + } + + // </editor-fold> end required customizations }}}2 + //---------------------------------------------------------------------------- +}; + +// </editor-fold> end GraphImpl default implementation }}}1 +//============================================================================== + +} // end namespace Impl + +} // end namespace Kokkos + +#include <OpenMP/Kokkos_OpenMP_Parallel.hpp> + +#include <impl/Kokkos_Default_GraphNodeKernel.hpp> +#include <impl/Kokkos_Default_GraphNode_Impl.hpp> + +#endif // KOKKOS_HOST_GRAPH_IMPL_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Default_Graph_fwd.hpp b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_fwd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cdbed72888288ffb71a3bf8ee4e3e837c3c90219 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Default_Graph_fwd.hpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_HOST_GRAPH_FWD_HPP +#define KOKKOS_KOKKOS_HOST_GRAPH_FWD_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { +namespace Impl { + +template <class ExecutionSpace> +struct GraphNodeKernelDefaultImpl; + +template <class ExecutionSpace> +struct GraphNodeAggregateKernelDefaultImpl; + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_HOST_GRAPH_FWD_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a124511c07e2fcb50c8392e56f7c5393262a3af7 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp @@ -0,0 +1,358 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EBO_HPP +#define KOKKOS_EBO_HPP + +//---------------------------------------------------------------------------- + +#include <Kokkos_Macros.hpp> + +#include <Kokkos_Core_fwd.hpp> +//---------------------------------------------------------------------------- + +#include <utility> +#include <type_traits> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <int I> +struct NotOnDeviceCtorDisambiguator {}; + +template <class... Args> +struct NoCtorsNotOnDevice : std::false_type {}; + +template <class... Args> +struct DefaultCtorNotOnDevice : std::false_type {}; + +template <> +struct DefaultCtorNotOnDevice<> : std::true_type {}; + +template <class T, bool Empty, + template <class...> class CtorNotOnDevice = NoCtorsNotOnDevice> +struct EBOBaseImpl; + +template <class T, template <class...> class CtorNotOnDevice> +struct EBOBaseImpl<T, true, CtorNotOnDevice> { + /* + * Workaround for constexpr in C++11: we need to still call T(args...), but we + * can't do so in the body of a constexpr function (in C++11), and there's no + * data member to construct into. But we can construct into an argument + * of a delegating constructor... + */ + // TODO @minor DSH the destructor gets called too early with this workaround + struct _constexpr_14_workaround_tag {}; + struct _constexpr_14_workaround_no_device_tag {}; + KOKKOS_FORCEINLINE_FUNCTION + constexpr EBOBaseImpl(_constexpr_14_workaround_tag, T&&) noexcept {} + inline constexpr EBOBaseImpl(_constexpr_14_workaround_no_device_tag, + T&&) noexcept {} + + template < + class... 
Args, class _ignored = void, + typename std::enable_if<std::is_void<_ignored>::value && + std::is_constructible<T, Args...>::value && + !CtorNotOnDevice<Args...>::value, + int>::type = 0> + KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( + Args&&... args) noexcept(noexcept(T(std::forward<Args>(args)...))) + // still call the constructor + : EBOBaseImpl(_constexpr_14_workaround_tag{}, + T(std::forward<Args>(args)...)) {} + + template < + class... Args, class _ignored = void, + typename std::enable_if<std::is_void<_ignored>::value && + std::is_constructible<T, Args...>::value && + CtorNotOnDevice<Args...>::value, + long>::type = 0> + inline constexpr explicit EBOBaseImpl(Args&&... args) noexcept( + noexcept(T(std::forward<Args>(args)...))) + // still call the constructor + : EBOBaseImpl(_constexpr_14_workaround_no_device_tag{}, + T(std::forward<Args>(args)...)) {} + + KOKKOS_DEFAULTED_FUNCTION + constexpr EBOBaseImpl(EBOBaseImpl const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + constexpr EBOBaseImpl(EBOBaseImpl&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + KOKKOS_CONSTEXPR_14 + EBOBaseImpl& operator=(EBOBaseImpl const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + KOKKOS_CONSTEXPR_14 + EBOBaseImpl& operator=(EBOBaseImpl&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + ~EBOBaseImpl() = default; + + KOKKOS_INLINE_FUNCTION + KOKKOS_CONSTEXPR_14 + T& _ebo_data_member() & { return *reinterpret_cast<T*>(this); } + + KOKKOS_INLINE_FUNCTION + constexpr T const& _ebo_data_member() const& { + return *reinterpret_cast<T const*>(this); + } + + KOKKOS_INLINE_FUNCTION + T volatile& _ebo_data_member() volatile& { + return *reinterpret_cast<T volatile*>(this); + } + + KOKKOS_INLINE_FUNCTION + T const volatile& _ebo_data_member() const volatile& { + return *reinterpret_cast<T const volatile*>(this); + } + + KOKKOS_INLINE_FUNCTION + KOKKOS_CONSTEXPR_14 + T&& _ebo_data_member() && { return std::move(*reinterpret_cast<T*>(this)); } +}; + +template <class T, template <class...> 
class CTorsNotOnDevice> +struct EBOBaseImpl<T, false, CTorsNotOnDevice> { + T m_ebo_object; + + template < + class... Args, class _ignored = void, + typename std::enable_if<std::is_void<_ignored>::value && + !CTorsNotOnDevice<Args...>::value && + std::is_constructible<T, Args...>::value, + int>::type = 0> + KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( + Args&&... args) noexcept(noexcept(T(std::forward<Args>(args)...))) + : m_ebo_object(std::forward<Args>(args)...) {} + + template < + class... Args, class _ignored = void, + typename std::enable_if<std::is_void<_ignored>::value && + CTorsNotOnDevice<Args...>::value && + std::is_constructible<T, Args...>::value, + long>::type = 0> + inline constexpr explicit EBOBaseImpl(Args&&... args) noexcept( + noexcept(T(std::forward<Args>(args)...))) + : m_ebo_object(std::forward<Args>(args)...) {} + + // TODO @tasking @minor DSH noexcept in the right places? + + KOKKOS_DEFAULTED_FUNCTION + constexpr EBOBaseImpl(EBOBaseImpl const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + constexpr EBOBaseImpl(EBOBaseImpl&&) noexcept = default; + + KOKKOS_DEFAULTED_FUNCTION + KOKKOS_CONSTEXPR_14 + EBOBaseImpl& operator=(EBOBaseImpl const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + KOKKOS_CONSTEXPR_14 + EBOBaseImpl& operator=(EBOBaseImpl&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + ~EBOBaseImpl() = default; + + KOKKOS_INLINE_FUNCTION + T& _ebo_data_member() & { return m_ebo_object; } + + KOKKOS_INLINE_FUNCTION + T const& _ebo_data_member() const& { return m_ebo_object; } + + KOKKOS_INLINE_FUNCTION + T volatile& _ebo_data_member() volatile& { return m_ebo_object; } + + KOKKOS_INLINE_FUNCTION + T const volatile& _ebo_data_member() const volatile& { return m_ebo_object; } + + KOKKOS_INLINE_FUNCTION + T&& _ebo_data_member() && { return std::move(m_ebo_object); } +}; + +/** + * + * @tparam T + */ +template <class T, + template <class...> class CtorsNotOnDevice = NoCtorsNotOnDevice> +struct StandardLayoutNoUniqueAddressMemberEmulation + : 
EBOBaseImpl<T, std::is_empty<T>::value, CtorsNotOnDevice> { + private: + using ebo_base_t = EBOBaseImpl<T, std::is_empty<T>::value, CtorsNotOnDevice>; + + public: + using ebo_base_t::ebo_base_t; + + KOKKOS_FORCEINLINE_FUNCTION + KOKKOS_CONSTEXPR_14 + T& no_unique_address_data_member() & { + return this->ebo_base_t::_ebo_data_member(); + } + + KOKKOS_FORCEINLINE_FUNCTION + constexpr T const& no_unique_address_data_member() const& { + return this->ebo_base_t::_ebo_data_member(); + } + + KOKKOS_FORCEINLINE_FUNCTION + T volatile& no_unique_address_data_member() volatile& { + return this->ebo_base_t::_ebo_data_member(); + } + + KOKKOS_FORCEINLINE_FUNCTION + T const volatile& no_unique_address_data_member() const volatile& { + return this->ebo_base_t::_ebo_data_member(); + } + + KOKKOS_FORCEINLINE_FUNCTION + KOKKOS_CONSTEXPR_14 + T&& no_unique_address_data_member() && { + return this->ebo_base_t::_ebo_data_member(); + } +}; + +/** + * + * @tparam T + */ +template <class T, + template <class...> class CtorsNotOnDevice = NoCtorsNotOnDevice> +class NoUniqueAddressMemberEmulation + : private StandardLayoutNoUniqueAddressMemberEmulation<T, + CtorsNotOnDevice> { + private: + using base_t = + StandardLayoutNoUniqueAddressMemberEmulation<T, CtorsNotOnDevice>; + + public: + using base_t::base_t; + using base_t::no_unique_address_data_member; +}; + +template <class ExecutionSpace> +class ExecutionSpaceInstanceStorage + : private NoUniqueAddressMemberEmulation<ExecutionSpace, + DefaultCtorNotOnDevice> { + private: + using base_t = + NoUniqueAddressMemberEmulation<ExecutionSpace, DefaultCtorNotOnDevice>; + + protected: + constexpr explicit ExecutionSpaceInstanceStorage() : base_t() {} + + KOKKOS_INLINE_FUNCTION + constexpr explicit ExecutionSpaceInstanceStorage( + ExecutionSpace const& arg_execution_space) + : base_t(arg_execution_space) {} + + KOKKOS_INLINE_FUNCTION + constexpr explicit ExecutionSpaceInstanceStorage( + ExecutionSpace&& arg_execution_space) + : 
base_t(std::move(arg_execution_space)) {} + + KOKKOS_INLINE_FUNCTION + ExecutionSpace& execution_space_instance() & { + return this->no_unique_address_data_member(); + } + + KOKKOS_INLINE_FUNCTION + ExecutionSpace const& execution_space_instance() const& { + return this->no_unique_address_data_member(); + } + + KOKKOS_INLINE_FUNCTION + ExecutionSpace&& execution_space_instance() && { + return std::move(*this).no_unique_address_data_member(); + } +}; + +template <class MemorySpace> +class MemorySpaceInstanceStorage + : private NoUniqueAddressMemberEmulation<MemorySpace, + DefaultCtorNotOnDevice> { + private: + using base_t = + NoUniqueAddressMemberEmulation<MemorySpace, DefaultCtorNotOnDevice>; + + protected: + MemorySpaceInstanceStorage() : base_t() {} + + KOKKOS_INLINE_FUNCTION + MemorySpaceInstanceStorage(MemorySpace const& arg_memory_space) + : base_t(arg_memory_space) {} + + KOKKOS_INLINE_FUNCTION + constexpr explicit MemorySpaceInstanceStorage(MemorySpace&& arg_memory_space) + : base_t(std::move(arg_memory_space)) {} + + KOKKOS_INLINE_FUNCTION + MemorySpace& memory_space_instance() & { + return this->no_unique_address_data_member(); + } + + KOKKOS_INLINE_FUNCTION + MemorySpace const& memory_space_instance() const& { + return this->no_unique_address_data_member(); + } + + KOKKOS_INLINE_FUNCTION + MemorySpace&& memory_space_instance() && { + return std::move(*this).no_unique_address_data_member(); + } +}; + +} // end namespace Impl +} // end namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_EBO_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.cpp b/packages/kokkos/core/src/impl/Kokkos_Error.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dfb9f3a51cdbd9aa7e189e21f5956806d53823b5 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Error.cpp @@ -0,0 +1,181 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> +#include <cstring> +#include <cstdlib> + +#include <ostream> +#include <sstream> +#include <iomanip> +#include <stdexcept> +#include <impl/Kokkos_Error.hpp> +#include <Cuda/Kokkos_Cuda_Error.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void host_abort(const char *const message) { + fwrite(message, 1, strlen(message), stderr); + fflush(stderr); + ::abort(); +} + +void throw_runtime_exception(const std::string &msg) { + std::ostringstream o; + o << msg; + traceback_callstack(o); + throw std::runtime_error(o.str()); +} + +std::string human_memory_size(size_t arg_bytes) { + double bytes = arg_bytes; + const double K = 1024; + const double M = K * 1024; + const double G = M * 1024; + + std::ostringstream out; + if (bytes < K) { + out << std::setprecision(4) << bytes << " B"; + } else if (bytes < M) { + bytes /= K; + out << std::setprecision(4) << bytes << " K"; + } else if (bytes < G) { + bytes /= M; + out << std::setprecision(4) << bytes << " M"; + } else { + bytes /= G; + out << std::setprecision(4) << bytes << " G"; + } + return out.str(); +} + +} // namespace Impl + +void Experimental::RawMemoryAllocationFailure::print_error_message( + std::ostream &o) const { + o << "Allocation of size " << Impl::human_memory_size(m_attempted_size); + o << " failed"; + switch (m_failure_mode) { + case FailureMode::OutOfMemoryError: + o << ", likely due to insufficient memory."; + break; + case FailureMode::AllocationNotAligned: + o << " because the allocation was improperly aligned."; + break; + case FailureMode::InvalidAllocationSize: + o << " because the requested allocation size is not a valid size for the" + " requested allocation mechanism (it's probably too large)."; + 
break; + // TODO move this to the subclass for Cuda-related things + case FailureMode::MaximumCudaUVMAllocationsExceeded: + o << " because the maximum Cuda UVM allocations was exceeded."; + break; + case FailureMode::Unknown: o << " because of an unknown error."; break; + } + o << " (The allocation mechanism was "; + switch (m_mechanism) { + case AllocationMechanism::StdMalloc: o << "standard malloc()."; break; + case AllocationMechanism::PosixMemAlign: o << "posix_memalign()."; break; + case AllocationMechanism::PosixMMap: o << "POSIX mmap()."; break; + case AllocationMechanism::IntelMMAlloc: + o << "the Intel _mm_malloc() intrinsic."; + break; + case AllocationMechanism::CudaMalloc: o << "cudaMalloc()."; break; + case AllocationMechanism::CudaMallocManaged: + o << "cudaMallocManaged()."; + break; + case AllocationMechanism::CudaHostAlloc: o << "cudaHostAlloc()."; break; + case AllocationMechanism::HIPMalloc: o << "hipMalloc()."; break; + case AllocationMechanism::HIPHostMalloc: o << "hipHostMalloc()."; break; + case AllocationMechanism::SYCLMallocDevice: + o << "sycl::malloc_device()."; + break; + case AllocationMechanism::SYCLMallocShared: + o << "sycl::malloc_shared()."; + break; + } + append_additional_error_information(o); + o << ")" << std::endl; +} + +std::string Experimental::RawMemoryAllocationFailure::get_error_message() + const { + std::ostringstream out; + print_error_message(out); + return out.str(); +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void traceback_callstack(std::ostream &msg) { + msg << std::endl << "Traceback functionality not available" << std::endl; +} + +} // namespace Impl + +#ifdef KOKKOS_ENABLE_CUDA +namespace Experimental { + +void CudaRawMemoryAllocationFailure::append_additional_error_information( + std::ostream &o) const { + if (m_error_code != 
cudaSuccess) { + o << " The Cuda allocation returned the error code \"" + << cudaGetErrorName(m_error_code) << "\"."; + } +} + +} // end namespace Experimental +#endif + +} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.hpp b/packages/kokkos/core/src/impl/Kokkos_Error.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5db459734631ddff5d0a29963a9ec04b9ec549ea --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Error.hpp @@ -0,0 +1,259 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_ERROR_HPP +#define KOKKOS_IMPL_ERROR_HPP + +#include <string> +#include <iosfwd> +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_ENABLE_CUDA +#include <Cuda/Kokkos_Cuda_abort.hpp> +#endif +#ifdef KOKKOS_ENABLE_HIP +#include <HIP/Kokkos_HIP_Abort.hpp> +#endif +#ifdef KOKKOS_ENABLE_SYCL +#include <SYCL/Kokkos_SYCL_Abort.hpp> +#endif + +#ifndef KOKKOS_ABORT_MESSAGE_BUFFER_SIZE +#define KOKKOS_ABORT_MESSAGE_BUFFER_SIZE 2048 +#endif // ifndef KOKKOS_ABORT_MESSAGE_BUFFER_SIZE + +namespace Kokkos { +namespace Impl { + +[[noreturn]] void host_abort(const char *const); + +void throw_runtime_exception(const std::string &); + +void traceback_callstack(std::ostream &); + +std::string human_memory_size(size_t arg_bytes); + +} // namespace Impl + +namespace Experimental { + +class RawMemoryAllocationFailure : public std::bad_alloc { + public: + enum class FailureMode { + OutOfMemoryError, + AllocationNotAligned, + InvalidAllocationSize, + MaximumCudaUVMAllocationsExceeded, + Unknown + }; + enum class AllocationMechanism { + StdMalloc, + PosixMemAlign, + PosixMMap, + IntelMMAlloc, + CudaMalloc, + CudaMallocManaged, + CudaHostAlloc, + HIPMalloc, + HIPHostMalloc, + SYCLMallocDevice, + SYCLMallocShared + }; + + private: + size_t m_attempted_size; + size_t m_attempted_alignment; + 
FailureMode m_failure_mode; + AllocationMechanism m_mechanism; + + public: + RawMemoryAllocationFailure( + size_t arg_attempted_size, size_t arg_attempted_alignment, + FailureMode arg_failure_mode = FailureMode::OutOfMemoryError, + AllocationMechanism arg_mechanism = + AllocationMechanism::StdMalloc) noexcept + : m_attempted_size(arg_attempted_size), + m_attempted_alignment(arg_attempted_alignment), + m_failure_mode(arg_failure_mode), + m_mechanism(arg_mechanism) {} + + RawMemoryAllocationFailure() noexcept = delete; + + RawMemoryAllocationFailure(RawMemoryAllocationFailure const &) noexcept = + default; + RawMemoryAllocationFailure(RawMemoryAllocationFailure &&) noexcept = default; + + RawMemoryAllocationFailure &operator =( + RawMemoryAllocationFailure const &) noexcept = default; + RawMemoryAllocationFailure &operator =( + RawMemoryAllocationFailure &&) noexcept = default; + + ~RawMemoryAllocationFailure() noexcept override = default; + + KOKKOS_ATTRIBUTE_NODISCARD + const char *what() const noexcept override { + if (m_failure_mode == FailureMode::OutOfMemoryError) { + return "Memory allocation error: out of memory"; + } else if (m_failure_mode == FailureMode::AllocationNotAligned) { + return "Memory allocation error: allocation result was under-aligned"; + } + + return nullptr; // unreachable + } + + KOKKOS_ATTRIBUTE_NODISCARD + size_t attempted_size() const noexcept { return m_attempted_size; } + + KOKKOS_ATTRIBUTE_NODISCARD + size_t attempted_alignment() const noexcept { return m_attempted_alignment; } + + KOKKOS_ATTRIBUTE_NODISCARD + AllocationMechanism allocation_mechanism() const noexcept { + return m_mechanism; + } + + KOKKOS_ATTRIBUTE_NODISCARD + FailureMode failure_mode() const noexcept { return m_failure_mode; } + + void print_error_message(std::ostream &o) const; + KOKKOS_ATTRIBUTE_NODISCARD + std::string get_error_message() const; + + virtual void append_additional_error_information(std::ostream &) const {} +}; + +} // end namespace Experimental + +} 
// namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) + +#if defined(__APPLE__) || defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) +// cuda_abort does not abort when building for macOS. +// required to workaround failures in random number generator unit tests with +// pre-volta architectures +#define KOKKOS_IMPL_ABORT_NORETURN +#else +// cuda_abort aborts when building for other platforms than macOS +#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] +#endif + +#elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__) +// HIP aborts +#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] +#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__) +// FIXME_SYCL SYCL doesn't abort +#define KOKKOS_IMPL_ABORT_NORETURN +#elif !defined(KOKKOS_ENABLE_OPENMPTARGET) +// Host aborts +#define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] +#else +// Everything else does not abort +#define KOKKOS_IMPL_ABORT_NORETURN +#endif + +namespace Kokkos { +KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort( + const char *const message) { +#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) + Kokkos::Impl::cuda_abort(message); +#elif defined(KOKKOS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__) + Kokkos::Impl::hip_abort(message); +#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__) + Kokkos::Impl::sycl_abort(message); +#elif !defined(KOKKOS_ENABLE_OPENMPTARGET) + Kokkos::Impl::host_abort(message); +#else + (void)message; // FIXME_OPENMPTARGET +#endif +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \ + defined(KOKKOS_ENABLE_DEBUG) +#define KOKKOS_EXPECTS(...) 
\ + { \ + if (!bool(__VA_ARGS__)) { \ + ::Kokkos::abort( \ + "Kokkos contract violation:\n " \ + " Expected precondition `" #__VA_ARGS__ "` evaluated false."); \ + } \ + } +#define KOKKOS_ENSURES(...) \ + { \ + if (!bool(__VA_ARGS__)) { \ + ::Kokkos::abort( \ + "Kokkos contract violation:\n " \ + " Ensured postcondition `" #__VA_ARGS__ "` evaluated false."); \ + } \ + } +// some projects already define this for themselves, so don't mess them up +#ifndef KOKKOS_ASSERT +#define KOKKOS_ASSERT(...) \ + { \ + if (!bool(__VA_ARGS__)) { \ + ::Kokkos::abort( \ + "Kokkos contract violation:\n " \ + " Asserted condition `" #__VA_ARGS__ "` evaluated false."); \ + } \ + } +#endif // ifndef KOKKOS_ASSERT +#else // not debug mode +#define KOKKOS_EXPECTS(...) +#define KOKKOS_ENSURES(...) +#ifndef KOKKOS_ASSERT +#define KOKKOS_ASSERT(...) +#endif // ifndef KOKKOS_ASSERT +#endif // end debug mode ifdefs + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1c337b9575fadaa34f4d97028eac92ea886fcf2d --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp @@ -0,0 +1,70 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <sstream> + +namespace Kokkos { +namespace Impl { +PerTeamValue::PerTeamValue(int arg) : value(arg) {} + +PerThreadValue::PerThreadValue(int arg) : value(arg) {} +} // namespace Impl + +Impl::PerTeamValue PerTeam(const int& arg) { return Impl::PerTeamValue(arg); } + +Impl::PerThreadValue PerThread(const int& arg) { + return Impl::PerThreadValue(arg); +} + +void team_policy_check_valid_storage_level_argument(int level) { + if (!(level == 0 || level == 1)) { + std::stringstream ss; + ss << "TeamPolicy::set_scratch_size(/*level*/ " << level + << ", ...) storage level argument must be 0 or 1 to be valid\n"; + Impl::throw_runtime_exception(ss.str()); + } +} + +} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a922e7e3f9b19d0413487674f847b539e9d4f10a --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp @@ -0,0 +1,66 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXEC_SPACE_INITIALIZER_HPP +#define KOKKOS_EXEC_SPACE_INITIALIZER_HPP + +#include <iosfwd> + +namespace Kokkos { +namespace Impl { + +class ExecSpaceInitializerBase { + public: + virtual void initialize(const InitArguments &args) = 0; + virtual void finalize(const bool all_spaces) = 0; + virtual void fence() = 0; + virtual void print_configuration(std::ostream &msg, const bool detail) = 0; + ExecSpaceInitializerBase() = default; + virtual ~ExecSpaceInitializerBase() = default; +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_EXEC_SPACE_INITIALIZER_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp b/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3068ef3db0389d48149d2d9ce28efac3112f1c27 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp @@ -0,0 +1,306 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP +#define KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Atomic.hpp> + +#include <Kokkos_PointerOwnership.hpp> +#include <impl/Kokkos_SimpleTaskScheduler.hpp> + +namespace Kokkos { +namespace Impl { + +template <class DeviceType, size_t Size, size_t Align = 1, + class SizeType = typename DeviceType::execution_space::size_type> +class FixedBlockSizeMemoryPool + : private MemorySpaceInstanceStorage<typename DeviceType::memory_space> { + public: + using memory_space = typename DeviceType::memory_space; + using size_type = SizeType; + + private: + using memory_space_storage_base = + MemorySpaceInstanceStorage<typename DeviceType::memory_space>; + using tracker_type = Kokkos::Impl::SharedAllocationTracker; + using record_type = 
Kokkos::Impl::SharedAllocationRecord<memory_space>;

  // Storage unit handed out by allocate(): the union plus alignas pads every
  // block to exactly `actual_size` bytes at alignment `Align`.
  struct alignas(Align) Block {
    union {
      char ignore;
      char data[Size];
    };
  };

  static constexpr auto actual_size = sizeof(Block);

  // TODO shared allocation tracker
  // TODO @optimization put the index values on different cache lines (CPU) or
  // pages (GPU)?

  tracker_type m_tracker = {};
  size_type m_num_blocks = 0;
  // Monotonically increasing "take" cursor: allocate() advances it atomically
  // and uses it modulo m_num_blocks to pick a slot in m_free_indices.
  size_type m_first_free_idx = 0;
  // Monotonically increasing "put" cursor: deallocate() advances it atomically.
  size_type m_last_free_idx = 0;
  Kokkos::OwningRawPtr<Block> m_first_block = nullptr;
  // Ring buffer of free block indices; a slot holds IndexInUse while the
  // corresponding block is checked out.
  Kokkos::OwningRawPtr<size_type> m_free_indices = nullptr;

  // Sentinel stored in an m_free_indices slot whose block is currently in use.
  enum : size_type { IndexInUse = ~size_type(0) };

 public:
  // Allocate backing storage for `num_blocks` fixed-size blocks plus the
  // free-index ring in `mem_space`, and mark every block free (0..num_blocks-1).
  FixedBlockSizeMemoryPool(memory_space const& mem_space, size_type num_blocks)
      : memory_space_storage_base(mem_space),
        m_tracker(),
        m_num_blocks(num_blocks),
        m_first_free_idx(0),
        m_last_free_idx(num_blocks) {
    // TODO alignment?
    auto block_record = record_type::allocate(
        mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block));
    KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0);
    m_tracker.assign_allocated_record_to_uninitialized(block_record);
    m_first_block = (Block*)block_record->data();

    // NOTE(review): this record holds the free-index array, yet the label also
    // says "_blocks" (and is inconsistent with the un-prefixed label above) --
    // confirm whether "..._indices" was intended.
    auto idx_record =
        record_type::allocate(mem_space, "Kokkos::FixedBlockSizeMemPool_blocks",
                              num_blocks * sizeof(size_type));
    KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0);
    m_tracker.assign_allocated_record_to_uninitialized(idx_record);
    m_free_indices = (size_type*)idx_record->data();

    // Initially every block index is available.
    for (size_type i = 0; i < num_blocks; ++i) {
      m_free_indices[i] = i;
    }

    Kokkos::memory_fence();
  }

  // For compatibility with MemoryPool<>: capacity is given in bytes and the
  // three unsigned tuning parameters of MemoryPool<> are ignored.
  FixedBlockSizeMemoryPool(memory_space const& mem_space,
                           size_t mempool_capacity, unsigned, unsigned,
                           unsigned)
      : FixedBlockSizeMemoryPool(
            mem_space, mempool_capacity /
                           actual_size) { /* forwarding ctor, must be empty */
  }

  KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default;
  KOKKOS_DEFAULTED_FUNCTION
  FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool&&) = default;
  KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(
      FixedBlockSizeMemoryPool const&) = default;
  KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(
      FixedBlockSizeMemoryPool&&) = default;
  KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(
      FixedBlockSizeMemoryPool const&) = default;

  // Claim one fixed-size block; returns nullptr when the claimed ring slot is
  // already checked out (its slot holds IndexInUse). `alloc_size` must not
  // exceed the compile-time block Size and is otherwise unused.
  KOKKOS_INLINE_FUNCTION
  void* allocate(size_type alloc_size) const noexcept {
    (void)alloc_size;
    KOKKOS_EXPECTS(alloc_size <= Size);
    // Atomically advance the take cursor to reserve a ring slot.
    auto free_idx_counter = Kokkos::atomic_fetch_add(
        (volatile size_type*)&m_first_free_idx, size_type(1));
    auto free_idx_idx = free_idx_counter % m_num_blocks;

    // We don't have exclusive access to m_free_indices[free_idx_idx] because
    // the allocate counter might have lapped us since we incremented it
    auto current_free_idx = m_free_indices[free_idx_idx];
    size_type free_idx = IndexInUse;
    // atomic_compare_exchange returns the slot's previous contents: either the
    // block index we claimed, or IndexInUse if the slot was already taken.
    free_idx = Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx],
                                               current_free_idx, free_idx);
    Kokkos::memory_fence();

    // TODO figure out how to decrement here?

    if (free_idx == IndexInUse) {
      return nullptr;
    } else {
      return (void*)&m_first_block[free_idx];
    }
  }

  // Return `ptr` (previously obtained from allocate()) to the pool by pushing
  // its block index onto the ring at the atomically advanced put cursor.
  KOKKOS_INLINE_FUNCTION
  void deallocate(void* ptr, size_type /*alloc_size*/) const noexcept {
    // figure out which block we are
    auto offset = intptr_t(ptr) - intptr_t(m_first_block);

    KOKKOS_EXPECTS(offset % actual_size == 0 &&
                   offset / actual_size < m_num_blocks);

    Kokkos::memory_fence();
    auto last_idx_idx = Kokkos::atomic_fetch_add(
        (volatile size_type*)&m_last_free_idx, size_type(1));
    last_idx_idx %= m_num_blocks;
    m_free_indices[last_idx_idx] = offset / actual_size;
  }
};

// Everything below up to the matching #endif is a disabled prototype (never
// compiled) named FixedBlockSizeChaseLevMemoryPool, kept for reference only.
#if 0
template <
  class DeviceType,
  size_t Size,
  size_t Align=1,
  class SizeType = typename DeviceType::execution_space::size_type
>
class FixedBlockSizeChaseLevMemoryPool
  : private MemorySpaceInstanceStorage<typename DeviceType::memory_space>
{
public:

  using memory_space = typename DeviceType::memory_space;
  using size_type = SizeType;

private:

  using memory_space_storage_base = MemorySpaceInstanceStorage<typename DeviceType::memory_space>;
  using tracker_type = Kokkos::Impl::SharedAllocationTracker;
  using record_type = Kokkos::Impl::SharedAllocationRecord<memory_space>;

  struct alignas(Align) Block { union { char ignore; char data[Size]; }; };

  static constexpr auto actual_size = sizeof(Block);

  tracker_type m_tracker = { };
  size_type m_num_blocks = 0;
  size_type m_first_free_idx = 0;
  size_type m_last_free_idx = 0;


  enum : size_type { IndexInUse = ~size_type(0) };

public:

  FixedBlockSizeMemoryPool(
    memory_space const& mem_space,
    size_type num_blocks
  ) : memory_space_storage_base(mem_space),
      m_tracker(),
      m_num_blocks(num_blocks),
      m_first_free_idx(0),
      m_last_free_idx(num_blocks)
  {
    // TODO alignment?
// (continuation of the disabled #if 0 prototype constructor begun above)
    auto block_record = record_type::allocate(
      mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block)
    );
    KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0);
    m_tracker.assign_allocated_record_to_uninitialized(block_record);
    m_first_block = (Block*)block_record->data();

    auto idx_record = record_type::allocate(
      mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(size_type)
    );
    KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0);
    m_tracker.assign_allocated_record_to_uninitialized(idx_record);
    m_free_indices = (size_type*)idx_record->data();

    for(size_type i = 0; i < num_blocks; ++i) {
      m_free_indices[i] = i;
    }

    Kokkos::memory_fence();
  }

  // For compatibility with MemoryPool<>
  FixedBlockSizeMemoryPool(
    memory_space const& mem_space,
    size_t mempool_capacity,
    unsigned, unsigned, unsigned
  ) : FixedBlockSizeMemoryPool(mem_space, mempool_capacity / actual_size)
  { /* forwarding ctor, must be empty */ }

  KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default;
  KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool&&) = default;
  KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool const&) = default;
  KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool&&) = default;
  KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool const&) = default;


  KOKKOS_INLINE_FUNCTION
  void* allocate(size_type alloc_size) const noexcept
  {
    KOKKOS_EXPECTS(alloc_size <= Size);
    auto free_idx_counter = Kokkos::atomic_fetch_add((volatile size_type*)&m_first_free_idx, size_type(1));
    auto free_idx_idx = free_idx_counter % m_num_blocks;

    // We don't have exclusive access to m_free_indices[free_idx_idx] because
    // the allocate counter might have lapped us since we incremented it
    auto current_free_idx = m_free_indices[free_idx_idx];
    size_type free_idx = IndexInUse;

    free_idx =
      Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], current_free_idx, free_idx);
    Kokkos::memory_fence();

    // TODO figure out how to decrement here?

    if(free_idx == IndexInUse) {
      return nullptr;
    }
    else {
      return (void*)&m_first_block[free_idx];
    }
  }

  KOKKOS_INLINE_FUNCTION
  void deallocate(void* ptr, size_type alloc_size) const noexcept
  {
    // figure out which block we are
    auto offset = intptr_t(ptr) - intptr_t(m_first_block);

    KOKKOS_EXPECTS(offset % actual_size == 0 && offset/actual_size < m_num_blocks);

    Kokkos::memory_fence();
    auto last_idx_idx = Kokkos::atomic_fetch_add((volatile size_type*)&m_last_free_idx, size_type(1));
    last_idx_idx %= m_num_blocks;
    m_free_indices[last_idx_idx] = offset / actual_size;
  }

};
#endif

}  // end namespace Impl
}  // end namespace Kokkos

#endif  // KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..22e88ebc4fc57d4e7132bca0be2aa55f5bfc5f69
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@@ -0,0 +1,2197 @@
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2.
Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_FUNCTORADAPTER_HPP
#define KOKKOS_FUNCTORADAPTER_HPP

#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// Detects whether FunctorType declares a member named `init` (reduction
// initializer). Primary template: not detected.
template <class FunctorType, class Enable = void>
struct ReduceFunctorHasInit {
  enum : bool { value = false };
};

// The else clause idiom failed with NVCC+MSVC, causing some symbols not being
// compiled for the device. The code in there is anyway sketchy, and likely not
// standard compliant (just happens to work on all compilers we ever used)
// We intend to replace all of this long term with proper detection idiom.
#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_WINDOWS_CUDA)
// void_t-style detection used on MSVC / Windows-CUDA builds.
template <class>
using impl_void_t_workaround = void;

template <class F>
using init_archetype = decltype(&F::init);

// Selected when &FunctorType::init is well-formed.
template <class FunctorType>
struct ReduceFunctorHasInit<
    FunctorType, impl_void_t_workaround<init_archetype<FunctorType>>> {
  enum : bool { value = true };
};
#else
// Selected when &FunctorType::init is well-formed (sizeof-SFINAE idiom).
template <class FunctorType>
struct ReduceFunctorHasInit<
    FunctorType,
    typename std::enable_if<0 < sizeof(&FunctorType::init)>::type> {
  enum : bool { value = true };
};
#endif

// Detects whether FunctorType declares a member named `join`.
template <class FunctorType, class Enable = void>
struct ReduceFunctorHasJoin {
  enum : bool { value = false };
};

#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_WINDOWS_CUDA)
template <class F>
using join_archetype = decltype(&F::join);

template <class FunctorType>
struct ReduceFunctorHasJoin<
    FunctorType, impl_void_t_workaround<join_archetype<FunctorType>>> {
  enum : bool { value = true };
};
#else
template <class FunctorType>
struct ReduceFunctorHasJoin<
    FunctorType,
    typename std::enable_if<0 < sizeof(&FunctorType::join)>::type> {
  enum : bool { value = true };
};
#endif

// Detects whether FunctorType declares a member named `final`.
template <class FunctorType, class Enable = void>
struct ReduceFunctorHasFinal {
  enum : bool { value = false };
};

#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_WINDOWS_CUDA)
template <class F>
using final_archetype = decltype(&F::final);

template <class FunctorType>
struct ReduceFunctorHasFinal<
    FunctorType, impl_void_t_workaround<final_archetype<FunctorType>>> {
  enum : bool { value = true };
};
#else
template <class FunctorType>
struct ReduceFunctorHasFinal<
    FunctorType,
    typename std::enable_if<0 < sizeof(&FunctorType::final)>::type> {
  enum : bool { value = true };
};
#endif

// Detects whether FunctorType declares a member named `team_shmem_size`.
template <class FunctorType, class Enable = void>
struct ReduceFunctorHasShmemSize {
  enum : bool { value = false };
};

#if defined(KOKKOS_COMPILER_MSVC) || defined(KOKKOS_IMPL_WINDOWS_CUDA)
template <class F>
using shmemsize_archetype = decltype(&F::team_shmem_size);

template <class FunctorType>
struct ReduceFunctorHasShmemSize<
    FunctorType, impl_void_t_workaround<shmemsize_archetype<FunctorType>>> {
  enum : bool { value = true };
};
#else
template <class FunctorType>
struct ReduceFunctorHasShmemSize<
    FunctorType,
    typename std::enable_if<0 < sizeof(&FunctorType::team_shmem_size)>::type> {
  enum : bool { value = true };
};
#endif

// True iff FunctorType declares a nested `value_type` (void_t detection).
template <class FunctorType, class ArgTag, class Enable = void>
struct FunctorDeclaresValueType : public std::false_type {};

template <class FunctorType, class ArgTag>
struct FunctorDeclaresValueType<FunctorType, ArgTag,
                                void_t<typename FunctorType::value_type>>
    : public std::true_type {};

// True iff the functor declares value_type or any of init/join/final/
// team_shmem_size, i.e. it customizes the reduction beyond a plain operator().
template <class FunctorType,
          bool Enable = (FunctorDeclaresValueType<FunctorType, void>::value) ||
                        (ReduceFunctorHasInit<FunctorType>::value) ||
                        (ReduceFunctorHasJoin<FunctorType>::value) ||
                        (ReduceFunctorHasFinal<FunctorType>::value) ||
                        (ReduceFunctorHasShmemSize<FunctorType>::value)>
struct IsNonTrivialReduceFunctor {
  enum : bool { value = false };
};

template <class FunctorType>
struct IsNonTrivialReduceFunctor<FunctorType, true> {
  enum : bool { value = true };
};

/** \brief Query Functor and execution policy argument tag for value type.
 *
 * If C++11 enabled and 'value_type' is not explicitly declared then attempt
 * to deduce the type from FunctorType::operator().
*/
template <class FunctorType, class ArgTag,
          bool Dec = FunctorDeclaresValueType<FunctorType, ArgTag>::value>
struct FunctorValueTraits {
  // Fallback: no value information available -- all types void, zero size.
  using value_type     = void;
  using pointer_type   = void;
  using reference_type = void;
  using functor_type   = void;

  enum { StaticValueSize = 0 };

  KOKKOS_FORCEINLINE_FUNCTION static unsigned value_count(const FunctorType&) {
    return 0;
  }

  KOKKOS_FORCEINLINE_FUNCTION static unsigned value_size(const FunctorType&) {
    return 0;
  }
};

// Specialization for FunctorType == void: no functor, no value information.
template <class ArgTag>
struct FunctorValueTraits<void, ArgTag, false> {
  using value_type     = void;
  using pointer_type   = void;
  using reference_type = void;
  using functor_type   = void;
};

/** \brief FunctorType::value_type is explicitly declared so use it.
 *
 * Two options for declaration
 *
 *   1) A plain-old-data (POD) type
 *        using value_type = {pod_type};
 *
 *   2) An array of POD of a runtime specified count.
 *        using value_type = {pod_type}[];
 *        const unsigned value_count ;
 */
template <class FunctorType, class ArgTag>
struct FunctorValueTraits<FunctorType, ArgTag,
                          true /* == exists FunctorType::value_type */> {
  // Strip the extent so an array declaration yields its element type.
  using value_type =
      typename std::remove_extent<typename FunctorType::value_type>::type;
  using functor_type = FunctorType;

  static_assert((sizeof(value_type) < sizeof(int)) ||
                    0 == (sizeof(value_type) % sizeof(int)),
                "Reduction functor's declared value_type requires: 0 == "
                "sizeof(value_type) % sizeof(int)");

  /* this cast to bool is needed for correctness by NVCC */
  enum : bool {
    IsArray = static_cast<bool>(
        std::is_array<typename FunctorType::value_type>::value)
  };

  // If not an array then what is the sizeof(value_type)
  // (an array's length is only known at run time through the functor's
  // `value_count` member, so the static size is 0 in that case).
  enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) };

  using pointer_type = value_type*;

  // The reference_type for an array is 'value_type *'
  // The reference_type for a single value is 'value_type &'

  using reference_type = std::conditional_t<IsArray, value_type*, value_type&>;

  // Number of values if single value
  template <class F>
  KOKKOS_FORCEINLINE_FUNCTION static
      typename std::enable_if<std::is_same<F, FunctorType>::value && !IsArray,
                              unsigned>::type
      value_count(const F&) {
    return 1;
  }

  // Number of values if an array, protect via templating because
  // 'f.value_count' will only exist when the functor declares the value_type to
  // be an array.
  template <class F>
  KOKKOS_FORCEINLINE_FUNCTION static
      typename std::enable_if<std::is_same<F, FunctorType>::value && IsArray,
                              unsigned>::type
      value_count(const F& f) {
    return f.value_count;
  }

  // Total size of the value
  KOKKOS_INLINE_FUNCTION static unsigned value_size(const FunctorType& f) {
    return value_count(f) * sizeof(value_type);
  }
};

// Specialization for functors that do NOT declare value_type: the value type
// is deduced from the signature of FunctorType::operator() via the
// deduce_reduce_type overload set that follows (continued past this region).
template <class FunctorType, class ArgTag>
struct FunctorValueTraits<FunctorType, ArgTag,
                          false /* == exists FunctorType::value_type */
                          > {
 private:
  struct VOIDTAG {
  };  // Allow declaration of non-matching operator() with void argument tag.
  struct REJECTTAG {
  };  // Reject tagged operator() when using non-tagged execution policy.
+ + using tag_type = + std::conditional_t<std::is_same<ArgTag, void>::value, VOIDTAG, ArgTag>; + + //---------------------------------------- + // parallel_for operator without a tag: + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember) + const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember, + ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember, ArgMember) + const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const ArgMember&) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const ArgMember&, const ArgMember&) const) {} + + 
template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&, + const ArgMember&) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(TagType, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember) const) {} + + template <class 
TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, + ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember, ArgMember, + ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&, + const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&, + const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&) const) {} + + template <class TagType, class ArgMember> 
+ KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const TagType&, ArgMember, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember, + ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember, + ArgMember, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void 
(FunctorType::*)(const TagType&, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const TagType&, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember, ArgMember, + ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&, + const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&, + const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&) const) {} + + template <class TagType, class 
ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&) const) {} + + template <class TagType, class ArgMember> + KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type( + VOIDTAG, + void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&, + const ArgMember&, const ArgMember&) const) {} + + //---------------------------------------- + // parallel_for operator with a tag: + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(tag_type, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, + void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember, + ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(tag_type, ArgMember, 
ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember, + ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(const tag_type&, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, + void (FunctorType::*)(const tag_type&, ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, + ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, + ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, + void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type( + tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, + ArgMember, ArgMember, ArgMember, + ArgMember, ArgMember) const) {} + + template <class ArgMember> + KOKKOS_INLINE_FUNCTION static VOIDTAG 
deduce_reduce_type(
    tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
                                    ArgMember, ArgMember, ArgMember, ArgMember,
                                    ArgMember, ArgMember) const) {}

  // NOTE(review): every deduce_reduce_type overload below is referenced only
  // inside an unevaluated decltype(...) -- see 'using ValueType' near the end
  // of this class -- so the empty bodies are intentional: these functions are
  // never actually called.  The *return type* encodes the deduction result:
  //   VOIDTAG   : operator() takes no trailing reduction argument
  //               (parallel_for-style signature);
  //   T         : operator() has a trailing 'T&' (optionally followed by a
  //               bool for scans), so T is the reduction/scan value type;
  //   REJECTTAG : a templated-tag operator() probed with VOIDTAG, i.e. a
  //               signature that must not participate for this tag.
  // VOIDTAG / REJECTTAG / tag_type are declared in the class head, above this
  // chunk -- TODO(review): confirm their definitions there.
  // The overloads are spelled out per arity (1..8 members) and per cv/ref
  // form because deduction requires an exact pointer-to-member-function type
  // match; a variadic-pack version would not match exactly.

  // Tagged operator() without a reduction argument, members by const&:
  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, const ArgMember&,
                                      const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&) const) {}

  // Same, tag received by const reference:
  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type, void (FunctorType::*)(const tag_type&, const ArgMember&,
                                      const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&) const) {}

  template <class ArgMember>
  KOKKOS_INLINE_FUNCTION static VOIDTAG deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&) const) {}

  //----------------------------------------
  // parallel_reduce operator without a tag:

  // Members by value, trailing 'T&' reduction argument -> value type is T.
  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(ArgMember, ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
                                     T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
                                     ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
                                     ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
                            ArgMember, ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(ArgMember, ArgMember, ArgMember, ArgMember,
                                     ArgMember, ArgMember, ArgMember, ArgMember,
                                     T&) const) {}

  // Members by const reference:
  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const ArgMember&, const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&,
                                     const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&,
                                     const ArgMember&, const ArgMember&,
                                     const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const ArgMember&, const ArgMember&,
                                     const ArgMember&, const ArgMember&,
                                     const ArgMember&, const ArgMember&,
                                     const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  // Tagged reduce operator probed with VOIDTAG -> rejected (REJECTTAG).
  // TagType by value, members by value:
  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, T&) const) {
  }

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
                                     T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
                                     ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
                                     ArgMember, ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, ArgMember,
                            ArgMember, ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember, ArgMember,
                            ArgMember, ArgMember, ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, ArgMember, ArgMember,
                                     ArgMember, ArgMember, ArgMember, ArgMember,
                                     ArgMember, T&) const) {}

  // TagType by value, members by const reference:
  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&,
                                     const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  // TagType by const reference, members by value:
  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, ArgMember, ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
                                     ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
                                     ArgMember, ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, ArgMember, ArgMember, ArgMember,
                            ArgMember, ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, ArgMember, ArgMember, ArgMember,
                            ArgMember, ArgMember, ArgMember, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
                                     ArgMember, ArgMember, ArgMember, ArgMember,
                                     T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, ArgMember,
                                     ArgMember, ArgMember, ArgMember, ArgMember,
                                     ArgMember, ArgMember, T&) const) {}

  // TagType by const reference, members by const reference:
  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&,
                                     const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  //----------------------------------------
  // parallel_reduce operator with a tag:

  // tag_type by value, members by value:
  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
                                      T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
                                      ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
                                      ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
                            ArgMember, ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
                                      ArgMember, ArgMember, ArgMember,
                                      ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, ArgMember, ArgMember, ArgMember,
                                      ArgMember, ArgMember, ArgMember,
                                      ArgMember, ArgMember, T&) const) {}

  // tag_type by const reference, members by value:
  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
                                      ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
                                      ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, ArgMember,
                            ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, ArgMember,
                            ArgMember, ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, ArgMember,
                                      ArgMember, ArgMember, ArgMember,
                                      ArgMember, ArgMember, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, ArgMember, ArgMember, ArgMember,
                            ArgMember, ArgMember, ArgMember, ArgMember,
                            ArgMember, T&) const) {}

  // tag_type by value, members by const reference:
  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, const ArgMember&,
                                      const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  // tag_type by const reference, members by const reference:
  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(const tag_type&, const ArgMember&,
                                      const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, T&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&,
                            const ArgMember&, const ArgMember&, T&) const) {}

  //----------------------------------------
  // parallel_scan operator without a tag:

  // Scan form carries a trailing 'bool final' flag after the 'T&' value.
  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(ArgMember, T&, bool) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const ArgMember&, T&, bool) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, ArgMember, T&, bool) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, const ArgMember&, T&, bool) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(const TagType&, ArgMember, T&, bool) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&, T&, bool)
                   const) {}

  // 'const bool&' variants of the final-pass flag:
  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(ArgMember, T&, const bool&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const ArgMember&, T&, const bool&) const) {
  }

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG,
      void (FunctorType::*)(TagType, ArgMember, T&, const bool&) const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(TagType, const ArgMember&, T&, const bool&)
                   const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const TagType&, ArgMember, T&, const bool&)
                   const) {}

  template <class TagType, class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static REJECTTAG deduce_reduce_type(
      VOIDTAG, void (FunctorType::*)(const TagType&, const ArgMember&, T&,
                                     const bool&) const) {}
  //----------------------------------------
  // parallel_scan operator with a tag:

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, ArgMember, T&, bool) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(const tag_type&, ArgMember, T&, bool) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, const ArgMember&, T&, bool) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(const tag_type&, const ArgMember&, T&,
                                      bool) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type,
      void (FunctorType::*)(tag_type, ArgMember, T&, const bool&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(const tag_type&, ArgMember, T&,
                                      const bool&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(tag_type, const ArgMember&, T&,
                                      const bool&) const) {}

  template <class ArgMember, class T>
  KOKKOS_INLINE_FUNCTION static T deduce_reduce_type(
      tag_type, void (FunctorType::*)(const tag_type&, const ArgMember&, T&,
                                      const bool&) const) {}
  //----------------------------------------

  // The single unevaluated call that performs the deduction against the
  // functor's actual operator() signature.
  using ValueType =
      decltype(deduce_reduce_type(tag_type(), &FunctorType::operator()));

  enum { IS_VOID = std::is_same<VOIDTAG, ValueType>::value };
  enum { IS_REJECT = std::is_same<REJECTTAG, ValueType>::value };

 public:
  // When no reduction value was deduced (VOIDTAG/REJECTTAG) the public
  // aliases collapse to 'void'.
  using value_type = std::conditional_t<IS_VOID || IS_REJECT, void, ValueType>;
  using pointer_type =
      std::conditional_t<IS_VOID || IS_REJECT, void, ValueType*>;
  using reference_type =
      std::conditional_t<IS_VOID || IS_REJECT, void, ValueType&>;
  using functor_type = FunctorType;

  static_assert(
      IS_VOID || IS_REJECT || 0 == (sizeof(ValueType) % sizeof(int)),
      "Reduction functor's value_type deduced from functor::operator() "
      "requires: 0 == sizeof(value_type) % sizeof(int)");

  // Compile-time size of the reduction value; 0 when there is none.
  enum { StaticValueSize = IS_VOID || IS_REJECT ? 0 : sizeof(ValueType) };

  KOKKOS_FORCEINLINE_FUNCTION static unsigned value_size(const FunctorType&) {
    return StaticValueSize;
  }

  KOKKOS_FORCEINLINE_FUNCTION static unsigned value_count(const FunctorType&) {
    return IS_VOID || IS_REJECT ? 0 : 1;
  }
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

/** Function signatures for FunctorType::init function with a tag.
 * reference_type is 'value_type &' for scalar and 'value_type *' for array.
 */
// Overload set referenced only inside an unevaluated
// decltype(enable_if(&FunctorType::init)) (see FunctorValueInit below) to
// detect whether the functor declares a compatible tagged 'init'.
template <class FunctorType, class ArgTag>
struct FunctorValueInitFunction {
  using reference_type =
      typename FunctorValueTraits<FunctorType, ArgTag>::reference_type;

  // Accepted forms: const member function or free-function-style signature,
  // with the tag taken by value or by const reference.
  KOKKOS_INLINE_FUNCTION static void enable_if(
      void (FunctorType::*)(ArgTag, reference_type) const);
  KOKKOS_INLINE_FUNCTION static void enable_if(
      void (FunctorType::*)(ArgTag const&, reference_type) const);
  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag,
                                                        reference_type));
  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                        reference_type));
};

/** Function signatures for FunctorType::init function without a tag.
 * reference_type is 'value_type &' for scalar and 'value_type *' for array.
 */
template <class FunctorType>
struct FunctorValueInitFunction<FunctorType, void> {
  using reference_type =
      typename FunctorValueTraits<FunctorType, void>::reference_type;

  KOKKOS_INLINE_FUNCTION static void enable_if(
      void (FunctorType::*)(reference_type) const);
  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(reference_type));
};

// Adapter for value initialization function.
// If a proper FunctorType::init is declared then use it,
// otherwise use default constructor.
template <class FunctorType, class ArgTag,
          class T = typename FunctorValueTraits<FunctorType, ArgTag>::
              reference_type  // FIXME Fix FunctorValueTraits for multi-dim
                              // operator
          ,
          class Enable = void>
struct FunctorValueInit;

/* No 'init' function provided for single value */
// Fallback: placement default-construct a single T in the scratch memory 'p'.
template <class FunctorType, class ArgTag, class T, class Enable>
struct FunctorValueInit<FunctorType, ArgTag, T&, Enable> {
  KOKKOS_FORCEINLINE_FUNCTION static T& init(const FunctorType&, void* p) {
    return *(new (p) T());
  };  // NOTE(review): stray ';' after the function body -- harmless.
};

/* No 'init' function provided for array value */
// Fallback: placement default-construct value_count(f) elements at 'p'.
template <class FunctorType, class ArgTag, class T, class Enable>
struct FunctorValueInit<FunctorType, ArgTag, T*, Enable> {
  KOKKOS_FORCEINLINE_FUNCTION static T* init(const FunctorType& f, void* p) {
    const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f);
    for (int i = 0; i < n; ++i) {
      new (((T*)p) + i) T();
    }
    return (T*)p;
  }
};

/* 'init' function provided for single value */
template <class FunctorType, class T>
struct FunctorValueInit<
    FunctorType, void,
    T&
    // First substitution failure when FunctorType::init does not exist.
    // Second substitution failure when FunctorType::init is not compatible.
    ,
    decltype(FunctorValueInitFunction<FunctorType, void>::enable_if(
        &FunctorType::init))> {
  KOKKOS_FORCEINLINE_FUNCTION static T& init(const FunctorType& f, void* p) {
    f.init(*((T*)p));
    return *((T*)p);
  }
};

/* 'init' function provided for array value */
template <class FunctorType, class T>
struct FunctorValueInit<
    FunctorType, void,
    T*
    // First substitution failure when FunctorType::init does not exist.
    // Second substitution failure when FunctorType::init is not compatible
    ,
    decltype(FunctorValueInitFunction<FunctorType, void>::enable_if(
        &FunctorType::init))> {
  KOKKOS_FORCEINLINE_FUNCTION static T* init(const FunctorType& f, void* p) {
    f.init((T*)p);
    return (T*)p;
  }
};

/* 'init' function provided for single value */
// Tagged variant: only participates when ArgTag is not void.
template <class FunctorType, class ArgTag, class T>
struct FunctorValueInit<
    FunctorType, ArgTag,
    T&
    // First substitution failure when FunctorType::init does not exist.
    // Second substitution failure when FunctorType::init is not compatible.
    ,
    typename std::enable_if<
        !std::is_same<ArgTag, void>::value,
        decltype(FunctorValueInitFunction<FunctorType, ArgTag>::enable_if(
            &FunctorType::init))>::type> {
  KOKKOS_FORCEINLINE_FUNCTION static T& init(const FunctorType& f, void* p) {
    f.init(ArgTag(), *((T*)p));
    return *((T*)p);
  }
};

/* 'init' function provided for array value */
// Tagged variant: only participates when ArgTag is not void.
template <class FunctorType, class ArgTag, class T>
struct FunctorValueInit<
    FunctorType, ArgTag,
    T*
    // First substitution failure when FunctorType::init does not exist.
    // Second substitution failure when FunctorType::init is not compatible
    ,
    typename std::enable_if<
        !std::is_same<ArgTag, void>::value,
        decltype(FunctorValueInitFunction<FunctorType, ArgTag>::enable_if(
            &FunctorType::init))>::type> {
  KOKKOS_FORCEINLINE_FUNCTION static T* init(const FunctorType& f, void* p) {
    f.init(ArgTag(), (T*)p);
    return (T*)p;
  }
};

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// Signatures for compatible FunctorType::join with tag and not an array
// NOTE(review): IsArray defaults to (0 == StaticValueSize), i.e. a runtime
// sized (array) reduction value has no static size.
template <class FunctorType, class ArgTag,
          bool IsArray =
              0 == FunctorValueTraits<FunctorType, ArgTag>::StaticValueSize>
struct FunctorValueJoinFunction {
  using value_type =
      typename FunctorValueTraits<FunctorType, ArgTag>::value_type;

  using vref_type = volatile value_type&;
  using cvref_type = const volatile value_type&;

  KOKKOS_INLINE_FUNCTION static void enable_if(
      void (FunctorType::*)(ArgTag, vref_type, cvref_type) const);
  KOKKOS_INLINE_FUNCTION static void enable_if(
      void (FunctorType::*)(ArgTag const&, vref_type, cvref_type) const);
  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, vref_type,
                                                        cvref_type));
  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                        vref_type, cvref_type));
};

// Signatures for compatible FunctorType::join with tag and is an array
template <class FunctorType, class ArgTag>
struct FunctorValueJoinFunction<FunctorType, ArgTag, true> {
  using value_type =
      typename FunctorValueTraits<FunctorType, ArgTag>::value_type;

  using vptr_type = volatile value_type*;
  using cvptr_type = const volatile value_type*;

  KOKKOS_INLINE_FUNCTION static void enable_if(
      void (FunctorType::*)(ArgTag, vptr_type, cvptr_type) const);
  KOKKOS_INLINE_FUNCTION static void enable_if(
      void 
(FunctorType::*)(ArgTag const&, vptr_type, cvptr_type) const);
  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, vptr_type,
                                                        cvptr_type));
  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                        vptr_type, cvptr_type));
};

// Signatures for compatible FunctorType::join without tag and not an array
template <class FunctorType>
struct FunctorValueJoinFunction<FunctorType, void, false> {
  using value_type = typename FunctorValueTraits<FunctorType, void>::value_type;

  using vref_type = volatile value_type&;
  using cvref_type = const volatile value_type&;

  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)(vref_type,
                                                                     cvref_type)
                                                   const);
  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(vref_type, cvref_type));
};

// Signatures for compatible FunctorType::join without tag and is an array
template <class FunctorType>
struct FunctorValueJoinFunction<FunctorType, void, true> {
  using value_type = typename FunctorValueTraits<FunctorType, void>::value_type;

  using vptr_type = volatile value_type*;
  using cvptr_type = const volatile value_type*;

  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)(vptr_type,
                                                                     cvptr_type)
                                                   const);
  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(vptr_type, cvptr_type));
};

// Adapter that joins (combines) two reduction values.  If the functor
// declares a compatible 'join' it is used, otherwise operator+= (sum) is the
// default.  The volatile overloads exist for joins through volatile-qualified
// scratch memory.
template <class FunctorType, class ArgTag,
          class T =
              typename FunctorValueTraits<FunctorType, ArgTag>::reference_type,
          class Enable = void>
struct FunctorValueJoin;

/* No 'join' function provided, single value */
template <class FunctorType, class ArgTag, class T, class Enable>
struct FunctorValueJoin<FunctorType, ArgTag, T&, Enable> {
  KOKKOS_FORCEINLINE_FUNCTION
  FunctorValueJoin(const FunctorType&) {}

  // Default join: sum lhs += rhs through type-erased pointers.
  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& /*f*/,
                                               volatile void* const lhs,
                                               const volatile void* const rhs) {
    *((volatile T*)lhs) += *((const volatile T*)rhs);
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(volatile T& lhs, const volatile T& rhs) const { lhs += rhs; }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(T& lhs, const T& rhs) const { lhs += rhs; }
};

/* No 'join' function provided, array of values */
template <class FunctorType, class ArgTag, class T, class Enable>
struct FunctorValueJoin<FunctorType, ArgTag, T*, Enable> {
  const FunctorType& f;  // kept to query the runtime value_count

  KOKKOS_FORCEINLINE_FUNCTION
  FunctorValueJoin(const FunctorType& f_) : f(f_) {}

  // Default join: element-wise sum over value_count(f) entries.
  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
                                               volatile void* const lhs,
                                               const volatile void* const rhs) {
    const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f_);

    for (int i = 0; i < n; ++i) {
      ((volatile T*)lhs)[i] += ((const volatile T*)rhs)[i];
    }
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(volatile T* const lhs, const volatile T* const rhs) const {
    const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f);

    for (int i = 0; i < n; ++i) {
      lhs[i] += rhs[i];
    }
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(T* lhs, const T* rhs) const {
    const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f);

    for (int i = 0; i < n; ++i) {
      lhs[i] += rhs[i];
    }
  }
};

/* 'join' function provided, single value */
template <class FunctorType, class ArgTag, class T>
struct FunctorValueJoin<
    FunctorType, ArgTag,
    T&
    // First substitution failure when FunctorType::join does not exist.
    // Second substitution failure when enable_if( & Functor::join ) does not
    // exist
    ,
    decltype(FunctorValueJoinFunction<FunctorType, ArgTag>::enable_if(
        &FunctorType::join))> {
  const FunctorType& f;

  KOKKOS_FORCEINLINE_FUNCTION
  FunctorValueJoin(const FunctorType& f_) : f(f_) {}

  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
                                               volatile void* const lhs,
                                               const volatile void* const rhs) {
    f_.join(ArgTag(), *((volatile T*)lhs), *((const volatile T*)rhs));
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(volatile T& lhs, const volatile T& rhs) const {
    f.join(ArgTag(), lhs, rhs);
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(T& lhs, const T& rhs) const { f.join(ArgTag(), lhs, rhs); }
};

/* 'join' function provided, no tag, single value */
template <class FunctorType, class T>
struct FunctorValueJoin<
    FunctorType, void,
    T&
    // First substitution failure when FunctorType::join does not exist.
    // Second substitution failure when enable_if( & Functor::join ) does not
    // exist
    ,
    decltype(FunctorValueJoinFunction<FunctorType, void>::enable_if(
        &FunctorType::join))> {
  const FunctorType& f;

  KOKKOS_FORCEINLINE_FUNCTION
  FunctorValueJoin(const FunctorType& f_) : f(f_) {}

  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
                                               volatile void* const lhs,
                                               const volatile void* const rhs) {
    f_.join(*((volatile T*)lhs), *((const volatile T*)rhs));
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(volatile T& lhs, const volatile T& rhs) const {
    f.join(lhs, rhs);
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(T& lhs, const T& rhs) const { f.join(lhs, rhs); }
};

/* 'join' function provided for array value */
template <class FunctorType, class ArgTag, class T>
struct FunctorValueJoin<
    FunctorType, ArgTag,
    T*
    // First substitution failure when FunctorType::join does not exist.
    // Second substitution failure when enable_if( & Functor::join ) does not
    // exist
    ,
    decltype(FunctorValueJoinFunction<FunctorType, ArgTag>::enable_if(
        &FunctorType::join))> {
  const FunctorType& f;

  KOKKOS_FORCEINLINE_FUNCTION
  FunctorValueJoin(const FunctorType& f_) : f(f_) {}

  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
                                               volatile void* const lhs,
                                               const volatile void* const rhs) {
    f_.join(ArgTag(), (volatile T*)lhs, (const volatile T*)rhs);
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(volatile T* const lhs, const volatile T* const rhs) const {
    f.join(ArgTag(), lhs, rhs);
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(T* lhs, const T* rhs) const { f.join(ArgTag(), lhs, rhs); }
};

/* 'join' function provided, no tag, array value */
template <class FunctorType, class T>
struct FunctorValueJoin<
    FunctorType, void,
    T*
    // First substitution failure when FunctorType::join does not exist.
    // Second substitution failure when enable_if( & Functor::join ) does not
    // exist
    ,
    decltype(FunctorValueJoinFunction<FunctorType, void>::enable_if(
        &FunctorType::join))> {
  const FunctorType& f;

  KOKKOS_FORCEINLINE_FUNCTION
  FunctorValueJoin(const FunctorType& f_) : f(f_) {}

  KOKKOS_FORCEINLINE_FUNCTION static void join(const FunctorType& f_,
                                               volatile void* const lhs,
                                               const volatile void* const rhs) {
    f_.join((volatile T*)lhs, (const volatile T*)rhs);
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(volatile T* const lhs, const volatile T* const rhs) const {
    f.join(lhs, rhs);
  }
  KOKKOS_FORCEINLINE_FUNCTION
  void operator()(T* lhs, const T* rhs) const { f.join(lhs, rhs); }
};

}  // namespace Impl
}  // namespace Kokkos

namespace Kokkos {

namespace Impl {

// Wraps a join lambda so it can be used where a functor with a 'join'
// member is expected.  (Definition continues past this chunk.)
template <typename ValueType, class JoinOp, class Enable = void>
struct JoinLambdaAdapter {
  using value_type = ValueType;
  const JoinOp& lambda;
  KOKKOS_INLINE_FUNCTION
  JoinLambdaAdapter(const JoinOp& lambda_) : 
lambda(lambda_) {} + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + lambda(dst, src); + } + + KOKKOS_INLINE_FUNCTION + void join(value_type& dst, const value_type& src) const { lambda(dst, src); } + + KOKKOS_INLINE_FUNCTION + void operator()(volatile value_type& dst, + const volatile value_type& src) const { + lambda(dst, src); + } + + KOKKOS_INLINE_FUNCTION + void operator()(value_type& dst, const value_type& src) const { + lambda(dst, src); + } +}; + +template <typename ValueType, class JoinOp> +struct JoinLambdaAdapter<ValueType, JoinOp, + decltype(FunctorValueJoinFunction< + JoinOp, void>::enable_if(&JoinOp::join))> { + using value_type = ValueType; + static_assert( + std::is_same<ValueType, typename JoinOp::value_type>::value, + "JoinLambdaAdapter static_assert Fail: ValueType != JoinOp::value_type"); + + const JoinOp& lambda; + KOKKOS_INLINE_FUNCTION + JoinLambdaAdapter(const JoinOp& lambda_) : lambda(lambda_) {} + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + lambda.join(dst, src); + } + + KOKKOS_INLINE_FUNCTION + void join(value_type& dst, const value_type& src) const { + lambda.join(dst, src); + } + + KOKKOS_INLINE_FUNCTION + void operator()(volatile value_type& dst, + const volatile value_type& src) const { + lambda.join(dst, src); + } + + KOKKOS_INLINE_FUNCTION + void operator()(value_type& dst, const value_type& src) const { + lambda.join(dst, src); + } +}; + +template <typename ValueType> +struct JoinAdd { + using value_type = ValueType; + + KOKKOS_DEFAULTED_FUNCTION + JoinAdd() = default; + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + dst += src; + } + KOKKOS_INLINE_FUNCTION + void operator()(value_type& dst, const value_type& src) const { dst += src; } + KOKKOS_INLINE_FUNCTION + void operator()(volatile value_type& dst, + const volatile value_type& src) const { + dst += src; + 
} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class ArgTag, + class T = + typename FunctorValueTraits<FunctorType, ArgTag>::reference_type> +struct FunctorValueOps; + +template <class FunctorType, class ArgTag, class T> +struct FunctorValueOps<FunctorType, ArgTag, T&> { + KOKKOS_FORCEINLINE_FUNCTION static T* pointer(T& r) { return &r; } + + KOKKOS_FORCEINLINE_FUNCTION static T& reference(void* p) { return *((T*)p); } + + KOKKOS_FORCEINLINE_FUNCTION static void copy(const FunctorType&, + void* const lhs, + const void* const rhs) { + *((T*)lhs) = *((const T*)rhs); + } +}; + +/* No 'join' function provided, array of values */ +template <class FunctorType, class ArgTag, class T> +struct FunctorValueOps<FunctorType, ArgTag, T*> { + KOKKOS_FORCEINLINE_FUNCTION static T* pointer(T* p) { return p; } + + KOKKOS_FORCEINLINE_FUNCTION static T* reference(void* p) { return ((T*)p); } + + KOKKOS_FORCEINLINE_FUNCTION static void copy(const FunctorType& f, + void* const lhs, + const void* const rhs) { + const int n = FunctorValueTraits<FunctorType, ArgTag>::value_count(f); + for (int i = 0; i < n; ++i) { + ((T*)lhs)[i] = ((const T*)rhs)[i]; + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// Compatible functions for 'final' function and value_type not an array +template <class FunctorType, class ArgTag, + bool IsArray = + 0 == FunctorValueTraits<FunctorType, ArgTag>::StaticValueSize> +struct FunctorFinalFunction { + using value_type = + typename FunctorValueTraits<FunctorType, ArgTag>::value_type; + + KOKKOS_INLINE_FUNCTION static 
void enable_if( + void (FunctorType::*)(ArgTag, value_type&) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, value_type&) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag, value_type&)); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, value_type&)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, value_type&)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&, + value_type&)); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag + // , value_type volatile & ) const ); KOKKOS_INLINE_FUNCTION static void + // enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) + // const ); KOKKOS_INLINE_FUNCTION static void enable_if( void + // (FunctorType::*)( ArgTag , value_type volatile & ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag + // const & , value_type volatile & ) ); KOKKOS_INLINE_FUNCTION static void + // enable_if( void ( *)( ArgTag , value_type volatile & ) + // ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( + // ArgTag const & , value_type volatile & ) ); + + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag, value_type const&) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, value_type const&) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag, value_type const&)); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, value_type const&)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, + value_type const&)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&, + value_type const&)); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag + // , value_type const volatile & ) const ); KOKKOS_INLINE_FUNCTION 
static void + // enable_if( void (FunctorType::*)( ArgTag const & , value_type const + // volatile & ) const ); KOKKOS_INLINE_FUNCTION static void enable_if( void + // (FunctorType::*)( ArgTag , value_type const volatile & ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag + // const & , value_type const volatile & ) ); KOKKOS_INLINE_FUNCTION static + // void enable_if( void ( *)( ArgTag , value_type const + // volatile & ) ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( + // ArgTag const & , value_type const volatile & ) ); +}; + +// Compatible functions for 'final' function and value_type is an array +template <class FunctorType, class ArgTag> +struct FunctorFinalFunction<FunctorType, ArgTag, true> { + using value_type = + typename FunctorValueTraits<FunctorType, ArgTag>::value_type; + + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag, value_type*) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, value_type*) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag, value_type*)); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, value_type*)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, value_type*)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&, + value_type*)); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag + // , value_type volatile * ) const ); KOKKOS_INLINE_FUNCTION static void + // enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) + // const ); KOKKOS_INLINE_FUNCTION static void enable_if( void + // (FunctorType::*)( ArgTag , value_type volatile * ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag + // const & , value_type volatile * ) ); KOKKOS_INLINE_FUNCTION static void + // enable_if( void ( *)( ArgTag , value_type volatile * ) 
+ // ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( + // ArgTag const & , value_type volatile * ) ); + + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag, value_type const*) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, value_type const*) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag, value_type const*)); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, value_type const*)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, + value_type const*)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&, + value_type const*)); + + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag + // , value_type const volatile * ) const ); KOKKOS_INLINE_FUNCTION static void + // enable_if( void (FunctorType::*)( ArgTag const & , value_type const + // volatile * ) const ); KOKKOS_INLINE_FUNCTION static void enable_if( void + // (FunctorType::*)( ArgTag , value_type const volatile * ) ); + // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag + // const & , value_type const volatile * ) ); KOKKOS_INLINE_FUNCTION static + // void enable_if( void ( *)( ArgTag , value_type const + // volatile * ) ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)( + // ArgTag const & , value_type const volatile * ) ); +}; + +template <class FunctorType> +struct FunctorFinalFunction<FunctorType, void, false> { + using value_type = typename FunctorValueTraits<FunctorType, void>::value_type; + + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(value_type&) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(value_type&)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(value_type&)); + + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(const value_type&) const); + 
KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(const value_type&)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(const value_type&)); +}; + +template <class FunctorType> +struct FunctorFinalFunction<FunctorType, void, true> { + using value_type = typename FunctorValueTraits<FunctorType, void>::value_type; + + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(value_type*) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(value_type*)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(value_type*)); + + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(const value_type*) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(const value_type*)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(const value_type*)); +}; + +/* No 'final' function provided */ +template <class FunctorType, class ArgTag, + class ResultType = + typename FunctorValueTraits<FunctorType, ArgTag>::reference_type, + class Enable = void> +struct FunctorFinal { + KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType&, void*) {} +}; + +/* 'final' function provided for single value but no tag*/ +template <class FunctorType, class ArgTag, class T> +struct FunctorFinal< + FunctorType, ArgTag, + T& + // First substitution failure when FunctorType::final does not exist. + // Second substitution failure when FunctorType::final is not compatible. + , + typename std::enable_if< + std::is_same<ArgTag, void>::value, + decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if( + &FunctorType::final))>::type> { + KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) { + f.final(*((T*)p)); + } +}; + +/* 'final' function provided for array value but no tag*/ +template <class FunctorType, class ArgTag, class T> +struct FunctorFinal< + FunctorType, ArgTag, + T* + // First substitution failure when FunctorType::final does not exist. 
+ // Second substitution failure when FunctorType::final is not compatible. + , + typename std::enable_if< + std::is_same<ArgTag, void>::value, + decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if( + &FunctorType::final))>::type> { + KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) { + f.final((T*)p); + } +}; + +/* 'final' function provided for single value and with tag */ +template <class FunctorType, class ArgTag, class T> +struct FunctorFinal< + FunctorType, ArgTag, + T& + // First substitution failure when FunctorType::final does not exist. + // Second substitution failure when FunctorType::final is not compatible. + , + typename std::enable_if< + !std::is_same<ArgTag, void>::value, + decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if( + &FunctorType::final))>::type> { + KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) { + f.final(ArgTag(), *((T*)p)); + } +}; + +/* 'final' function provided for array value and with tag */ +template <class FunctorType, class ArgTag, class T> +struct FunctorFinal< + FunctorType, ArgTag, + T* + // First substitution failure when FunctorType::final does not exist. + // Second substitution failure when FunctorType::final is not compatible. 
+ , + typename std::enable_if< + !std::is_same<ArgTag, void>::value, + decltype(FunctorFinalFunction<FunctorType, ArgTag>::enable_if( + &FunctorType::final))>::type> { + KOKKOS_FORCEINLINE_FUNCTION static void final(const FunctorType& f, void* p) { + f.final(ArgTag(), (T*)p); + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class ArgTag, + class ReferenceType = + typename FunctorValueTraits<FunctorType, ArgTag>::reference_type> +struct FunctorApplyFunction { + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag, ReferenceType) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, ReferenceType) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag, ReferenceType)); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ArgTag const&, ReferenceType)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, ReferenceType)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&, + ReferenceType)); +}; + +template <class FunctorType, class ReferenceType> +struct FunctorApplyFunction<FunctorType, void, ReferenceType> { + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ReferenceType) const); + KOKKOS_INLINE_FUNCTION static void enable_if( + void (FunctorType::*)(ReferenceType)); + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ReferenceType)); +}; + +template <class FunctorType> +struct FunctorApplyFunction<FunctorType, void, void> { + KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)() const); + KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)()); +}; + +template <class FunctorType, class ArgTag, class ReferenceType, + class Enable = 
void> +struct FunctorApply { + KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType&, void*) {} +}; + +/* 'apply' function provided for void value */ +template <class FunctorType, class ArgTag> +struct FunctorApply< + FunctorType, ArgTag, + void + // First substitution failure when FunctorType::apply does not exist. + // Second substitution failure when enable_if( & Functor::apply ) does not + // exist + , + decltype(FunctorApplyFunction<FunctorType, ArgTag, void>::enable_if( + &FunctorType::apply))> { + KOKKOS_FORCEINLINE_FUNCTION static void apply(FunctorType& f) { f.apply(); } + + KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType& f) { + f.apply(); + } +}; + +/* 'apply' function provided for single value */ +template <class FunctorType, class ArgTag, class T> +struct FunctorApply<FunctorType, ArgTag, + T& + // First substitution failure when FunctorType::apply does + // not exist. Second substitution failure when enable_if( & + // Functor::apply ) does not exist + , + decltype( + FunctorApplyFunction<FunctorType, ArgTag>::enable_if( + &FunctorType::apply))> { + KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType& f, void* p) { + f.apply(*((T*)p)); + } + + KOKKOS_FORCEINLINE_FUNCTION static void apply(FunctorType& f, void* p) { + f.apply(*((T*)p)); + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* KOKKOS_FUNCTORADAPTER_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a56d19ee722668389b4b43bc377d79bb7fd9799b --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -0,0 +1,811 @@ +/* +//@HEADER +// ************************************************************************ +// +// 
Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_FUNCTORANALYSIS_HPP +#define KOKKOS_FUNCTORANALYSIS_HPP + +#include <cstddef> +#include <Kokkos_Core_fwd.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Tags.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +struct FunctorPatternInterface { + struct FOR {}; + struct REDUCE {}; + struct SCAN {}; +}; + +/** \brief Query Functor and execution policy argument tag for value type. + * + * If 'value_type' is not explicitly declared in the functor + * then attempt to deduce the type from FunctorType::operator() + * interface used by the pattern and policy. + * + * For the REDUCE pattern generate a Reducer and finalization function + * derived from what is available within the functor. 
+ */ +template <typename PatternInterface, class Policy, class Functor> +struct FunctorAnalysis { + private: + using FOR = FunctorPatternInterface::FOR; + using REDUCE = FunctorPatternInterface::REDUCE; + using SCAN = FunctorPatternInterface::SCAN; + + //---------------------------------------- + + struct VOID {}; + + template <typename P = Policy, typename = std::false_type> + struct has_work_tag { + using type = void; + using wtag = VOID; + }; + + template <typename P> + struct has_work_tag<P, + typename std::is_same<typename P::work_tag, void>::type> { + using type = typename P::work_tag; + using wtag = typename P::work_tag; + }; + + using Tag = typename has_work_tag<>::type; + using WTag = typename has_work_tag<>::wtag; + + //---------------------------------------- + // Check for T::execution_space + + template <typename T, typename = std::false_type> + struct has_execution_space { + using type = void; + enum : bool { value = false }; + }; + + template <typename T> + struct has_execution_space< + T, typename std::is_same<typename T::execution_space, void>::type> { + using type = typename T::execution_space; + enum : bool { value = true }; + }; + + using policy_has_space = has_execution_space<Policy>; + using functor_has_space = has_execution_space<Functor>; + + static_assert(!policy_has_space::value || !functor_has_space::value || + std::is_same<typename policy_has_space::type, + typename functor_has_space::type>::value, + "Execution Policy and Functor execution space must match"); + + //---------------------------------------- + // Check for Functor::value_type, which is either a simple type T or T[] + + template <typename F, typename = std::false_type> + struct has_value_type { + using type = void; + }; + + template <typename F> + struct has_value_type< + F, typename std::is_same<typename F::value_type, void>::type> { + using type = typename F::value_type; + + static_assert(!std::is_reference<type>::value && + std::rank<type>::value <= 1 && + 
std::extent<type>::value == 0, + "Kokkos Functor::value_type is T or T[]"); + }; + + //---------------------------------------- + // If Functor::value_type does not exist then evaluate operator(), + // depending upon the pattern and whether the policy has a work tag, + // to determine the reduction or scan value_type. + + template <typename F, typename P = PatternInterface, + typename V = typename has_value_type<F>::type, + bool T = std::is_same<Tag, void>::value> + struct deduce_value_type { + using type = V; + }; + + template <typename F> + struct deduce_value_type<F, REDUCE, void, true> { + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(M, A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(M, M, A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(M, M, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(M, M, M, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(M, M, M, M, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(M, M, M, M, M, M, + A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(M, M, M, M, M, M, + M, A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(M, M, M, M, M, M, + M, M, A&) const); + + using type = decltype(deduce(&F::operator())); + }; + + template <typename F> + struct deduce_value_type<F, REDUCE, void, false> { + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag, M, M, A&) + const); + + template <typename M, typename 
A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag, M, M, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag, M, M, M, M, + A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag, M, M, M, M, + M, A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag, M, M, M, M, + M, M, A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag, M, M, M, M, + M, M, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag, M, M, M, M, + M, M, M, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag const&, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag const&, M, M, + A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag const&, M, M, + M, A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag const&, M, M, + M, M, A&) const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag const&, M, M, + M, M, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag const&, M, M, + M, M, M, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag const&, M, M, + M, M, M, M, M, A&) + const); + + template <typename M, typename A> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag const&, M, M, + M, M, M, M, M, M, + A&) const); + + using type = decltype(deduce(&F::operator())); + }; + + template <typename F> + struct deduce_value_type<F, SCAN, 
void, true> { + template <typename M, typename A, typename I> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(M, A&, I) const); + + using type = decltype(deduce(&F::operator())); + }; + + template <typename F> + struct deduce_value_type<F, SCAN, void, false> { + template <typename M, typename A, typename I> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag, M, A&, I) + const); + + template <typename M, typename A, typename I> + KOKKOS_INLINE_FUNCTION static A deduce(void (Functor::*)(WTag const&, M, A&, + I) const); + + using type = decltype(deduce(&F::operator())); + }; + + //---------------------------------------- + + using candidate_type = typename deduce_value_type<Functor>::type; + + enum { + candidate_is_void = std::is_same<candidate_type, void>::value, + candidate_is_array = std::rank<candidate_type>::value == 1 + }; + + //---------------------------------------- + + public: + using execution_space = typename std::conditional< + functor_has_space::value, typename functor_has_space::type, + typename std::conditional<policy_has_space::value, + typename policy_has_space::type, + Kokkos::DefaultExecutionSpace>::type>::type; + + using value_type = typename std::remove_extent<candidate_type>::type; + + static_assert(!std::is_const<value_type>::value, + "Kokkos functor operator reduce argument cannot be const"); + + private: + // Stub to avoid defining a type 'void &' + using ValueType = + typename std::conditional<candidate_is_void, VOID, value_type>::type; + + public: + using pointer_type = + typename std::conditional<candidate_is_void, void, ValueType*>::type; + + using reference_type = typename std::conditional< + candidate_is_array, ValueType*, + typename std::conditional<!candidate_is_void, ValueType&, + void>::type>::type; + + private: + template <bool IsArray, class FF> + KOKKOS_INLINE_FUNCTION static constexpr + typename std::enable_if<IsArray, unsigned>::type + get_length(FF const& f) { + return f.value_count; + } + + template 
<bool IsArray, class FF> + KOKKOS_INLINE_FUNCTION static constexpr + typename std::enable_if<!IsArray, unsigned>::type + get_length(FF const&) { + return candidate_is_void ? 0 : 1; + } + + public: + enum { + StaticValueSize = + !candidate_is_void && !candidate_is_array ? sizeof(ValueType) : 0 + }; + + KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_count( + const Functor& f) { + return FunctorAnalysis::template get_length<candidate_is_array>(f); + } + + KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_size( + const Functor& f) { + return FunctorAnalysis::template get_length<candidate_is_array>(f) * + sizeof(ValueType); + } + + //---------------------------------------- + + template <class Unknown> + KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_count( + const Unknown&) { + return candidate_is_void ? 0 : 1; + } + + template <class Unknown> + KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned value_size( + const Unknown&) { + return candidate_is_void ? 0 : sizeof(ValueType); + } + + private: + enum INTERFACE : int { + DISABLE = 0, + NO_TAG_NOT_ARRAY = 1, + NO_TAG_IS_ARRAY = 2, + HAS_TAG_NOT_ARRAY = 3, + HAS_TAG_IS_ARRAY = 4, + DEDUCED = + !std::is_same<PatternInterface, REDUCE>::value + ? DISABLE + : (std::is_same<Tag, void>::value + ? (candidate_is_array ? NO_TAG_IS_ARRAY : NO_TAG_NOT_ARRAY) + : (candidate_is_array ? 
HAS_TAG_IS_ARRAY + : HAS_TAG_NOT_ARRAY)) + }; + + //---------------------------------------- + // parallel_reduce join operator + + template <class F, INTERFACE> + struct has_join_function; + + template <class F> + struct has_join_function<F, NO_TAG_NOT_ARRAY> { + using vref_type = volatile ValueType&; + using cvref_type = const volatile ValueType&; + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(vref_type, + cvref_type) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(vref_type, + cvref_type)); + + KOKKOS_INLINE_FUNCTION static void join(F const* const f, + ValueType volatile* dst, + ValueType volatile const* src) { + f->join(*dst, *src); + } + }; + + template <class F> + struct has_join_function<F, NO_TAG_IS_ARRAY> { + using vref_type = volatile ValueType*; + using cvref_type = const volatile ValueType*; + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(vref_type, + cvref_type) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(vref_type, + cvref_type)); + + KOKKOS_INLINE_FUNCTION static void join(F const* const f, + ValueType volatile* dst, + ValueType volatile const* src) { + f->join(dst, src); + } + }; + + template <class F> + struct has_join_function<F, HAS_TAG_NOT_ARRAY> { + using vref_type = volatile ValueType&; + using cvref_type = const volatile ValueType&; + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, vref_type, + cvref_type) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag, vref_type, + cvref_type)); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag const&, + vref_type, + cvref_type) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag const&, + vref_type, + cvref_type)); + + KOKKOS_INLINE_FUNCTION static void join(F const* const f, + ValueType volatile* dst, + ValueType volatile const* src) { + f->join(WTag(), *dst, *src); + } + }; + + template <class F> + struct has_join_function<F, HAS_TAG_IS_ARRAY> { + using vref_type 
= volatile ValueType*; + using cvref_type = const volatile ValueType*; + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, vref_type, + cvref_type) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag, vref_type, + cvref_type)); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag const&, + vref_type, + cvref_type) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag const&, + vref_type, + cvref_type)); + + KOKKOS_INLINE_FUNCTION static void join(F const* const f, + ValueType volatile* dst, + ValueType volatile const* src) { + f->join(WTag(), dst, src); + } + }; + + template <class F = Functor, INTERFACE = DEDUCED, typename = void> + struct DeduceJoin { + enum : bool { value = false }; + + KOKKOS_INLINE_FUNCTION static void join(F const* const f, + ValueType volatile* dst, + ValueType volatile const* src) { + const int n = FunctorAnalysis::value_count(*f); + for (int i = 0; i < n; ++i) dst[i] += src[i]; + } + }; + + template <class F> + struct DeduceJoin<F, DISABLE, void> { + enum : bool { value = false }; + + KOKKOS_INLINE_FUNCTION static void join(F const* const, ValueType volatile*, + ValueType volatile const*) {} + }; + + template <class F, INTERFACE I> + struct DeduceJoin<F, I, + decltype(has_join_function<F, I>::enable_if(&F::join))> + : public has_join_function<F, I> { + enum : bool { value = true }; + }; + + //---------------------------------------- + + template <class, INTERFACE> + struct has_init_function; + + template <class F> + struct has_init_function<F, NO_TAG_NOT_ARRAY> { + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ValueType&) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ValueType&)); + + KOKKOS_INLINE_FUNCTION static void init(F const* const f, ValueType* dst) { + f->init(*dst); + } + }; + + template <class F> + struct has_init_function<F, NO_TAG_IS_ARRAY> { + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ValueType*) const); + + 
KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ValueType*)); + + KOKKOS_INLINE_FUNCTION static void init(F const* const f, ValueType* dst) { + f->init(dst); + } + }; + + template <class F> + struct has_init_function<F, HAS_TAG_NOT_ARRAY> { + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ValueType&) + const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag const&, + ValueType&) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag, ValueType&)); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag const&, + ValueType&)); + + KOKKOS_INLINE_FUNCTION static void init(F const* const f, ValueType* dst) { + f->init(WTag(), *dst); + } + }; + + template <class F> + struct has_init_function<F, HAS_TAG_IS_ARRAY> { + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ValueType*) + const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag const&, + ValueType*) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag, ValueType*)); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag const&, + ValueType*)); + + KOKKOS_INLINE_FUNCTION static void init(F const* const f, ValueType* dst) { + f->init(WTag(), dst); + } + }; + + template <class F = Functor, INTERFACE = DEDUCED, typename = void> + struct DeduceInit { + enum : bool { value = false }; + + KOKKOS_INLINE_FUNCTION static void init(F const* const, ValueType* dst) { + new (dst) ValueType(); + } + }; + + template <class F> + struct DeduceInit<F, DISABLE, void> { + enum : bool { value = false }; + + KOKKOS_INLINE_FUNCTION static void init(F const* const, ValueType*) {} + }; + + template <class F, INTERFACE I> + struct DeduceInit<F, I, + decltype(has_init_function<F, I>::enable_if(&F::init))> + : public has_init_function<F, I> { + enum : bool { value = true }; + }; + + //---------------------------------------- + + template <class, INTERFACE> + struct has_final_function; + + // No tag, not array + template 
<class F> + struct has_final_function<F, NO_TAG_NOT_ARRAY> { + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ValueType&) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ValueType&)); + + KOKKOS_INLINE_FUNCTION static void final(F const* const f, ValueType* dst) { + f->final(*dst); + } + }; + + // No tag, is array + template <class F> + struct has_final_function<F, NO_TAG_IS_ARRAY> { + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(ValueType*) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ValueType*)); + + KOKKOS_INLINE_FUNCTION static void final(F const* const f, ValueType* dst) { + f->final(dst); + } + }; + + // Has tag, not array + template <class F> + struct has_final_function<F, HAS_TAG_NOT_ARRAY> { + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ValueType&) + const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag const&, + ValueType&) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag, ValueType&)); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag const&, + ValueType&)); + + KOKKOS_INLINE_FUNCTION static void final(F const* const f, ValueType* dst) { + f->final(WTag(), *dst); + } + }; + + // Has tag, is array + template <class F> + struct has_final_function<F, HAS_TAG_IS_ARRAY> { + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag, ValueType*) + const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (F::*)(WTag const&, + ValueType*) const); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag, ValueType*)); + + KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(WTag const&, + ValueType*)); + + KOKKOS_INLINE_FUNCTION static void final(F const* const f, ValueType* dst) { + f->final(WTag(), dst); + } + }; + + template <class F = Functor, INTERFACE = DEDUCED, typename = void> + struct DeduceFinal { + enum : bool { value = false }; + + KOKKOS_INLINE_FUNCTION + static void final(F const* const, ValueType*) 
{} + }; + + template <class F, INTERFACE I> + struct DeduceFinal<F, I, + decltype(has_final_function<F, I>::enable_if(&F::final))> + : public has_final_function<F, I> { + enum : bool { value = true }; + }; + + //---------------------------------------- + + template <class F = Functor, typename = void> + struct DeduceTeamShmem { + enum : bool { value = false }; + + static size_t team_shmem_size(F const&, int) { return 0; } + }; + + template <class F> + struct DeduceTeamShmem< + F, typename std::enable_if<0 < sizeof(&F::team_shmem_size)>::type> { + enum : bool { value = true }; + + static size_t team_shmem_size(F const* const f, int team_size) { + return f->team_shmem_size(team_size); + } + }; + + template <class F> + struct DeduceTeamShmem< + F, typename std::enable_if<0 < sizeof(&F::shmem_size)>::type> { + enum : bool { value = true }; + + static size_t team_shmem_size(F const* const f, int team_size) { + return f->shmem_size(team_size); + } + }; + + //---------------------------------------- + + public: + inline static size_t team_shmem_size(Functor const& f) { + return DeduceTeamShmem<>::team_shmem_size(f); + } + + //---------------------------------------- + + enum { has_join_member_function = DeduceJoin<>::value }; + enum { has_init_member_function = DeduceInit<>::value }; + enum { has_final_member_function = DeduceFinal<>::value }; + + template <class MemorySpace = typename execution_space::memory_space> + struct Reducer { + private: + Functor const* const m_functor; + ValueType* const m_result; + + template <bool IsArray> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<IsArray, FunctorAnalysis::ValueType*>::type + ref() const noexcept { + return m_result; + } + + template <bool IsArray> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<!IsArray, FunctorAnalysis::ValueType&>::type + ref() const noexcept { + return *m_result; + } + + template <bool IsArray> + KOKKOS_INLINE_FUNCTION constexpr typename std::enable_if<IsArray, int>::type + 
len() const noexcept { + return m_functor->value_count; + } + + template <bool IsArray> + KOKKOS_INLINE_FUNCTION constexpr + typename std::enable_if<!IsArray, int>::type + len() const noexcept { + return candidate_is_void ? 0 : 1; + } + + public: + using reducer = Reducer; + using value_type = FunctorAnalysis::value_type; + using memory_space = MemorySpace; + using reference_type = FunctorAnalysis::reference_type; + using functor_type = Functor; // Adapts a functor + + KOKKOS_INLINE_FUNCTION constexpr value_type* data() const noexcept { + return m_result; + } + + KOKKOS_INLINE_FUNCTION constexpr reference_type reference() const noexcept { + return Reducer::template ref<candidate_is_array>(); + } + + KOKKOS_INLINE_FUNCTION constexpr int length() const noexcept { + return Reducer::template len<candidate_is_array>(); + } + + KOKKOS_INLINE_FUNCTION + void copy(ValueType* const dst, ValueType const* const src) const noexcept { + for (int i = 0; i < Reducer::template len<candidate_is_array>(); ++i) + dst[i] = src[i]; + } + + KOKKOS_INLINE_FUNCTION + void join(ValueType volatile* dst, ValueType volatile const* src) const + noexcept { + DeduceJoin<>::join(m_functor, dst, src); + } + + KOKKOS_INLINE_FUNCTION + void init(ValueType* dst) const noexcept { + DeduceInit<>::init(m_functor, dst); + } + + KOKKOS_INLINE_FUNCTION + void final(ValueType* dst) const noexcept { + DeduceFinal<>::final(m_functor, dst); + } + + Reducer(Reducer const&) = default; + Reducer(Reducer&&) = default; + Reducer& operator=(Reducer const&) = delete; + Reducer& operator=(Reducer&&) = delete; + + template <class S> + using rebind = Reducer<S>; + + KOKKOS_INLINE_FUNCTION explicit constexpr Reducer( + Functor const* arg_functor = 0, ValueType* arg_value = nullptr) noexcept + : m_functor(arg_functor), m_result(arg_value) {} + }; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- + +#endif /* KOKKOS_FUNCTORANALYSIS_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9bf9e29d0fdee715aad9eb7a8db4e22db99140a7 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_GraphImpl.hpp @@ -0,0 +1,156 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_KOKKOS_GRAPHIMPL_HPP +#define KOKKOS_IMPL_KOKKOS_GRAPHIMPL_HPP + +#include <Kokkos_Macros.hpp> + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Graph_fwd.hpp> + +#include <Kokkos_Concepts.hpp> // is_execution_policy +#include <Kokkos_PointerOwnership.hpp> +#include <impl/Kokkos_GraphImpl_fwd.hpp> + +#include <memory> // std::make_shared + +namespace Kokkos { +namespace Impl { + +struct GraphAccess { + template <class ExecutionSpace> + static Kokkos::Experimental::Graph<ExecutionSpace> construct_graph( + ExecutionSpace ex) { + //----------------------------------------// + return Kokkos::Experimental::Graph<ExecutionSpace>{ + std::make_shared<GraphImpl<ExecutionSpace>>(std::move(ex))}; + //----------------------------------------// + } + template <class ExecutionSpace> + static auto create_root_ref( + Kokkos::Experimental::Graph<ExecutionSpace>& arg_graph) { + auto const& graph_impl_ptr = arg_graph.m_impl_ptr; + + auto root_ptr = graph_impl_ptr->create_root_node_ptr(); + + return Kokkos::Experimental::GraphNodeRef<ExecutionSpace>{ + graph_impl_ptr, std::move(root_ptr)}; + } + + template <class NodeType, class... Args> + static auto make_node_shared_ptr(Args&&... 
args) { + static_assert( + Kokkos::Impl::is_specialization_of<NodeType, GraphNodeImpl>::value, + "Kokkos Internal Error in graph interface"); + return std::make_shared<NodeType>((Args &&) args...); + } + + template <class GraphImplWeakPtr, class ExecutionSpace, class Kernel, + class Predecessor> + static auto make_graph_node_ref( + GraphImplWeakPtr graph_impl, + std::shared_ptr< + Kokkos::Impl::GraphNodeImpl<ExecutionSpace, Kernel, Predecessor>> + pred_impl) { + //---------------------------------------- + return Kokkos::Experimental::GraphNodeRef<ExecutionSpace, Kernel, + Predecessor>{ + std::move(graph_impl), std::move(pred_impl)}; + //---------------------------------------- + } + + //---------------------------------------------------------------------------- + // <editor-fold desc="accessors for private members of public interface"> {{{2 + + template <class NodeRef> + static auto get_node_ptr(NodeRef&& node_ref) { + static_assert( + is_specialization_of<remove_cvref_t<NodeRef>, + Kokkos::Experimental::GraphNodeRef>::value, + "Kokkos Internal Implementation error (bad argument to " + "`GraphAccess::get_node_ptr()`)"); + return ((NodeRef &&) node_ref).get_node_ptr(); + } + + template <class NodeRef> + static auto get_graph_weak_ptr(NodeRef&& node_ref) { + static_assert( + is_specialization_of<remove_cvref_t<NodeRef>, + Kokkos::Experimental::GraphNodeRef>::value, + "Kokkos Internal Implementation error (bad argument to " + "`GraphAccess::get_graph_weak_ptr()`)"); + return ((NodeRef &&) node_ref).get_graph_weak_ptr(); + } + + // </editor-fold> end accessors for private members of public interface }}}2 + //---------------------------------------------------------------------------- +}; + +template <class Policy> +struct _add_graph_kernel_tag; + +template <template <class...> class PolicyTemplate, class... 
PolicyTraits> +struct _add_graph_kernel_tag<PolicyTemplate<PolicyTraits...>> { + using type = PolicyTemplate<PolicyTraits..., IsGraphKernelTag>; +}; + +} // end namespace Impl + +namespace Experimental { // but not for users, so... + +template <class Policy> +// requires ExecutionPolicy<Policy> +constexpr auto require(Policy const& policy, + Kokkos::Impl::KernelInGraphProperty) { + static_assert(Kokkos::is_execution_policy<Policy>::value, + "Internal implementation error!"); + return typename Kokkos::Impl::_add_graph_kernel_tag<Policy>::type{policy}; +} + +} // end namespace Experimental + +} // end namespace Kokkos + +#endif // KOKKOS_IMPL_KOKKOS_GRAPHIMPL_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp new file mode 100644 index 0000000000000000000000000000000000000000..109d37a05db2c0b2b52e7361ed3e1e7ab5008c10 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp @@ -0,0 +1,119 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_GRAPHIMPL_UTILITIES_HPP +#define KOKKOS_KOKKOS_GRAPHIMPL_UTILITIES_HPP + +#include <Kokkos_Macros.hpp> + +#include <Kokkos_Graph_fwd.hpp> + +#include <type_traits> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="is_compatible_type_erasure"> {{{1 + +template <class Src, class Dst, class Enable = void> +struct is_compatible_type_erasure : std::false_type {}; + +template <class T> +struct is_compatible_type_erasure<T, Kokkos::Experimental::TypeErasedTag> + : std::true_type {}; + +template <> +struct is_compatible_type_erasure<Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag> + : std::true_type {}; + +template <class T> +struct is_compatible_type_erasure<T, T> : std::true_type {}; + +// So there are a couple of ways we could do 
this, but I didn't want to set up +// all of the machinery to do a lazy instantiation of the convertibility +// condition in the converting constructor of GraphNodeRef, so I'm going with +// this for now: +// TODO @desul-integration make this variadic once we have a meta-conjunction +template <template <class, class, class> class Template, class TSrc, class USrc, + class VSrc, class TDst, class UDst, class VDst> +struct is_compatible_type_erasure< + Template<TSrc, USrc, VSrc>, Template<TDst, UDst, VDst>, + // Because gcc thinks this is ambiguous, we need to add this: + std::enable_if_t<!std::is_same<TSrc, TDst>::value || + !std::is_same<USrc, UDst>::value || + !std::is_same<VSrc, VDst>::value>> + : std::integral_constant< + bool, is_compatible_type_erasure<TSrc, TDst>::value && + is_compatible_type_erasure<USrc, UDst>::value && + is_compatible_type_erasure<VSrc, VDst>::value> {}; + +// </editor-fold> end is_compatible_type_erasure }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="is_more_type_erased"> {{{1 + +template <class T, class U> +struct is_more_type_erased : std::false_type {}; + +template <class T> +struct is_more_type_erased<Kokkos::Experimental::TypeErasedTag, T> + : std::true_type {}; + +template <> +struct is_more_type_erased<Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag> + : std::false_type {}; + +// TODO @desul-integration variadic version of this, like the above + +// </editor-fold> end is_more_type_erased }}}1 +//============================================================================== + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_GRAPHIMPL_UTILITIES_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphImpl_fwd.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_fwd.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..37c53075d01df464ee790b3db8546385b2447398 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_GraphImpl_fwd.hpp @@ -0,0 +1,87 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_KOKKOS_GRAPHIMPL_FWD_HPP +#define KOKKOS_IMPL_KOKKOS_GRAPHIMPL_FWD_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { +namespace Impl { + +template <class ExecutionSpace, class Kernel, class Predecessor> +struct GraphNodeImpl; + +template <class ExecutionSpace> +struct GraphImpl; + +template <class ExecutionSpace, class Policy, class Functor, + class KernelTypeTag, class... Args> +class GraphNodeKernelImpl; + +struct _graph_node_kernel_ctor_tag {}; +struct _graph_node_predecessor_ctor_tag {}; +struct _graph_node_is_root_ctor_tag {}; + +struct GraphAccess; + +// Customizable for backends +template <class ExecutionSpace> +struct GraphNodeBackendSpecificDetails; + +// Customizable for backends +template <class ExecutionSpace, class Kernel, class PredecessorRef> +struct GraphNodeBackendDetailsBeforeTypeErasure; + +// TODO move this to a more appropriate place +struct DoNotExplicitlySpecifyThisTemplateParameter; + +struct KernelInGraphProperty {}; + +struct IsGraphKernelTag {}; + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_IMPL_KOKKOS_GRAPHIMPL_FWD_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphNodeCustomization.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphNodeCustomization.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fc75f945a1ff0f7b10ee4aa12b4e25c40e6ef7a7 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_GraphNodeCustomization.hpp @@ -0,0 +1,98 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_KOKKOS_GRAPHNODECUSTOMIZATION_HPP +#define KOKKOS_IMPL_KOKKOS_GRAPHNODECUSTOMIZATION_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Graph_fwd.hpp> +#include <impl/Kokkos_GraphImpl_fwd.hpp> + +namespace Kokkos { +namespace Impl { + +// Customizable for backends +template <class ExecutionSpace, class Kernel, class PredecessorRef> +struct GraphNodeBackendDetailsBeforeTypeErasure { + protected: + //---------------------------------------------------------------------------- + // <editor-fold desc="ctors, destructor, and assignment"> {{{2 + + // Required constructors in customizations: + GraphNodeBackendDetailsBeforeTypeErasure( + ExecutionSpace const&, Kernel&, PredecessorRef const&, + GraphNodeBackendSpecificDetails<ExecutionSpace>& + /* this_as_details */) noexcept {} + GraphNodeBackendDetailsBeforeTypeErasure( + ExecutionSpace const&, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails<ExecutionSpace>& + /* this_as_details */) noexcept {} + + // Not copyable or movable at the concept level, so the default + // implementation shouldn't be either. 
+ GraphNodeBackendDetailsBeforeTypeErasure() = delete; + + GraphNodeBackendDetailsBeforeTypeErasure( + GraphNodeBackendDetailsBeforeTypeErasure const&) = delete; + + GraphNodeBackendDetailsBeforeTypeErasure( + GraphNodeBackendDetailsBeforeTypeErasure&&) = delete; + + GraphNodeBackendDetailsBeforeTypeErasure& operator =( + GraphNodeBackendDetailsBeforeTypeErasure const&) = delete; + + GraphNodeBackendDetailsBeforeTypeErasure& operator=( + GraphNodeBackendDetailsBeforeTypeErasure&&) = delete; + + ~GraphNodeBackendDetailsBeforeTypeErasure() = default; + + // </editor-fold> end ctors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- +}; + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_GRAPHNODECUSTOMIZATION_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp b/packages/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2515995c0e3861fff03c8e1345a12dd25c1e0c52 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp @@ -0,0 +1,298 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_GRAPHNODEIMPL_HPP +#define KOKKOS_IMPL_GRAPHNODEIMPL_HPP + +#include <Kokkos_Macros.hpp> + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Graph_fwd.hpp> + +#include <impl/Kokkos_SimpleTaskScheduler.hpp> // ExecutionSpaceInstanceStorage +#include <impl/Kokkos_GraphImpl.hpp> +#include <impl/Kokkos_GraphNodeCustomization.hpp> + +#include <impl/Kokkos_EBO.hpp> + +#include <memory> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="Fully type-erased GraphNodeImpl"> {{{1 + +// Base specialization for the case where both the kernel and the predecessor +// type information is type-erased +template <class ExecutionSpace> +struct GraphNodeImpl<ExecutionSpace, Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag> + : GraphNodeBackendSpecificDetails<ExecutionSpace>, + ExecutionSpaceInstanceStorage<ExecutionSpace> { + public: + using node_ref_t = + Kokkos::Experimental::GraphNodeRef<ExecutionSpace, + Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag>; + + protected: + using implementation_base_t = GraphNodeBackendSpecificDetails<ExecutionSpace>; + using execution_space_storage_base_t = + ExecutionSpaceInstanceStorage<ExecutionSpace>; + + public: + virtual ~GraphNodeImpl() = default; + + protected: + //---------------------------------------------------------------------------- + // <editor-fold desc="protected ctors and destructors"> {{{2 + + explicit GraphNodeImpl(ExecutionSpace const& ex) noexcept + : implementation_base_t(), execution_space_storage_base_t(ex) {} + + // </editor-fold> end protected ctors and destructors }}}2 + //---------------------------------------------------------------------------- + + public: + 
//---------------------------------------------------------------------------- + // <editor-fold desc="public(-ish) constructors"> {{{2 + + template <class... Args> + GraphNodeImpl(ExecutionSpace const& ex, _graph_node_is_root_ctor_tag, + Args&&... args) noexcept + : implementation_base_t(_graph_node_is_root_ctor_tag{}, + (Args &&) args...), + execution_space_storage_base_t(ex) {} + + // </editor-fold> end public(-ish) constructors }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="no other constructors"> {{{2 + + GraphNodeImpl() = delete; + GraphNodeImpl(GraphNodeImpl const&) = delete; + GraphNodeImpl(GraphNodeImpl&&) = delete; + GraphNodeImpl& operator=(GraphNodeImpl const&) = delete; + GraphNodeImpl& operator=(GraphNodeImpl&&) = delete; + + // </editor-fold> end no other constructors }}}2 + //---------------------------------------------------------------------------- + + ExecutionSpace const& execution_space_instance() const { + return this->execution_space_storage_base_t::execution_space_instance(); + } +}; + +// </editor-fold> end Fully type-erased GraphNodeImpl }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="Type-erased predecessor GraphNodeImpl"> {{{1 + +// Specialization for the case with the concrete type of the kernel, but the +// predecessor erased. 
+template <class ExecutionSpace, class Kernel> +struct GraphNodeImpl<ExecutionSpace, Kernel, + Kokkos::Experimental::TypeErasedTag> + : GraphNodeImpl<ExecutionSpace, Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag> { + private: + using base_t = + GraphNodeImpl<ExecutionSpace, Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag>; + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="public member types"> {{{2 + + using node_ref_t = + Kokkos::Experimental::GraphNodeRef<ExecutionSpace, Kernel, + Kokkos::Experimental::TypeErasedTag>; + using kernel_type = Kernel; + + // </editor-fold> end public member types }}}2 + //---------------------------------------------------------------------------- + + private: + //---------------------------------------------------------------------------- + // <editor-fold desc="private data members"> {{{2 + + Kernel m_kernel; + + // </editor-fold> end private data members }}}2 + //---------------------------------------------------------------------------- + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="Ctors, destructors, and assignment"> {{{2 + + template <class KernelDeduced> + GraphNodeImpl(ExecutionSpace const& ex, _graph_node_kernel_ctor_tag, + KernelDeduced&& arg_kernel) + : base_t(ex), m_kernel((KernelDeduced &&) arg_kernel) {} + + template <class... Args> + GraphNodeImpl(ExecutionSpace const& ex, _graph_node_is_root_ctor_tag, + Args&&... args) + : base_t(ex, _graph_node_is_root_ctor_tag{}, (Args &&) args...) 
{} + + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // <editor-fold desc="Rule of 6 for not copyable or movable"> {{{3 + + // Not copyable or movable + GraphNodeImpl() = delete; + GraphNodeImpl(GraphNodeImpl const&) = delete; + GraphNodeImpl(GraphNodeImpl&&) = delete; + GraphNodeImpl& operator=(GraphNodeImpl const&) = delete; + GraphNodeImpl& operator=(GraphNodeImpl&&) = delete; + ~GraphNodeImpl() override = default; + + // </editor-fold> end Rule of 6 for not copyable or movable }}}3 + //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + // </editor-fold> end Ctors, destructors, and assignment }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="member accessors"> {{{2 + + // Reference qualified to prevent dangling reference to data member + Kernel& get_kernel() & { return m_kernel; } + Kernel const& get_kernel() const& { return m_kernel; } + Kernel&& get_kernel() && = delete; + + // </editor-fold> end member accessors }}}2 + //---------------------------------------------------------------------------- +}; + +// </editor-fold> end Type-erased predecessor GraphNodeImpl }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="Fully concrete GraphNodeImpl"> {{{1 + +// Specialization for the case where nothing is type-erased +template <class ExecutionSpace, class Kernel, class PredecessorRef> +struct GraphNodeImpl + : GraphNodeImpl<ExecutionSpace, Kernel, + Kokkos::Experimental::TypeErasedTag>, + GraphNodeBackendDetailsBeforeTypeErasure<ExecutionSpace, Kernel, + PredecessorRef> { + private: + using base_t = GraphNodeImpl<ExecutionSpace, Kernel, + Kokkos::Experimental::TypeErasedTag>; + using backend_details_base_t = + 
GraphNodeBackendDetailsBeforeTypeErasure<ExecutionSpace, Kernel, + PredecessorRef>; + // The fully type-erased base type, for the destroy function + using type_erased_base_t = + GraphNodeImpl<ExecutionSpace, Kokkos::Experimental::TypeErasedTag, + Kokkos::Experimental::TypeErasedTag>; + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="public data members"> {{{2 + + using node_ref_t = Kokkos::Experimental::GraphNodeRef<ExecutionSpace, Kernel, + PredecessorRef>; + + // </editor-fold> end public data members }}}2 + //---------------------------------------------------------------------------- + + private: + //---------------------------------------------------------------------------- + // <editor-fold desc="private data members"> {{{2 + + PredecessorRef m_predecessor_ref; + + // </editor-fold> end private data members }}}2 + //---------------------------------------------------------------------------- + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="Ctors, destructors, and assignment"> {{{2 + + // Not copyable or movable + GraphNodeImpl() = delete; + GraphNodeImpl(GraphNodeImpl const&) = delete; + GraphNodeImpl(GraphNodeImpl&&) = delete; + GraphNodeImpl& operator=(GraphNodeImpl const&) = delete; + GraphNodeImpl& operator=(GraphNodeImpl&&) = delete; + ~GraphNodeImpl() override = default; + + // Normal kernel-and-predecessor constructor + template <class KernelDeduced, class PredecessorPtrDeduced> + GraphNodeImpl(ExecutionSpace const& ex, _graph_node_kernel_ctor_tag, + KernelDeduced&& arg_kernel, _graph_node_predecessor_ctor_tag, + PredecessorPtrDeduced&& arg_predecessor) + : base_t(ex, _graph_node_kernel_ctor_tag{}, + (KernelDeduced &&) arg_kernel), + // The backend gets the ability to store (weak, non-owning) references + // to the kernel in it's final resting place here if it wants. 
The + // predecessor is already a pointer, so it doesn't matter that it isn't + // already at its final address + backend_details_base_t(ex, this->base_t::get_kernel(), arg_predecessor, + *this), + m_predecessor_ref((PredecessorPtrDeduced &&) arg_predecessor) {} + + // Root-tagged constructor + template <class... Args> + GraphNodeImpl(ExecutionSpace const& ex, _graph_node_is_root_ctor_tag, + Args&&... args) + : base_t(ex, _graph_node_is_root_ctor_tag{}, (Args &&) args...), + backend_details_base_t(ex, _graph_node_is_root_ctor_tag{}, *this), + m_predecessor_ref() {} + + // </editor-fold> end Ctors, destructors, and assignment }}}2 + //------------------------------------------------------------------------------ +}; + +// </editor-fold> end Fully concrete GraphNodeImpl }}}1 +//============================================================================== +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_IMPL_GRAPHNODEIMPL_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5c0eaa0a1ef80fa02e2f745f1d7e53d6fc45b8d3 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp @@ -0,0 +1,376 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <cstddef> +#include <cstdlib> +#include <cstdint> +#include <cstring> + +#include <iostream> +#include <sstream> +#include <cstring> +#include <algorithm> + +#include <Kokkos_HBWSpace.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_MemorySpace.hpp> +#include <Kokkos_Atomic.hpp> +#ifdef KOKKOS_ENABLE_HBWSPACE +#include <memkind.h> +#endif + +#include <impl/Kokkos_Tools.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_HBWSPACE +#define MEMKIND_TYPE MEMKIND_HBW // hbw_get_kind(HBW_PAGESIZE_4KB) + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Experimental { + +/* Default allocation mechanism */ +HBWSpace::HBWSpace() : m_alloc_mech(HBWSpace::STD_MALLOC) { + printf("Init\n"); + setenv("MEMKIND_HBW_NODES", "1", 0); +} + +/* Default allocation mechanism */ +HBWSpace::HBWSpace(const HBWSpace::AllocationMechanism &arg_alloc_mech) + : m_alloc_mech(HBWSpace::STD_MALLOC) { + printf("Init2\n"); + setenv("MEMKIND_HBW_NODES", "1", 0); + if (arg_alloc_mech == STD_MALLOC) { + m_alloc_mech = HBWSpace::STD_MALLOC; + } +} + +void *HBWSpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} +void *HBWSpace::allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *HBWSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + static_assert(sizeof(void *) == sizeof(uintptr_t), + "Error sizeof(void*) != sizeof(uintptr_t)"); + + 
static_assert( + Kokkos::Impl::power_of_two<Kokkos::Impl::MEMORY_ALIGNMENT>::value, + "Memory alignment must be power of two"); + + constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; + constexpr uintptr_t alignment_mask = alignment - 1; + + void *ptr = nullptr; + + if (arg_alloc_size) { + if (m_alloc_mech == STD_MALLOC) { + // Over-allocate to and round up to guarantee proper alignment. + size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; + + void *alloc_ptr = memkind_malloc(MEMKIND_TYPE, size_padded); + + if (alloc_ptr) { + uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr); + + // offset enough to record the alloc_ptr + address += sizeof(void *); + uintptr_t rem = address % alignment; + uintptr_t offset = rem ? (alignment - rem) : 0u; + address += offset; + ptr = reinterpret_cast<void *>(address); + // record the alloc'd pointer + address -= sizeof(void *); + *reinterpret_cast<void **>(address) = alloc_ptr; + } + } + } + + if ((ptr == nullptr) || (reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0)) || + (reinterpret_cast<uintptr_t>(ptr) & alignment_mask)) { + std::ostringstream msg; + msg << "Kokkos::Experimental::HBWSpace::allocate[ "; + switch (m_alloc_mech) { + case STD_MALLOC: msg << "STD_MALLOC"; break; + case POSIX_MEMALIGN: msg << "POSIX_MEMALIGN"; break; + case POSIX_MMAP: msg << "POSIX_MMAP"; break; + case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC"; break; + } + msg << " ]( " << arg_alloc_size << " ) FAILED"; + if (ptr == nullptr) { + msg << " nullptr"; + } else { + msg << " NOT ALIGNED " << ptr; + } + + std::cerr << msg.str() << std::endl; + std::cerr.flush(); + + Kokkos::Impl::throw_runtime_exception(msg.str()); + } + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t reported_size = + (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); + } + + return ptr; +} + +void HBWSpace::deallocate(void *const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); +} +void HBWSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void HBWSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + if (arg_alloc_ptr) { + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t reported_size = + (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); + } + + if (m_alloc_mech == STD_MALLOC) { + void *alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) - 1); + memkind_free(MEMKIND_TYPE, alloc_ptr); + } + } +} + +} // namespace Experimental +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +#ifdef KOKKOS_ENABLE_DEBUG +SharedAllocationRecord<void, void> + SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::s_root_record; +#endif + +void SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::deallocate( + SharedAllocationRecord<void, void> *arg_rec) { + delete static_cast<SharedAllocationRecord *>(arg_rec); +} + +SharedAllocationRecord<Kokkos::Experimental::HBWSpace, + void>::~SharedAllocationRecord() +#if defined( \ + KOKKOS_IMPL_INTEL_WORKAROUND_NOEXCEPT_SPECIFICATION_VIRTUAL_FUNCTION) + noexcept +#endif +{ + + 
m_space.deallocate(RecordBase::m_alloc_ptr->m_label, + SharedAllocationRecord<void, void>::m_alloc_ptr, + SharedAllocationRecord<void, void>::m_alloc_size, + (SharedAllocationRecord<void, void>::m_alloc_size - + sizeof(SharedAllocationHeader))); +} + +SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>:: + SharedAllocationRecord( + const Kokkos::Experimental::HBWSpace &arg_space, + const std::string &arg_label, const size_t arg_alloc_size, + const SharedAllocationRecord<void, void>::function_type arg_dealloc) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : SharedAllocationRecord<void, void>( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord<Kokkos::Experimental::HBWSpace, + void>::s_root_record, +#endif + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), + m_space(arg_space) { + // Fill in the Header information + RecordBase::m_alloc_ptr->m_record = + static_cast<SharedAllocationRecord<void, void> *>(this); + + strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), + SharedAllocationHeader::maximum_label_length - 1); + // Set last element zero, in case c_str is too long + RecordBase::m_alloc_ptr + ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; +} + +//---------------------------------------------------------------------------- + +void * +SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::allocate_tracked( + const Kokkos::Experimental::HBWSpace &arg_space, + const std::string &arg_alloc_label, const size_t arg_alloc_size) { + if (!arg_alloc_size) return nullptr; + + SharedAllocationRecord *const r = + allocate(arg_space, arg_alloc_label, arg_alloc_size); + + RecordBase::increment(r); + + return r->data(); +} + +void SharedAllocationRecord<Kokkos::Experimental::HBWSpace, + void>::deallocate_tracked(void *const + arg_alloc_ptr) { + if (arg_alloc_ptr != nullptr) { 
+    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);
+
+    RecordBase::decrement(r);
+  }
+}
+
+// Reallocation: allocate a new record of the requested size under the same
+// label, deep-copy min(old size, new size) bytes, then increment the new
+// record's refcount before decrementing the old one so the data stays live
+// throughout.
+void *SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::
+    reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) {
+  SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr);
+  SharedAllocationRecord *const r_new =
+      allocate(r_old->m_space, r_old->get_label(), arg_alloc_size);
+
+  Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace,
+                         Kokkos::Experimental::HBWSpace>(
+      r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size()));
+
+  RecordBase::increment(r_new);
+  RecordBase::decrement(r_old);
+
+  return r_new->data();
+}
+
+// Map a tracked user pointer back to its SharedAllocationRecord via the
+// SharedAllocationHeader stored immediately before the user data.  Throws a
+// runtime exception if alloc_ptr is null or the header's back-pointer does
+// not round-trip (i.e. the pointer was not allocated by this space).
+SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>
+    *SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>::get_record(
+        void *alloc_ptr) {
+  using Header     = SharedAllocationHeader;
+  using RecordHost =
+      SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>;
+
+  SharedAllocationHeader const *const head =
+      alloc_ptr ? Header::get_header(alloc_ptr) : nullptr;
+  // record is only dereferenced when alloc_ptr (and therefore head) is
+  // non-null: the || below short-circuits on !alloc_ptr.
+  RecordHost *const record =
+      head ? static_cast<RecordHost *>(head->m_record) : nullptr;
+
+  if (!alloc_ptr || record->m_alloc_ptr != head) {
+    Kokkos::Impl::throw_runtime_exception(std::string(
+        "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace "
+        ", void >::get_record ERROR"));
+  }
+
+  return record;
+}
+
+// Iterate records to print orphaned memory ...
+void SharedAllocationRecord<Kokkos::Experimental::HBWSpace, void>:: + print_records(std::ostream &s, const Kokkos::Experimental::HBWSpace &space, + bool detail) { +#ifdef KOKKOS_ENABLE_DEBUG + SharedAllocationRecord<void, void>::print_host_accessible_records( + s, "HBWSpace", &s_root_record, detail); +#else + throw_runtime_exception( + "SharedAllocationRecord<HBWSpace>::print_records" + " only works with KOKKOS_ENABLE_DEBUG enabled"); +#endif +} + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Experimental { +namespace { +const unsigned HBW_SPACE_ATOMIC_MASK = 0xFFFF; +const unsigned HBW_SPACE_ATOMIC_XOR_MASK = 0x5A39; +static int HBW_SPACE_ATOMIC_LOCKS[HBW_SPACE_ATOMIC_MASK + 1]; +} // namespace + +namespace Impl { +void init_lock_array_hbw_space() { + static int is_initialized = 0; + if (!is_initialized) + for (int i = 0; i < static_cast<int>(HBW_SPACE_ATOMIC_MASK + 1); i++) + HBW_SPACE_ATOMIC_LOCKS[i] = 0; +} + +bool lock_address_hbw_space(void *ptr) { + return 0 == atomic_compare_exchange( + &HBW_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & + HBW_SPACE_ATOMIC_MASK) ^ + HBW_SPACE_ATOMIC_XOR_MASK], + 0, 1); +} + +void unlock_address_hbw_space(void *ptr) { + atomic_exchange( + &HBW_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HBW_SPACE_ATOMIC_MASK) ^ + HBW_SPACE_ATOMIC_XOR_MASK], + 0); +} + +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp new file mode 100644 index 0000000000000000000000000000000000000000..79ee7e80db3115f1c9c14366e2c237c042ab0bdb --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp @@ -0,0 +1,91 @@ +/* +//@HEADER +// ************************************************************************ +// +// 
Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <impl/Kokkos_HostBarrier.hpp> +#include <impl/Kokkos_BitOps.hpp> + +#include <impl/Kokkos_HostBarrier.hpp> + +#include <thread> +#if defined(_WIN32) +#include <process.h> +#include <winsock2.h> +#include <windows.h> +#endif + +namespace Kokkos { +namespace Impl { + +void HostBarrier::impl_backoff_wait_until_equal( + int* ptr, const int v, const bool active_wait) noexcept { + unsigned count = 0u; + + while (!test_equal(ptr, v)) { + const int c = ::Kokkos::log2(++count); + if (!active_wait || c > log2_iterations_till_sleep) { + std::this_thread::sleep_for( + std::chrono::nanoseconds(c < 16 ? 256 * c : 4096)); + } else if (c > log2_iterations_till_yield) { + std::this_thread::yield(); + } +#if defined(KOKKOS_ENABLE_ASM) +#if defined(__PPC64__) + for (int j = 0; j < num_nops; ++j) { + asm volatile("nop\n"); + } + asm volatile("or 27, 27, 27" ::: "memory"); +#elif defined(__amd64) || defined(__amd64__) || defined(__x86_64) || \ + defined(__x86_64__) + for (int j = 0; j < num_nops; ++j) { + asm volatile("nop\n"); + } + asm volatile("pause\n" ::: "memory"); +#endif +#endif + } +} +} // namespace Impl +} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4b9235ab70260e3b4a80d4bec735e033f71bf443 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp @@ -0,0 +1,256 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HOST_BARRIER_HPP +#define KOKKOS_HOST_BARRIER_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Atomic.hpp> + +namespace Kokkos { +namespace Impl { + +// class HostBarrier +// +// provides a static and member interface for a barrier shared between threads +// of execution. 
+// +// *buffer* is a shared resource between the threads of execution +// *step* should be a stack variable associated with the current thread of +// execution *size* is the number of threads which share the barrier +// +// before calling any arrive type function the buffer and step must have been +// initialized to 0 and one of the following conditions must be true +// +// 1) step == 0 (i.e. first arrive call to HostBarrier), +// 2) try_wait has returned true for the current thread of execution, +// 3) a wait type function has returned for the current thread of execution, or +// 4) split_arrive returned true on the current thread of execution and it has +// called split_release +// +// The purporse of the split functions is to allow the last thread to arrive +// an opportunity to perform some actions before releasing the waiting threads +// +// If all threads have arrived (and split_release has been call if using +// split_arrive) before a wait type call, the wait may return quickly +class HostBarrier { + public: + using buffer_type = int; + static constexpr int required_buffer_size = 128; + static constexpr int required_buffer_length = + required_buffer_size / sizeof(int); + + private: + // fit the following 3 atomics within a 128 bytes while + // keeping the arrive atomic at least 64 bytes away from + // the wait atomic to reduce contention on the caches + static constexpr int arrive_idx = 32 / sizeof(int); + static constexpr int master_idx = 64 / sizeof(int); + static constexpr int wait_idx = 96 / sizeof(int); + + static constexpr int num_nops = 32; + static constexpr int iterations_till_backoff = 64; + static constexpr int log2_iterations_till_yield = 4; + static constexpr int log2_iterations_till_sleep = 6; + + public: + // will return true if call is the last thread to arrive + KOKKOS_INLINE_FUNCTION + static bool split_arrive(int* buffer, const int size, int& step, + const bool master_wait = true) noexcept { + if (size <= 1) return true; + + ++step; + 
Kokkos::memory_fence(); + const bool result = + Kokkos::atomic_fetch_add(buffer + arrive_idx, 1) == size - 1; + + if (master_wait && result) { + Kokkos::atomic_fetch_add(buffer + master_idx, 1); + } + + return result; + } + + // release waiting threads + // only the thread which received a return value of true from split_arrive + // or the thread which calls split_master_wait may call split_release + KOKKOS_INLINE_FUNCTION + static void split_release(int* buffer, const int size, const int /*step*/ + ) noexcept { + if (size <= 1) return; + Kokkos::memory_fence(); + Kokkos::atomic_fetch_sub(buffer + arrive_idx, size); + Kokkos::atomic_fetch_add(buffer + wait_idx, 1); + } + + // should only be called by the master thread, will allow the master thread to + // resume after all threads have arrived + KOKKOS_INLINE_FUNCTION + static void split_master_wait(int* buffer, const int size, const int step, + const bool active_wait = true) noexcept { + if (size <= 1) return; + wait_until_equal(buffer + master_idx, step, active_wait); + } + + // arrive, last thread automatically release waiting threads + KOKKOS_INLINE_FUNCTION + static void arrive(int* buffer, const int size, int& step) noexcept { + if (size <= 1) return; + if (split_arrive(buffer, size, step)) { + split_release(buffer, size, step); + } + } + + // test if all threads have arrived + KOKKOS_INLINE_FUNCTION + static bool try_wait(int* buffer, const int size, const int step) noexcept { + if (size <= 1) return true; + return test_equal(buffer + wait_idx, step); + } + + // wait for all threads to arrive + KOKKOS_INLINE_FUNCTION + static void wait(int* buffer, const int size, const int step, + bool active_wait = true) noexcept { + if (size <= 1) return; + wait_until_equal(buffer + wait_idx, step, active_wait); + } + + public: + KOKKOS_INLINE_FUNCTION + bool split_arrive(const bool master_wait = true) const noexcept { + return split_arrive(m_buffer, m_size, m_step, master_wait); + } + + KOKKOS_INLINE_FUNCTION + void 
split_release() const noexcept { + split_release(m_buffer, m_size, m_step); + } + + KOKKOS_INLINE_FUNCTION + void split_master_wait(const bool active_wait = true) noexcept { + split_master_wait(m_buffer, m_size, m_step, active_wait); + } + + KOKKOS_INLINE_FUNCTION + void arrive() const noexcept { return arrive(m_buffer, m_size, m_step); } + + KOKKOS_INLINE_FUNCTION + bool try_wait() const noexcept { return try_wait(m_buffer, m_size, m_step); } + + KOKKOS_INLINE_FUNCTION + void wait() const noexcept { wait(m_buffer, m_size, m_step); } + + HostBarrier() = default; + HostBarrier(HostBarrier&&) = default; + HostBarrier& operator=(HostBarrier&&) = default; + + KOKKOS_INLINE_FUNCTION + HostBarrier(int size, int* buffer) + : m_size{size}, m_step{0u}, m_buffer{buffer} {} + + HostBarrier(const HostBarrier&) = delete; + HostBarrier& operator=(const HostBarrier&) = delete; + + private: + KOKKOS_INLINE_FUNCTION + static bool test_equal(int* ptr, int v) noexcept { + const bool result = Kokkos::atomic_fetch_add(ptr, 0) == v; + if (result) { + Kokkos::memory_fence(); + } + return result; + } + + KOKKOS_INLINE_FUNCTION + static void wait_until_equal(int* ptr, const int v, + bool active_wait = true) noexcept { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + bool result = test_equal(ptr, v); + for (int i = 0; !result && i < iterations_till_backoff; ++i) { +#if defined(KOKKOS_ENABLE_ASM) +#if defined(_WIN32) + for (int j = 0; j < num_nops; ++j) { + __asm__ __volatile__("nop\n"); + } + __asm__ __volatile__("pause\n" ::: "memory"); +#elif defined(__PPC64__) + for (int j = 0; j < num_nops; ++j) { + asm volatile("nop\n"); + } + asm volatile("or 27, 27, 27" ::: "memory"); +#elif defined(__amd64) || defined(__amd64__) || defined(__x86_64) || \ + defined(__x86_64__) + for (int j = 0; j < num_nops; ++j) { + asm volatile("nop\n"); + } + asm volatile("pause\n" ::: "memory"); +#endif +#endif + result = test_equal(ptr, v); + } + if (!result) { + impl_backoff_wait_until_equal(ptr, v, 
active_wait); + } +#else + (void)active_wait; + while (!test_equal(ptr, v)) { + } +#endif + } + + static void impl_backoff_wait_until_equal(int* ptr, const int v, + const bool active_wait) noexcept; + + private: + int m_size{0}; + mutable int m_step{0}; + int* m_buffer{nullptr}; +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_HOST_BARRIER_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp new file mode 100644 index 0000000000000000000000000000000000000000..97286dd07f4ea2ee94f3070768f425e2ef5b7896 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp @@ -0,0 +1,178 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_HOST_SHARED_PTR_HPP +#define KOKKOS_IMPL_HOST_SHARED_PTR_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Atomic.hpp> + +#include <functional> + +namespace Kokkos { +namespace Impl { + +template <typename T> +class HostSharedPtr { + public: + using element_type = T; + + KOKKOS_DEFAULTED_FUNCTION constexpr HostSharedPtr() = default; + KOKKOS_FUNCTION constexpr HostSharedPtr(std::nullptr_t) {} + + explicit HostSharedPtr(T* element_ptr) + : HostSharedPtr(element_ptr, [](T* const t) { delete t; }) {} + + template <class Deleter> + HostSharedPtr(T* element_ptr, const Deleter& deleter) + : m_element_ptr(element_ptr) { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert(std::is_invocable_v<Deleter, T*> && + std::is_copy_constructible_v<Deleter>); +#endif + if (element_ptr) { + try { + m_control = new Control{deleter, 1}; + } catch (...) 
{ + deleter(element_ptr); + throw; + } + } + } + + KOKKOS_FUNCTION HostSharedPtr(HostSharedPtr&& other) noexcept + : m_element_ptr(other.m_element_ptr), m_control(other.m_control) { + other.m_element_ptr = nullptr; + other.m_control = nullptr; + } + + KOKKOS_FUNCTION HostSharedPtr(const HostSharedPtr& other) noexcept + : m_element_ptr(other.m_element_ptr), m_control(other.m_control) { + // FIXME_OPENMPTARGET requires something like KOKKOS_IMPL_IF_ON_HOST +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1); +#endif + } + + KOKKOS_FUNCTION HostSharedPtr& operator=(HostSharedPtr&& other) noexcept { + if (&other != this) { + cleanup(); + m_element_ptr = other.m_element_ptr; + other.m_element_ptr = nullptr; + m_control = other.m_control; + other.m_control = nullptr; + } + return *this; + } + + KOKKOS_FUNCTION HostSharedPtr& operator=( + const HostSharedPtr& other) noexcept { + if (&other != this) { + cleanup(); + m_element_ptr = other.m_element_ptr; + m_control = other.m_control; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1); +#endif + } + return *this; + } + + KOKKOS_FUNCTION ~HostSharedPtr() { cleanup(); } + + // returns the stored pointer + KOKKOS_FUNCTION T* get() const noexcept { return m_element_ptr; } + // dereferences the stored pointer + KOKKOS_FUNCTION T& operator*() const noexcept { + KOKKOS_EXPECTS(bool(*this)); + return *get(); + } + // dereferences the stored pointer + KOKKOS_FUNCTION T* operator->() const noexcept { + KOKKOS_EXPECTS(bool(*this)); + return get(); + } + + // checks if the stored pointer is not null + KOKKOS_FUNCTION explicit operator bool() const noexcept { + return get() != nullptr; + } + + // returns the number of HostSharedPtr instances managing the curent object or + // 0 if there is no managed object. + int use_count() const noexcept { + return m_control ? 
m_control->m_counter : 0; + } + + private: + KOKKOS_FUNCTION void cleanup() noexcept { + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + // If m_counter is set, then this instance is responsible for managing the + // object pointed to by m_counter and m_element_ptr. + if (m_control) { + int const count = Kokkos::atomic_fetch_sub(&(m_control->m_counter), 1); + if (count == 1) { + (m_control->m_deleter)(m_element_ptr); + m_element_ptr = nullptr; + delete m_control; + m_control = nullptr; + } + } +#endif + } + + struct Control { + std::function<void(T*)> m_deleter; + int m_counter; + }; + + T* m_element_ptr = nullptr; + Control* m_control = nullptr; +}; +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ed46d170e53ebb58e118c8d020073ed12d3c1064 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -0,0 +1,501 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_MemorySpace.hpp> +#include <impl/Kokkos_Tools.hpp> + +/*--------------------------------------------------------------------------*/ + +#if defined(__INTEL_COMPILER) && !defined(KOKKOS_ENABLE_CUDA) + +// Intel specialized allocator does not interoperate with CUDA memory allocation + +#define KOKKOS_ENABLE_INTEL_MM_ALLOC + +#endif + +/*--------------------------------------------------------------------------*/ + +#if defined(KOKKOS_ENABLE_POSIX_MEMALIGN) + +#include <unistd.h> +#include <sys/mman.h> + +/* mmap flags for private anonymous memory allocation */ + +#if defined(MAP_ANONYMOUS) && defined(MAP_PRIVATE) +#define KOKKOS_IMPL_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) +#elif defined(MAP_ANON) && defined(MAP_PRIVATE) +#define KOKKOS_IMPL_POSIX_MMAP_FLAGS (MAP_PRIVATE | 
MAP_ANON) +#endif + +// mmap flags for huge page tables +// the Cuda driver does not interoperate with MAP_HUGETLB +#if defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS) +#if defined(MAP_HUGETLB) && !defined(KOKKOS_ENABLE_CUDA) +#define KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE \ + (KOKKOS_IMPL_POSIX_MMAP_FLAGS | MAP_HUGETLB) +#else +#define KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE KOKKOS_IMPL_POSIX_MMAP_FLAGS +#endif +#endif + +#endif + +/*--------------------------------------------------------------------------*/ + +#include <cstddef> +#include <cstdlib> +#include <cstdint> +#include <cstring> + +#include <iostream> +#include <sstream> +#include <cstring> + +#include <Kokkos_HostSpace.hpp> +#include <impl/Kokkos_Error.hpp> +#include <Kokkos_Atomic.hpp> + +#if (defined(KOKKOS_ENABLE_ASM) || defined(KOKKOS_ENABLE_TM)) && \ + defined(KOKKOS_ENABLE_ISA_X86_64) && !defined(KOKKOS_COMPILER_PGI) +#include <immintrin.h> +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/* Default allocation mechanism */ +HostSpace::HostSpace() + : m_alloc_mech( +#if defined(KOKKOS_ENABLE_INTEL_MM_ALLOC) + HostSpace::INTEL_MM_ALLOC +#elif defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS) + HostSpace::POSIX_MMAP +#elif defined(KOKKOS_ENABLE_POSIX_MEMALIGN) + HostSpace::POSIX_MEMALIGN +#else + HostSpace::STD_MALLOC +#endif + ) { +} + +/* Default allocation mechanism */ +HostSpace::HostSpace(const HostSpace::AllocationMechanism &arg_alloc_mech) + : m_alloc_mech(HostSpace::STD_MALLOC) { + if (arg_alloc_mech == STD_MALLOC) { + m_alloc_mech = HostSpace::STD_MALLOC; + } +#if defined(KOKKOS_ENABLE_INTEL_MM_ALLOC) + else if (arg_alloc_mech == HostSpace::INTEL_MM_ALLOC) { + m_alloc_mech = HostSpace::INTEL_MM_ALLOC; + } +#elif defined(KOKKOS_ENABLE_POSIX_MEMALIGN) + else if (arg_alloc_mech == HostSpace::POSIX_MEMALIGN) { + m_alloc_mech = HostSpace::POSIX_MEMALIGN; + } +#elif 
defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS) + else if (arg_alloc_mech == HostSpace::POSIX_MMAP) { + m_alloc_mech = HostSpace::POSIX_MMAP; + } +#endif + else { + const char *const mech = + (arg_alloc_mech == HostSpace::INTEL_MM_ALLOC) + ? "INTEL_MM_ALLOC" + : ((arg_alloc_mech == HostSpace::POSIX_MEMALIGN) + ? "POSIX_MEMALIGN" + : ((arg_alloc_mech == HostSpace::POSIX_MMAP) ? "POSIX_MMAP" + : "")); + + std::string msg; + msg.append("Kokkos::HostSpace "); + msg.append(mech); + msg.append(" is not available"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void *HostSpace::allocate(const size_t arg_alloc_size) const { + return allocate("[unlabeled]", arg_alloc_size); +} +void *HostSpace::allocate(const char *arg_label, const size_t arg_alloc_size, + const size_t + + arg_logical_size) const { + return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +} +void *HostSpace::impl_allocate( + const char *arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + const size_t reported_size = + (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; + static_assert(sizeof(void *) == sizeof(uintptr_t), + "Error sizeof(void*) != sizeof(uintptr_t)"); + + static_assert( + Kokkos::Impl::is_integral_power_of_two(Kokkos::Impl::MEMORY_ALIGNMENT), + "Memory alignment must be power of two"); + + constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; + constexpr uintptr_t alignment_mask = alignment - 1; + + void *ptr = nullptr; + + if (arg_alloc_size) { + if (m_alloc_mech == STD_MALLOC) { + // Over-allocate to and round up to guarantee proper alignment. + size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; + + void *alloc_ptr = malloc(size_padded); + + if (alloc_ptr) { + auto address = reinterpret_cast<uintptr_t>(alloc_ptr); + + // offset enough to record the alloc_ptr + address += sizeof(void *); + uintptr_t rem = address % alignment; + uintptr_t offset = rem ? 
(alignment - rem) : 0u; + address += offset; + ptr = reinterpret_cast<void *>(address); + // record the alloc'd pointer + address -= sizeof(void *); + *reinterpret_cast<void **>(address) = alloc_ptr; + } + } +#if defined(KOKKOS_ENABLE_INTEL_MM_ALLOC) + else if (m_alloc_mech == INTEL_MM_ALLOC) { + ptr = _mm_malloc(arg_alloc_size, alignment); + } +#endif + +#if defined(KOKKOS_ENABLE_POSIX_MEMALIGN) + else if (m_alloc_mech == POSIX_MEMALIGN) { + posix_memalign(&ptr, alignment, arg_alloc_size); + } +#endif + +#if defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS) + else if (m_alloc_mech == POSIX_MMAP) { + constexpr size_t use_huge_pages = (1u << 27); + constexpr int prot = PROT_READ | PROT_WRITE; + const int flags = arg_alloc_size < use_huge_pages + ? KOKKOS_IMPL_POSIX_MMAP_FLAGS + : KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE; + + // read write access to private memory + + ptr = + mmap(nullptr /* address hint, if nullptr OS kernel chooses address */ + , + arg_alloc_size /* size in bytes */ + , + prot /* memory protection */ + , + flags /* visibility of updates */ + , + -1 /* file descriptor */ + , + 0 /* offset */ + ); + + /* Associated reallocation: + ptr = mremap( old_ptr , old_size , new_size , MREMAP_MAYMOVE ); + */ + } +#endif + } + + if ((ptr == nullptr) || (reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0)) || + (reinterpret_cast<uintptr_t>(ptr) & alignment_mask)) { + Experimental::RawMemoryAllocationFailure::FailureMode failure_mode = + Experimental::RawMemoryAllocationFailure::FailureMode:: + AllocationNotAligned; + if (ptr == nullptr) { + failure_mode = Experimental::RawMemoryAllocationFailure::FailureMode:: + OutOfMemoryError; + } + + Experimental::RawMemoryAllocationFailure::AllocationMechanism alloc_mec = + Experimental::RawMemoryAllocationFailure::AllocationMechanism:: + StdMalloc; + switch (m_alloc_mech) { + case STD_MALLOC: break; // default + case POSIX_MEMALIGN: + alloc_mec = Experimental::RawMemoryAllocationFailure:: + AllocationMechanism::PosixMemAlign; + break; + case 
POSIX_MMAP: + alloc_mec = Experimental::RawMemoryAllocationFailure:: + AllocationMechanism::PosixMMap; + break; + case INTEL_MM_ALLOC: + alloc_mec = Experimental::RawMemoryAllocationFailure:: + AllocationMechanism::IntelMMAlloc; + break; + } + + throw Kokkos::Experimental::RawMemoryAllocationFailure( + arg_alloc_size, alignment, failure_mode, alloc_mec); + } + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); + } + return ptr; +} + +void HostSpace::deallocate(void *const arg_alloc_ptr, + const size_t arg_alloc_size) const { + deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); +} + +void HostSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, + const size_t + + arg_logical_size) const { + impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); +} +void HostSpace::impl_deallocate( + const char *arg_label, void *const arg_alloc_ptr, + const size_t arg_alloc_size, const size_t arg_logical_size, + const Kokkos::Tools::SpaceHandle arg_handle) const { + if (arg_alloc_ptr) { + size_t reported_size = + (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, + reported_size); + } + if (m_alloc_mech == STD_MALLOC) { + void *alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) - 1); + free(alloc_ptr); + } +#if defined(KOKKOS_ENABLE_INTEL_MM_ALLOC) + else if (m_alloc_mech == INTEL_MM_ALLOC) { + _mm_free(arg_alloc_ptr); + } +#endif + +#if defined(KOKKOS_ENABLE_POSIX_MEMALIGN) + else if (m_alloc_mech == POSIX_MEMALIGN) { + free(arg_alloc_ptr); + } +#endif + +#if defined(KOKKOS_IMPL_POSIX_MMAP_FLAGS) + else if (m_alloc_mech == POSIX_MMAP) { + munmap(arg_alloc_ptr, arg_alloc_size); + } +#endif + } +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +#ifdef KOKKOS_ENABLE_DEBUG +SharedAllocationRecord<void, void> + SharedAllocationRecord<Kokkos::HostSpace, void>::s_root_record; +#endif + +SharedAllocationRecord<Kokkos::HostSpace, void>::~SharedAllocationRecord() +#if defined( \ + KOKKOS_IMPL_INTEL_WORKAROUND_NOEXCEPT_SPECIFICATION_VIRTUAL_FUNCTION) + noexcept +#endif +{ + m_space.deallocate(RecordBase::m_alloc_ptr->m_label, + SharedAllocationRecord<void, void>::m_alloc_ptr, + SharedAllocationRecord<void, void>::m_alloc_size, + (SharedAllocationRecord<void, void>::m_alloc_size - + sizeof(SharedAllocationHeader))); +} + +SharedAllocationHeader *_do_allocation(Kokkos::HostSpace const &space, + std::string const &label, + size_t alloc_size) { + try { + return reinterpret_cast<SharedAllocationHeader *>( + space.allocate(alloc_size)); + } catch (Experimental::RawMemoryAllocationFailure const &failure) { + if (failure.failure_mode() == Experimental::RawMemoryAllocationFailure:: + FailureMode::AllocationNotAligned) { + // TODO: delete the misaligned memory + } + + std::cerr << "Kokkos failed to 
allocate memory for label \"" << label + << "\". Allocation using MemorySpace named \"" << space.name() + << " failed with the following error: "; + failure.print_error_message(std::cerr); + std::cerr.flush(); + Kokkos::Impl::throw_runtime_exception("Memory allocation failure"); + } + return nullptr; // unreachable +} + +SharedAllocationRecord<Kokkos::HostSpace, void>::SharedAllocationRecord( + const Kokkos::HostSpace &arg_space, const std::string &arg_label, + const size_t arg_alloc_size, + const SharedAllocationRecord<void, void>::function_type arg_dealloc) + // Pass through allocated [ SharedAllocationHeader , user_memory ] + // Pass through deallocation function + : base_t( +#ifdef KOKKOS_ENABLE_DEBUG + &SharedAllocationRecord<Kokkos::HostSpace, void>::s_root_record, +#endif + Impl::checked_allocation_with_header(arg_space, arg_label, + arg_alloc_size), + sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), + m_space(arg_space) { + this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, + arg_label); +} + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace { +const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF; +const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39; +static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK + 1]; +} // namespace + +namespace Impl { +void init_lock_array_host_space() { + static int is_initialized = 0; + if (!is_initialized) + for (int i = 0; i < static_cast<int>(HOST_SPACE_ATOMIC_MASK + 1); i++) + HOST_SPACE_ATOMIC_LOCKS[i] = 0; +} + +bool lock_address_host_space(void *ptr) { +#if defined(KOKKOS_ENABLE_ISA_X86_64) && defined(KOKKOS_ENABLE_TM) && \ + !defined(KOKKOS_COMPILER_PGI) + const unsigned status = _xbegin(); + + if (_XBEGIN_STARTED == status) { + const int val = + HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & 
HOST_SPACE_ATOMIC_MASK) ^ + HOST_SPACE_ATOMIC_XOR_MASK]; + + if (0 == val) { + HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^ + HOST_SPACE_ATOMIC_XOR_MASK] = 1; + } else { + _xabort(1); + } + + _xend(); + + return 1; + } else { +#endif + return 0 == atomic_compare_exchange( + &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & + HOST_SPACE_ATOMIC_MASK) ^ + HOST_SPACE_ATOMIC_XOR_MASK], + 0, 1); +#if defined(KOKKOS_ENABLE_ISA_X86_64) && defined(KOKKOS_ENABLE_TM) && \ + !defined(KOKKOS_COMPILER_PGI) + } +#endif +} + +void unlock_address_host_space(void *ptr) { +#if defined(KOKKOS_ENABLE_ISA_X86_64) && defined(KOKKOS_ENABLE_TM) && \ + !defined(KOKKOS_COMPILER_PGI) + const unsigned status = _xbegin(); + + if (_XBEGIN_STARTED == status) { + HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^ + HOST_SPACE_ATOMIC_XOR_MASK] = 0; + } else { +#endif + atomic_exchange( + &HOST_SPACE_ATOMIC_LOCKS[((size_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^ + HOST_SPACE_ATOMIC_XOR_MASK], + 0); +#if defined(KOKKOS_ENABLE_ISA_X86_64) && defined(KOKKOS_ENABLE_TM) && \ + !defined(KOKKOS_COMPILER_PGI) + } +#endif +} + +} // namespace Impl +} // namespace Kokkos + +//============================================================================== +// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1 + +#include <impl/Kokkos_SharedAlloc_timpl.hpp> + +namespace Kokkos { +namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) not +// performance sensitive, we explicity instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. 
+template class SharedAllocationRecordCommon<Kokkos::HostSpace>; + +} // end namespace Impl +} // end namespace Kokkos + +// </editor-fold> end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b86670346c6466b4c4cff860b8e4873bbb540ce8 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.cpp @@ -0,0 +1,143 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "Kokkos_Core.hpp" +#include "Kokkos_HostSpace_deepcopy.hpp" + +namespace Kokkos { + +namespace Impl { + +#ifndef KOKKOS_IMPL_HOST_DEEP_COPY_SERIAL_LIMIT +#define KOKKOS_IMPL_HOST_DEEP_COPY_SERIAL_LIMIT 10 * 8192 +#endif + +void hostspace_parallel_deepcopy(void* dst, const void* src, ptrdiff_t n) { + if ((n < KOKKOS_IMPL_HOST_DEEP_COPY_SERIAL_LIMIT) || + (Kokkos::DefaultHostExecutionSpace().concurrency() == 1)) { + std::memcpy(dst, src, n); + return; + } + + using policy_t = Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>; + + // Both src and dst are aligned the same way with respect to 8 byte words + if (reinterpret_cast<ptrdiff_t>(src) % 8 == + reinterpret_cast<ptrdiff_t>(dst) % 8) { + char* dst_c = reinterpret_cast<char*>(dst); + const char* src_c = reinterpret_cast<const char*>(src); + int count = 0; + // get initial bytes copied + while (reinterpret_cast<ptrdiff_t>(dst_c) % 8 != 0) { + *dst_c = *src_c; + dst_c++; + src_c++; + count++; + } + + // copy the bulk of the data + double* dst_p = reinterpret_cast<double*>(dst_c); + const double* src_p = reinterpret_cast<const double*>(src_c); + Kokkos::parallel_for("Kokkos::Impl::host_space_deepcopy_double", + policy_t(0, (n - count) / 8), + [=](const ptrdiff_t i) { dst_p[i] = src_p[i]; }); + + // get final data copied 
+ dst_c += ((n - count) / 8) * 8; + src_c += ((n - count) / 8) * 8; + char* dst_end = reinterpret_cast<char*>(dst) + n; + while (dst_c != dst_end) { + *dst_c = *src_c; + dst_c++; + src_c++; + } + return; + } + + // Both src and dst are aligned the same way with respect to 4 byte words + if (reinterpret_cast<ptrdiff_t>(src) % 4 == + reinterpret_cast<ptrdiff_t>(dst) % 4) { + char* dst_c = reinterpret_cast<char*>(dst); + const char* src_c = reinterpret_cast<const char*>(src); + int count = 0; + // get initial bytes copied + while (reinterpret_cast<ptrdiff_t>(dst_c) % 4 != 0) { + *dst_c = *src_c; + dst_c++; + src_c++; + count++; + } + + // copy the bulk of the data + int32_t* dst_p = reinterpret_cast<int32_t*>(dst_c); + const int32_t* src_p = reinterpret_cast<const int32_t*>(src_c); + Kokkos::parallel_for("Kokkos::Impl::host_space_deepcopy_int", + policy_t(0, (n - count) / 4), + [=](const ptrdiff_t i) { dst_p[i] = src_p[i]; }); + + // get final data copied + dst_c += ((n - count) / 4) * 4; + src_c += ((n - count) / 4) * 4; + char* dst_end = reinterpret_cast<char*>(dst) + n; + while (dst_c != dst_end) { + *dst_c = *src_c; + dst_c++; + src_c++; + } + return; + } + + // Src and dst are not aligned the same way, we can only to byte wise copy. 
+ { + char* dst_p = reinterpret_cast<char*>(dst); + const char* src_p = reinterpret_cast<const char*>(src); + Kokkos::parallel_for("Kokkos::Impl::host_space_deepcopy_char", + policy_t(0, n), + [=](const ptrdiff_t i) { dst_p[i] = src_p[i]; }); + } +} + +} // namespace Impl + +} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e9e0ef52a0dd6ef5254f82b9bad10d9bc569805d --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_deepcopy.hpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#include <cstdint> + +namespace Kokkos { + +namespace Impl { + +void hostspace_parallel_deepcopy(void* dst, const void* src, ptrdiff_t n); + +} // namespace Impl + +} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e5587e4a342c8c2b167f307f8c8b3a3215f304a --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -0,0 +1,311 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <limits> +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_HostThreadTeam.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_Spinwait.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void HostThreadTeamData::organize_pool(HostThreadTeamData *members[], + const int size) { + bool ok = true; + + memory_fence(); + + // Verify not already a member of a pool: + for (int rank = 0; rank < size && ok; ++rank) { + ok = (nullptr != members[rank]) && + (nullptr == members[rank]->m_pool_scratch); + } + + if (ok) { + int64_t *const root_scratch = members[0]->m_scratch; + + for (int i = m_pool_rendezvous; i < m_pool_reduce; ++i) { + root_scratch[i] = 0; + } + + { + HostThreadTeamData **const pool = + (HostThreadTeamData **)(root_scratch + m_pool_members); + + // team size == 1, league size == pool_size + + for (int rank = 0; rank < size; ++rank) { + HostThreadTeamData *const mem = members[rank]; + mem->m_pool_scratch = root_scratch; + mem->m_team_scratch = mem->m_scratch; + mem->m_pool_rank = rank; + mem->m_pool_size = size; + mem->m_team_base = rank; + mem->m_team_rank = 0; + mem->m_team_size = 1; + mem->m_team_alloc = 1; + mem->m_league_rank = rank; + mem->m_league_size = size; + mem->m_team_rendezvous_step = 0; + pool[rank] = mem; + } + } + + Kokkos::memory_fence(); + } else { + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::HostThreadTeamData::organize_pool ERROR pool already " + "exists"); + } +} + +void HostThreadTeamData::disband_pool() { + m_work_range.first = -1; + m_work_range.second = -1; + m_pool_scratch = nullptr; + m_team_scratch = nullptr; + m_pool_rank = 0; + m_pool_size = 1; + m_team_base = 0; + m_team_rank = 0; + m_team_size = 1; + m_team_alloc = 
1; + m_league_rank = 0; + m_league_size = 1; + m_team_rendezvous_step = 0; +} + +int HostThreadTeamData::organize_team(const int team_size) { + // Pool is initialized + const bool ok_pool = nullptr != m_pool_scratch; + + // Team is not set + const bool ok_team = + m_team_scratch == m_scratch && m_team_base == m_pool_rank && + m_team_rank == 0 && m_team_size == 1 && m_team_alloc == 1 && + m_league_rank == m_pool_rank && m_league_size == m_pool_size; + + if (ok_pool && ok_team) { + if (team_size <= 0) return 0; // No teams to organize + + if (team_size == 1) return 1; // Already organized in teams of one + + HostThreadTeamData *const *const pool = + (HostThreadTeamData **)(m_pool_scratch + m_pool_members); + + // "league_size" in this context is the number of concurrent teams + // that the pool can accommodate. Excess threads are idle. + const int league_size = m_pool_size / team_size; + const int team_alloc_size = m_pool_size / league_size; + const int team_alloc_rank = m_pool_rank % team_alloc_size; + const int league_rank = m_pool_rank / team_alloc_size; + const int team_base_rank = league_rank * team_alloc_size; + + m_team_scratch = pool[team_base_rank]->m_scratch; + m_team_base = team_base_rank; + // This needs to check overflow, if m_pool_size % team_alloc_size !=0 + // there are two corner cases: + // (i) if team_alloc_size == team_size there might be a non-full + // zombi team around (for example m_pool_size = 5 and team_size = 2 + // (ii) if team_alloc > team_size then the last team might have less + // threads than the others + m_team_rank = (team_base_rank + team_size <= m_pool_size) && + (team_alloc_rank < team_size) + ? 
team_alloc_rank + : -1; + m_team_size = team_size; + m_team_alloc = team_alloc_size; + m_league_rank = league_rank; + m_league_size = league_size; + m_team_rendezvous_step = 0; + + if (team_base_rank == m_pool_rank) { + // Initialize team's rendezvous memory + for (int i = m_team_rendezvous; i < m_pool_reduce; ++i) { + m_scratch[i] = 0; + } + // Make sure team's rendezvous memory initialized + // is written before proceeding. + Kokkos::memory_fence(); + } + + // Organizing threads into a team performs a barrier across the + // entire pool to insure proper initialization of the team + // rendezvous mechanism before a team rendezvous can be performed. + + if (pool_rendezvous()) { + pool_rendezvous_release(); + } + } else { + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::HostThreadTeamData::organize_team ERROR"); + } + + return 0 <= m_team_rank; +} + +void HostThreadTeamData::disband_team() { + m_team_scratch = m_scratch; + m_team_base = m_pool_rank; + m_team_rank = 0; + m_team_size = 1; + m_team_alloc = 1; + m_league_rank = m_pool_rank; + m_league_size = m_pool_size; + m_team_rendezvous_step = 0; +} + +//---------------------------------------------------------------------------- + +int HostThreadTeamData::get_work_stealing() noexcept { + pair_int_t w(-1, -1); + + // TODO DJS 3-17-2018: + // Discover why the work stealing algorithm only works when called + // by the master thread of the team. If we can refactor this section to + // remove that requirement we should be able to remove the split_master_wait + // behavior in the team and pool rendezvous algorithms + if (1 == m_team_size || team_rendezvous()) { + // Attempt first from beginning of my work range + for (int attempt = m_work_range.first < m_work_range.second; attempt;) { + // Query and attempt to update m_work_range + // from: [ w.first , w.second ) + // to: [ w.first + 1 , w.second ) = w_new + // + // If w is invalid then is just a query. 
+ + const pair_int_t w_new(w.first + 1, w.second); + + w = Kokkos::atomic_compare_exchange(&m_work_range, w, w_new); + + if (w.first < w.second) { + // m_work_range is viable + + // If steal is successful then don't repeat attempt to steal + attempt = !(w_new.first == w.first + 1 && w_new.second == w.second); + } else { + // m_work_range is not viable + w.first = -1; + w.second = -1; + + attempt = 0; + } + } + + if (w.first == -1 && m_steal_rank != m_pool_rank) { + HostThreadTeamData *const *const pool = + (HostThreadTeamData **)(m_pool_scratch + m_pool_members); + + // Attempt from beginning failed, try to steal from end of neighbor + + pair_int_t volatile *steal_range = &(pool[m_steal_rank]->m_work_range); + + for (int attempt = true; attempt;) { + // Query and attempt to update steal_work_range + // from: [ w.first , w.second ) + // to: [ w.first , w.second - 1 ) = w_new + // + // If w is invalid then is just a query. + + const pair_int_t w_new(w.first, w.second - 1); + + w = Kokkos::atomic_compare_exchange(steal_range, w, w_new); + + if (w.first < w.second) { + // steal_work_range is viable + + // If steal is successful then don't repeat attempt to steal + attempt = !(w_new.first == w.first && w_new.second == w.second - 1); + } else { + // steal_work_range is not viable, move to next member + w.first = -1; + w.second = -1; + + // We need to figure out whether the next team is active + // m_steal_rank + m_team_alloc could be the next base_rank to steal + // from but only if there are another m_team_size threads available so + // that that base rank has a full team. + m_steal_rank = + m_steal_rank + m_team_alloc + m_team_size <= m_pool_size + ? 
m_steal_rank + m_team_alloc + : 0; + + steal_range = &(pool[m_steal_rank]->m_work_range); + + // If tried all other members then don't repeat attempt to steal + attempt = m_steal_rank != m_pool_rank; + } + } + + if (w.first != -1) w.first = w.second - 1; + } + + if (1 < m_team_size) { + // Must share the work index + *((int volatile *)team_reduce()) = w.first; + + team_rendezvous_release(); + } + } else if (1 < m_team_size) { + w.first = *((int volatile *)team_reduce()); + } + + // May exit because successfully stole work and w is good. + // May exit because no work left to steal and w = (-1,-1). + +#if 0 +fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n" + , m_pool_rank , m_pool_size , w.first ); +fflush(stdout); +#endif + + return w.first; +} + +} // namespace Impl +} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d4cae7f122ed182cf88522d5d60729a0906cce5b --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -0,0 +1,1089 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP +#define KOKKOS_IMPL_HOSTTHREADTEAM_HPP + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Pair.hpp> +#include <Kokkos_Atomic.hpp> +#include <Kokkos_ExecPolicy.hpp> +#include <impl/Kokkos_FunctorAdapter.hpp> +#include <impl/Kokkos_FunctorAnalysis.hpp> +#include <impl/Kokkos_HostBarrier.hpp> + +#include <limits> // std::numeric_limits +#include <algorithm> // std::max + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class HostExecSpace> +class HostThreadTeamMember; + +class HostThreadTeamData { + public: + template <class> + friend class HostThreadTeamMember; + + // Assume upper bounds on number of threads: + // pool size <= 1024 threads + // team size <= 64 threads + + enum : int { max_pool_members = 1024 }; + enum : int { max_team_members = 64 }; + enum : int { max_pool_rendezvous = HostBarrier::required_buffer_size }; + enum : int { max_team_rendezvous = HostBarrier::required_buffer_size }; + + private: + // per-thread scratch memory buffer chunks: + // + // [ pool_members ] = [ m_pool_members .. m_pool_rendezvous ) + // [ pool_rendezvous ] = [ m_pool_rendezvous .. m_team_rendezvous ) + // [ team_rendezvous ] = [ m_team_rendezvous .. m_pool_reduce ) + // [ pool_reduce ] = [ m_pool_reduce .. m_team_reduce ) + // [ team_reduce ] = [ m_team_reduce .. m_team_shared ) + // [ team_shared ] = [ m_team_shared .. m_thread_local ) + // [ thread_local ] = [ m_thread_local .. 
m_scratch_size ) + + enum : int { m_pool_members = 0 }; + enum : int { m_pool_rendezvous = m_pool_members + max_pool_members }; + enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous }; + enum : int { m_pool_reduce = m_team_rendezvous + max_team_rendezvous }; + + using pair_int_t = Kokkos::pair<int64_t, int64_t>; + + pair_int_t m_work_range; + int64_t m_work_end; + int64_t* m_scratch; // per-thread buffer + int64_t* m_pool_scratch; // == pool[0]->m_scratch + int64_t* m_team_scratch; // == pool[ 0 + m_team_base ]->m_scratch + int m_pool_rank; + int m_pool_size; + int m_team_reduce; + int m_team_shared; + int m_thread_local; + int m_scratch_size; + int m_team_base; + int m_team_rank; + int m_team_size; + int m_team_alloc; + int m_league_rank; + int m_league_size; + int m_work_chunk; + int m_steal_rank; // work stealing rank + int mutable m_pool_rendezvous_step; + int mutable m_team_rendezvous_step; + + HostThreadTeamData* team_member(int r) const noexcept { + return ((HostThreadTeamData**)(m_pool_scratch + + m_pool_members))[m_team_base + r]; + } + + public: + inline bool team_rendezvous() const noexcept { + int* ptr = (int*)(m_team_scratch + m_team_rendezvous); + HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step); + if (m_team_rank != 0) { + HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step); + } else { + HostBarrier::split_master_wait(ptr, m_team_size, m_team_rendezvous_step); + } + + return m_team_rank == 0; + } + + inline bool team_rendezvous(const int source_team_rank) const noexcept { + int* ptr = (int*)(m_team_scratch + m_team_rendezvous); + HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step); + if (m_team_rank != source_team_rank) { + HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step); + } else { + HostBarrier::split_master_wait(ptr, m_team_size, m_team_rendezvous_step); + } + + return (m_team_rank == source_team_rank); + } + + inline void team_rendezvous_release() const noexcept { + 
HostBarrier::split_release((int*)(m_team_scratch + m_team_rendezvous), + m_team_size, m_team_rendezvous_step); + } + + inline int pool_rendezvous() const noexcept { + int* ptr = (int*)(m_pool_scratch + m_pool_rendezvous); + HostBarrier::split_arrive(ptr, m_pool_size, m_pool_rendezvous_step); + if (m_pool_rank != 0) { + HostBarrier::wait(ptr, m_pool_size, m_pool_rendezvous_step); + } else { + HostBarrier::split_master_wait(ptr, m_pool_size, m_pool_rendezvous_step); + } + + return m_pool_rank == 0; + } + + inline void pool_rendezvous_release() const noexcept { + HostBarrier::split_release((int*)(m_pool_scratch + m_pool_rendezvous), + m_pool_size, m_pool_rendezvous_step); + } + + //---------------------------------------- + + constexpr HostThreadTeamData() noexcept + : m_work_range(-1, -1), + m_work_end(0), + m_scratch(nullptr), + m_pool_scratch(nullptr), + m_team_scratch(nullptr), + m_pool_rank(0), + m_pool_size(1), + m_team_reduce(0), + m_team_shared(0), + m_thread_local(0), + m_scratch_size(0), + m_team_base(0), + m_team_rank(0), + m_team_size(1), + m_team_alloc(1), + m_league_rank(0), + m_league_size(1), + m_work_chunk(0), + m_steal_rank(0), + m_pool_rendezvous_step(0), + m_team_rendezvous_step(0) {} + + //---------------------------------------- + // Organize array of members into a pool. + // The 0th member is the root of the pool. + // Requires: members are not already in a pool. + // Requires: called by one thread. + // Pool members are ordered as "close" - sorted by NUMA and then CORE + // Each thread is its own team with team_size == 1. + static void organize_pool(HostThreadTeamData* members[], const int size); + + // Called by each thread within the pool + void disband_pool(); + + //---------------------------------------- + // Each thread within a pool organizes itself into a team. + // Must be called by all threads of the pool. 
+ // Organizing threads into a team performs a barrier across the + // entire pool to insure proper initialization of the team + // rendezvous mechanism before a team rendezvous can be performed. + // + // Return true if a valid member of a team. + // Return false if not a member and thread should be idled. + int organize_team(const int team_size); + + // Each thread within a pool disbands itself from current team. + // Each thread becomes its own team with team_size == 1. + // Must be called by all threads of the pool. + void disband_team(); + + //---------------------------------------- + + constexpr int pool_rank() const { return m_pool_rank; } + constexpr int pool_size() const { return m_pool_size; } + + HostThreadTeamData* pool_member(int r) const noexcept { + return ((HostThreadTeamData**)(m_pool_scratch + m_pool_members))[r]; + } + + //---------------------------------------- + + private: + enum : int { mask_to_16 = 0x0f }; // align to 16 bytes + enum : int { shift_to_8 = 3 }; // size to 8 bytes + + public: + static constexpr int align_to_int64(int n) { + return ((n + mask_to_16) & ~mask_to_16) >> shift_to_8; + } + + constexpr int pool_reduce_bytes() const { + return m_scratch_size ? 
sizeof(int64_t) * (m_team_reduce - m_pool_reduce) + : 0; + } + + constexpr int team_reduce_bytes() const { + return sizeof(int64_t) * (m_team_shared - m_team_reduce); + } + + constexpr int team_shared_bytes() const { + return sizeof(int64_t) * (m_thread_local - m_team_shared); + } + + constexpr int thread_local_bytes() const { + return sizeof(int64_t) * (m_scratch_size - m_thread_local); + } + + constexpr int scratch_bytes() const { + return sizeof(int64_t) * m_scratch_size; + } + + // Memory chunks: + + int64_t* scratch_buffer() const noexcept { return m_scratch; } + + int64_t* pool_reduce() const noexcept { + return m_pool_scratch + m_pool_reduce; + } + + int64_t* pool_reduce_local() const noexcept { + return m_scratch + m_pool_reduce; + } + + int64_t* team_reduce() const noexcept { + return m_team_scratch + m_team_reduce; + } + + int64_t* team_reduce_local() const noexcept { + return m_scratch + m_team_reduce; + } + + int64_t* team_shared() const noexcept { + return m_team_scratch + m_team_shared; + } + + int64_t* local_scratch() const noexcept { return m_scratch + m_thread_local; } + + // Given: + // pool_reduce_size = number bytes for pool reduce + // team_reduce_size = number bytes for team reduce + // team_shared_size = number bytes for team shared memory + // thread_local_size = number bytes for thread local memory + // Return: + // total number of bytes that must be allocated + static size_t scratch_size(int pool_reduce_size, int team_reduce_size, + int team_shared_size, int thread_local_size) { + pool_reduce_size = align_to_int64(pool_reduce_size); + team_reduce_size = align_to_int64(team_reduce_size); + team_shared_size = align_to_int64(team_shared_size); + thread_local_size = align_to_int64(thread_local_size); + + const size_t total_bytes = + (m_pool_reduce + pool_reduce_size + team_reduce_size + + team_shared_size + thread_local_size) * + sizeof(int64_t); + + return total_bytes; + } + + // Given: + // alloc_ptr = pointer to allocated memory + // 
alloc_size = number bytes of allocated memory + // pool_reduce_size = number bytes for pool reduce/scan operations + // team_reduce_size = number bytes for team reduce/scan operations + // team_shared_size = number bytes for team-shared memory + // thread_local_size = number bytes for thread-local memory + // Return: + // total number of bytes that must be allocated + void scratch_assign(void* const alloc_ptr, size_t const alloc_size, + int pool_reduce_size, int team_reduce_size, + int team_shared_size, int /* thread_local_size */) { + pool_reduce_size = align_to_int64(pool_reduce_size); + team_reduce_size = align_to_int64(team_reduce_size); + team_shared_size = align_to_int64(team_shared_size); + // thread_local_size = align_to_int64( thread_local_size ); + + m_scratch = (int64_t*)alloc_ptr; + m_team_reduce = m_pool_reduce + pool_reduce_size; + m_team_shared = m_team_reduce + team_reduce_size; + m_thread_local = m_team_shared + team_shared_size; + m_scratch_size = align_to_int64(alloc_size); + +#if 0 +fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n" + , int(m_pool_members) + , int(m_pool_rendezvous) + , int(m_pool_reduce) + , int(m_team_reduce) + , int(m_team_shared) + , int(m_thread_local) + , int(m_scratch_size) + ); +fflush(stdout); +#endif + } + + //---------------------------------------- + // Get a work index within the range. + // First try to steal from beginning of own teams's partition. + // If that fails then try to steal from end of another teams' partition. + int get_work_stealing() noexcept; + + //---------------------------------------- + // Set the initial work partitioning of [ 0 .. 
length ) among the teams + // with granularity of chunk + + void set_work_partition(int64_t const length, int const chunk) noexcept { + // Minimum chunk size to insure that + // m_work_end < std::numeric_limits<int>::max() * m_work_chunk + + int const chunk_min = (length + std::numeric_limits<int>::max()) / + std::numeric_limits<int>::max(); + + m_work_end = length; + m_work_chunk = std::max(chunk, chunk_min); + + // Number of work chunks and partitioning of that number: + int const num = (m_work_end + m_work_chunk - 1) / m_work_chunk; + int const part = (num + m_league_size - 1) / m_league_size; + + m_work_range.first = part * m_league_rank; + m_work_range.second = m_work_range.first + part; + + // Steal from next team, round robin + // The next team is offset by m_team_alloc if it fits in the pool. + + m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size + ? m_team_base + m_team_alloc + : 0; + } + + std::pair<int64_t, int64_t> get_work_partition() noexcept { + int64_t first = m_work_range.first; + int64_t second = m_work_range.second; + first *= m_work_chunk; + second *= m_work_chunk; + return std::pair<int64_t, int64_t>( + first, second < m_work_end ? second : m_work_end); + } + + std::pair<int64_t, int64_t> get_work_stealing_chunk() noexcept { + std::pair<int64_t, int64_t> x(-1, -1); + + const int i = get_work_stealing(); + + if (0 <= i) { + x.first = m_work_chunk * i; + x.second = x.first + m_work_chunk < m_work_end ? 
x.first + m_work_chunk + : m_work_end; + } + + return x; + } +}; + +//---------------------------------------------------------------------------- + +template <class HostExecSpace> +class HostThreadTeamMember { + public: + using scratch_memory_space = typename HostExecSpace::scratch_memory_space; + using execution_space = HostExecSpace; + using thread_team_member = HostThreadTeamMember; + using host_thread_team_member = HostThreadTeamMember; + + private: + scratch_memory_space m_scratch; + HostThreadTeamData& m_data; + int const m_league_rank; + int const m_league_size; + + public: + constexpr HostThreadTeamMember(HostThreadTeamData& arg_data) noexcept + : m_scratch(arg_data.team_shared(), arg_data.team_shared_bytes()), + m_data(arg_data), + m_league_rank(arg_data.m_league_rank), + m_league_size(arg_data.m_league_size) {} + + constexpr HostThreadTeamMember(HostThreadTeamData& arg_data, + int const arg_league_rank, + int const arg_league_size) noexcept + : m_scratch(arg_data.team_shared(), arg_data.team_shared_bytes(), + arg_data.team_shared(), arg_data.team_shared_bytes()), + m_data(arg_data), + m_league_rank(arg_league_rank), + m_league_size(arg_league_size) {} + + ~HostThreadTeamMember() = default; + HostThreadTeamMember() = delete; + HostThreadTeamMember(HostThreadTeamMember&&) = default; + HostThreadTeamMember(HostThreadTeamMember const&) = default; + HostThreadTeamMember& operator=(HostThreadTeamMember&&) = default; + HostThreadTeamMember& operator=(HostThreadTeamMember const&) = default; + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + int team_rank() const noexcept { return m_data.m_team_rank; } + + KOKKOS_INLINE_FUNCTION + int team_size() const noexcept { return m_data.m_team_size; } + + KOKKOS_INLINE_FUNCTION + int league_rank() const noexcept { return m_league_rank; } + + KOKKOS_INLINE_FUNCTION + int league_size() const noexcept { return m_league_size; } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + 
const scratch_memory_space& team_shmem() const { + return m_scratch.set_team_thread_mode(0, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const scratch_memory_space& team_scratch(int) const { + return m_scratch.set_team_thread_mode(0, 1, 0); + } + + KOKKOS_INLINE_FUNCTION + const scratch_memory_space& thread_scratch(int) const { + return m_scratch.set_team_thread_mode(0, m_data.m_team_size, + m_data.m_team_rank); + } + + //-------------------------------------------------------------------------- + // Team collectives + //-------------------------------------------------------------------------- + + KOKKOS_INLINE_FUNCTION void team_barrier() const noexcept +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + { + if (m_data.team_rendezvous()) { + m_data.team_rendezvous_release(); + }; + } +#else + { + } +#endif + + //-------------------------------------------------------------------------- + + template <typename T> + KOKKOS_INLINE_FUNCTION void team_broadcast(T& value, + const int source_team_rank) const + noexcept +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + { + if (1 < m_data.m_team_size) { + T volatile* const shared_value = (T*)m_data.team_reduce(); + + // Don't overwrite shared memory until all threads arrive + + if (m_data.team_rendezvous(source_team_rank)) { + // All threads have entered 'team_rendezvous' + // only this thread returned from 'team_rendezvous' + // with a return value of 'true' + + *shared_value = value; + + m_data.team_rendezvous_release(); + // This thread released all other threads from 'team_rendezvous' + // with a return value of 'false' + } else { + value = *shared_value; + } + } + } +#else + { + (void)value; + (void)source_team_rank; + Kokkos::abort("HostThreadTeamMember team_broadcast\n"); + } +#endif + + //-------------------------------------------------------------------------- + + template <class Closure, typename T> + KOKKOS_INLINE_FUNCTION void team_broadcast(Closure const& f, T& value, + const int source_team_rank) 
const + noexcept +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + { + T volatile* const shared_value = (T*)m_data.team_reduce(); + + // Don't overwrite shared memory until all threads arrive + + if (m_data.team_rendezvous(source_team_rank)) { + // All threads have entered 'team_rendezvous' + // only this thread returned from 'team_rendezvous' + // with a return value of 'true' + + f(value); + + if (1 < m_data.m_team_size) { + *shared_value = value; + } + + m_data.team_rendezvous_release(); + // This thread released all other threads from 'team_rendezvous' + // with a return value of 'false' + } else { + value = *shared_value; + } + } +#else + { + (void)f; + (void)value; + (void)source_team_rank; + Kokkos::abort("HostThreadTeamMember team_broadcast\n"); + } +#endif + + //-------------------------------------------------------------------------- + // team_reduce( Sum(result) ); + // team_reduce( Min(result) ); + // team_reduce( Max(result) ); + + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<is_reducer<ReducerType>::value>::type + team_reduce(ReducerType const& reducer) const noexcept { + team_reduce(reducer, reducer.reference()); + } + + template <typename ReducerType> + KOKKOS_INLINE_FUNCTION + typename std::enable_if<is_reducer<ReducerType>::value>::type + team_reduce(ReducerType const& reducer, + typename ReducerType::value_type contribution) const noexcept +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + { + if (1 < m_data.m_team_size) { + using value_type = typename ReducerType::value_type; + + if (0 != m_data.m_team_rank) { + // Non-root copies to their local buffer: + /*reducer.copy( (value_type*) m_data.team_reduce_local() + , reducer.data() );*/ + *((value_type*)m_data.team_reduce_local()) = contribution; + } + + // Root does not overwrite shared memory until all threads arrive + // and copy to their local buffer. 
+ + if (m_data.team_rendezvous()) { + // All threads have entered 'team_rendezvous' + // only this thread returned from 'team_rendezvous' + // with a return value of 'true' + // + // This thread sums contributed values + for (int i = 1; i < m_data.m_team_size; ++i) { + value_type* const src = + (value_type*)m_data.team_member(i)->team_reduce_local(); + + reducer.join(contribution, *src); + } + + // Copy result to root member's buffer: + // reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() ); + *((value_type*)m_data.team_reduce()) = contribution; + reducer.reference() = contribution; + m_data.team_rendezvous_release(); + // This thread released all other threads from 'team_rendezvous' + // with a return value of 'false' + } else { + // Copy from root member's buffer: + reducer.reference() = *((value_type*)m_data.team_reduce()); + } + } else { + reducer.reference() = contribution; + } + } +#else + { + (void)reducer; + (void)contribution; + Kokkos::abort("HostThreadTeamMember team_reduce\n"); + } +#endif + + //-------------------------------------------------------------------------- + + /*template< typename ValueType , class JoinOp > + KOKKOS_INLINE_FUNCTION + ValueType + team_reduce( ValueType const & value + , JoinOp const & join ) const noexcept +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + { + if ( 0 != m_data.m_team_rank ) { + // Non-root copies to their local buffer: + *((ValueType*) m_data.team_reduce_local()) = value ; + } + + // Root does not overwrite shared memory until all threads arrive + // and copy to their local buffer. 
+
+      if ( m_data.team_rendezvous() ) {
+        const Impl::Reducer< ValueType , JoinOp > reducer( join );
+
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+        //
+        // This thread sums contributed values
+
+        ValueType * const dst = (ValueType*) m_data.team_reduce_local();
+
+        *dst = value ;
+
+        for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+          ValueType * const src =
+            (ValueType*) m_data.team_member(i)->team_reduce_local();
+
+          reducer.join( dst , src );
+        }
+
+        m_data.team_rendezvous_release();
+        // This thread released all other threads from 'team_rendezvous'
+        // with a return value of 'false'
+      }
+
+      return *((ValueType*) m_data.team_reduce());
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); return ValueType(); }
+#endif*/
+
+  // Exclusive prefix scan of 'value' across the team: the two-slot per-member
+  // reduce buffer holds [0] = exclusive prefix, [1] = this member's value.
+  // If 'global' is non-null, the prior contents of *global are added into
+  // every member's result and *global is atomically advanced by the team
+  // total (via atomic_fetch_add below).
+  template <typename T>
+  KOKKOS_INLINE_FUNCTION T team_scan(T const& value,
+                                     T* const global = nullptr) const noexcept
+#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+  {
+    if (0 != m_data.m_team_rank) {
+      // Non-root copies to their local buffer:
+      ((T*)m_data.team_reduce_local())[1] = value;
+    }
+
+    // Root does not overwrite shared memory until all threads arrive
+    // and copy to their local buffer.
+
+    if (m_data.team_rendezvous()) {
+      // All threads have entered 'team_rendezvous'
+      // only this thread returned from 'team_rendezvous'
+      // with a return value of 'true'
+      //
+      // This thread scans contributed values
+
+      {
+        T* prev = (T*)m_data.team_reduce_local();
+
+        prev[0] = 0;
+        prev[1] = value;
+
+        // Serial sweep over the members: ptr[0] becomes the running
+        // exclusive prefix for member i.
+        for (int i = 1; i < m_data.m_team_size; ++i) {
+          T* const ptr = (T*)m_data.team_member(i)->team_reduce_local();
+
+          ptr[0] = prev[0] + prev[1];
+
+          prev = ptr;
+        }
+      }
+
+      // If adding to global value then atomic_fetch_add to that value
+      // and sum previous value to every entry of the scan.
+      if (global) {
+        T* prev = (T*)m_data.team_reduce_local();
+
+        {
+          // Last member's prefix + value == the team-wide total.
+          T* ptr = (T*)m_data.team_member(m_data.m_team_size - 1)
+                       ->team_reduce_local();
+          prev[0] = Kokkos::atomic_fetch_add(global, ptr[0] + ptr[1]);
+        }
+
+        // Shift every member's prefix by the previous *global value
+        // (rank 0's slot already holds it from the fetch_add above).
+        for (int i = 1; i < m_data.m_team_size; ++i) {
+          T* ptr = (T*)m_data.team_member(i)->team_reduce_local();
+          ptr[0] += prev[0];
+        }
+      }
+
+      m_data.team_rendezvous_release();
+      // Root released all other threads from 'team_rendezvous'.
+    }
+
+    return ((T*)m_data.team_reduce_local())[0];
+  }
+#else
+  {
+    (void)value;
+    (void)global;
+    Kokkos::abort("HostThreadTeamMember team_scan\n");
+    return T();
+  }
+#endif
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// Factory functions building the loop-bound structs consumed by the nested
+// parallel_* overloads below.  Enabled only for team-member types that
+// satisfy Impl::is_thread_team_member.
+template <typename iType, typename Member>
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<iType, Member>
+TeamThreadRange(
+    Member const& member, iType count,
+    typename std::enable_if<
+        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType, Member>(member, 0, count);
+}
+
+template <typename iType1, typename iType2, typename Member>
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Member>
+TeamThreadRange(
+    Member const& member, iType1 begin, iType2 end,
+    typename std::enable_if<
+        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+  return Impl::TeamThreadRangeBoundariesStruct<
+      typename std::common_type<iType1, iType2>::type, Member>(member, begin,
+                                                               end);
+}
+
+// NOTE: on host, TeamVectorRange returns the same boundaries struct as
+// TeamThreadRange -- no further vector-lane subdivision happens here.
+template <typename iType, typename Member>
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<iType, Member>
+TeamVectorRange(
+    Member const& member, iType count,
+    typename std::enable_if<
+        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType, Member>(member, 0, count);
+}
+
+template <typename iType1, typename iType2, typename Member>
+KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Member>
+TeamVectorRange(
+    Member const& member, iType1 begin, iType2 end,
+    typename std::enable_if<
+        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+  return Impl::TeamThreadRangeBoundariesStruct<
+      typename std::common_type<iType1, iType2>::type, Member>(member, begin,
+                                                               end);
+}
+
+template <typename iType, typename Member>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<iType, Member>
+ThreadVectorRange(
+    Member const& member, iType count,
+    typename std::enable_if<
+        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(member, count);
+}
+
+template <typename iType1, typename iType2, typename Member>
+KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
+    typename std::common_type<iType1, iType2>::type, Member>
+ThreadVectorRange(
+    Member const& member, iType1 arg_begin, iType2 arg_end,
+    typename std::enable_if<
+        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+  using iType = typename std::common_type<iType1, iType2>::type;
+  return Impl::ThreadVectorRangeBoundariesStruct<iType, Member>(
+      member, iType(arg_begin), iType(arg_end));
+}
+
+//----------------------------------------------------------------------------
+/** \brief Inter-thread parallel_for.
+ *
+ * Executes lambda(iType i) for each i=[0..N)
+ *
+ * The range [0..N) is mapped to all threads of the calling thread team.
+ */
+template <typename iType, class Closure, class Member>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
+    Closure const& closure,
+    typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::
+        type const** = nullptr) {
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    closure(i);
+  }
+}
+
+// Per-thread vector-range parallel_for: executed serially by the calling
+// thread; the ivdep pragma (when enabled) hints vectorization.
+template <typename iType, class Closure, class Member>
+KOKKOS_INLINE_FUNCTION void parallel_for(
+    Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
+        loop_boundaries,
+    Closure const& closure,
+    typename std::enable_if<Impl::is_host_thread_team_member<Member>::value>::
+        type const** = nullptr) {
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    closure(i);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Team-level reduction with a user-supplied Kokkos reducer: each thread
+// accumulates a private partial result over its sub-range, then combines
+// across the team via team_reduce().
+template <typename iType, class Closure, class Reducer, class Member>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Kokkos::is_reducer<Reducer>::value &&
+    Impl::is_host_thread_team_member<Member>::value>::type
+parallel_reduce(
+    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
+    Closure const& closure, Reducer const& reducer) {
+  typename Reducer::value_type value;
+  reducer.init(value);
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    closure(i, value);
+  }
+
+  loop_boundaries.thread.team_reduce(reducer, value);
+}
+
+// Convenience overload: a plain result reference implies a Sum reduction.
+template <typename iType, typename Closure, typename ValueType, typename Member>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    !Kokkos::is_reducer<ValueType>::value &&
+    Impl::is_host_thread_team_member<Member>::value>::type
+parallel_reduce(
+    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
+    Closure const& closure, ValueType& result) {
+  ValueType val;
+  Sum<ValueType> reducer(val);
+  reducer.init(val);
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    closure(i, reducer.reference());
+  }
+
+  loop_boundaries.thread.team_reduce(reducer);
+  result = reducer.reference();
+}
+
+/*template< typename iType, class Space
+         , class Closure, class Joiner , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+      const & loop_boundaries
+  , Closure const & closure
+  , Joiner const & joiner
+  , ValueType & result
+  )
+{
+  Impl::Reducer< ValueType , Joiner > reducer( joiner , & result );
+
+  reducer.init( reducer.data() );
+
+  for( iType i = loop_boundaries.start
+     ; i < loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );
+}*/
+
+//----------------------------------------------------------------------------
+/** \brief Inter-thread vector parallel_reduce.
+ *
+ * Executes lambda(iType i, ValueType & val) for each i=[0..N)
+ *
+ * The range [0..N) is mapped to all threads of the
+ * calling thread team and a summation of val is
+ * performed and put into result.
+ */
+template <typename iType, class Lambda, typename ValueType, typename Member>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    !Kokkos::is_reducer<ValueType>::value &&
+    Impl::is_host_thread_team_member<Member>::value>::type
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
+                    loop_boundaries,
+                const Lambda& lambda, ValueType& result) {
+  // Vector-level reduction is serial on host: zero-init and accumulate.
+  result = ValueType();
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, result);
+  }
+}
+
+// Vector-range reduction with a reducer: serial accumulation by the calling
+// thread; no inter-thread combine occurs at vector level.
+template <typename iType, class Lambda, typename ReducerType, typename Member>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Kokkos::is_reducer<ReducerType>::value &&
+    Impl::is_host_thread_team_member<Member>::value>::type
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
+                    loop_boundaries,
+                const Lambda& lambda, const ReducerType& reducer) {
+  reducer.init(reducer.reference());
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, reducer.reference());
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Team-level exclusive scan: first pass accumulates each thread's local
+// total (final==false), team_scan converts it to this thread's exclusive
+// prefix, and the second pass replays the closure with the final prefix
+// (final==true).
+template <typename iType, class Closure, class Member>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Impl::is_host_thread_team_member<Member>::value>::type
+parallel_scan(
+    Impl::TeamThreadRangeBoundariesStruct<iType, Member> const& loop_boundaries,
+    Closure const& closure) {
+  // Extract ValueType from the closure
+
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
+
+  value_type accum = 0;
+
+  // Intra-member scan
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    closure(i, accum, false);
+  }
+
+  // 'accum' output is the exclusive prefix sum
+  accum = loop_boundaries.thread.team_scan(accum);
+
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    closure(i, accum, true);
+  }
+}
+
+// Vector-range scan: single serial pass with final==true (the calling
+// thread owns the whole range, so one pass suffices).
+template <typename iType, class ClosureType, class Member>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Impl::is_host_thread_team_member<Member>::value>::type
+parallel_scan(Impl::ThreadVectorRangeBoundariesStruct<iType, Member> const&
+                  loop_boundaries,
+              ClosureType const& closure) {
+  using value_type = typename Kokkos::Impl::FunctorAnalysis<
+      Impl::FunctorPatternInterface::SCAN, void, ClosureType>::value_type;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    closure(i, scan_val, true);
+  }
+}
+
+// Vector-range scan with reducer-provided init (identity) value.
+template <typename iType, class Lambda, typename ReducerType, typename Member>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Kokkos::is_reducer<ReducerType>::value &&
+    Impl::is_host_thread_team_member<Member>::value>::type
+parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType, Member>&
+                  loop_boundaries,
+              const Lambda& lambda, const ReducerType& reducer) {
+  typename ReducerType::value_type scan_val;
+  reducer.init(scan_val);
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
+       i += loop_boundaries.increment) {
+    lambda(i, scan_val, true);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template <class Member>
+KOKKOS_INLINE_FUNCTION Impl::ThreadSingleStruct<Member> PerTeam(
+    Member const& member,
+    typename std::enable_if<
+        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+  return Impl::ThreadSingleStruct<Member>(member);
+}
+
+template <class Member>
+KOKKOS_INLINE_FUNCTION Impl::VectorSingleStruct<Member> PerThread(
+    Member const& member,
+    typename std::enable_if<
+        Impl::is_thread_team_member<Member>::value>::type const** = nullptr) {
+  return Impl::VectorSingleStruct<Member>(member);
+}
+
+// Execute 'functor' once per team: only rank 0 runs it.  Note that no
+// barrier follows, so other members may race ahead.
+template <class Member, class FunctorType>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Impl::is_host_thread_team_member<Member>::value>::type
+single(const Impl::ThreadSingleStruct<Member>& single,
+       const FunctorType& functor) {
+  // 'single' does not perform a barrier.
+  if (single.team_member.team_rank() == 0) functor();
+}
+
+// Value form: the functor's result 'val' is distributed to all members via
+// team_broadcast from rank 0 (the third argument).
+template <class Member, class FunctorType, typename ValueType>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Impl::is_host_thread_team_member<Member>::value>::type
+single(const Impl::ThreadSingleStruct<Member>& single,
+       const FunctorType& functor, ValueType& val) {
+  single.team_member.team_broadcast(functor, val, 0);
+}
+
+// Per-thread (vector) 'single': every calling thread simply runs the
+// functor -- there is one "vector lane" per thread on host.
+template <class Member, class FunctorType>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Impl::is_host_thread_team_member<Member>::value>::type
+single(const Impl::VectorSingleStruct<Member>&, const FunctorType& functor) {
+  functor();
+}
+
+template <class Member, class FunctorType, typename ValueType>
+KOKKOS_INLINE_FUNCTION typename std::enable_if<
+    Impl::is_host_thread_team_member<Member>::value>::type
+single(const Impl::VectorSingleStruct<Member>&, const FunctorType& functor,
+       ValueType& val) {
+  functor(val);
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP */
diff --git a/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp b/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..683c5c9b18ba5b8c802eebf5cdcc62cac42bf616
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_LIFO.hpp
@@ -0,0 +1,405 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v.
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
// Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_IMPL_LIFO_HPP
+#define KOKKOS_IMPL_LIFO_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_TASKDAG
+
+#include <Kokkos_Core_fwd.hpp>
+
+#include <Kokkos_PointerOwnership.hpp>
+#include <impl/Kokkos_OptionalRef.hpp>
+#include <impl/Kokkos_Error.hpp>  // KOKKOS_EXPECTS
+#include <impl/Kokkos_LinkedListNode.hpp>
+
+#include <Kokkos_Atomic.hpp>  // atomic_compare_exchange, atomic_fence
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Shared state and push logic for the lock-based LIFO queues below.  The
+// head pointer doubles as a state word: EndTag marks an empty queue and
+// LockTag marks a locked (or, for the single-consume queue, consumed) queue.
+template <class T>
+struct LockBasedLIFOCommon {
+  using value_type = T;
+
+  using node_type = SimpleSinglyLinkedListNode<>;
+
+  static constexpr uintptr_t LockTag = ~uintptr_t(0);
+  static constexpr uintptr_t EndTag  = ~uintptr_t(1);
+
+  OwningRawPtr<node_type> m_head = (node_type*)EndTag;
+
+  // CAS-push 'node' onto the head.  Returns false (leaving the node marked
+  // not-enqueued) if the queue is observed locked; otherwise retries the
+  // compare-exchange until it wins.
+  KOKKOS_INLINE_FUNCTION
+  bool _try_push_node(node_type& node) {
+    KOKKOS_EXPECTS(!node.is_enqueued());
+
+    auto* volatile& next = LinkedListNodeAccess::next_ptr(node);
+
+    // store the head of the queue in a local variable
+    auto* old_head = m_head;
+
+    // retry until someone locks the queue or we successfully compare exchange
+    while (old_head != (node_type*)LockTag) {
+      // TODO @tasking @memory_order DSH this should have a memory order and not
+      // a memory fence
+
+      // set task->next to the head of the queue
+      next = old_head;
+
+      // fence to emulate acquire semantics on next and release semantics on
+      // the store of m_head
+      // Do not proceed until 'next' has been stored.
+      Kokkos::memory_fence();
+
+      // store the old head
+      auto* const old_head_tmp = old_head;
+
+      // attempt to swap task with the old head of the queue
+      // as if this were done atomically:
+      //   if(m_head == old_head) { m_head = &node; }
+      //   old_head = m_head;
+      old_head = ::Kokkos::atomic_compare_exchange(&m_head, old_head, &node);
+
+      if (old_head_tmp == old_head) return true;
+    }
+
+    // Failed, replace 'task->m_next' value since 'task' remains
+    // not a member of a queue.
+
+    // TODO @tasking @memory_order DSH this should have a memory order and not a
+    // memory fence
+    LinkedListNodeAccess::mark_as_not_enqueued(node);
+
+    // fence to emulate acquire semantics on next
+    // Do not proceed until 'next' has been stored.
+    ::Kokkos::memory_fence();
+
+    return false;
+  }
+
+  bool _is_empty() const noexcept {
+    // TODO @tasking @memory_order DSH make this an atomic load with memory
+    // order
+    return (volatile node_type*)this->m_head == (node_type*)EndTag;
+  }
+};
+
+//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+
+// Intrusive LIFO: push is a lock-free CAS; pop briefly "locks" the queue by
+// swapping the LockTag sentinel into the head.
+template <class T>
+class LockBasedLIFO : private LockBasedLIFOCommon<T> {
+ private:
+  using base_t    = LockBasedLIFOCommon<T>;
+  using node_type = typename base_t::node_type;
+
+ public:
+  using value_type               = typename base_t::value_type;  // = T
+  using intrusive_node_base_type = SimpleSinglyLinkedListNode<>;
+
+ public:
+  LockBasedLIFO()                     = default;
+  LockBasedLIFO(LockBasedLIFO const&) = delete;
+  LockBasedLIFO(LockBasedLIFO&&)      = delete;
+  LockBasedLIFO& operator=(LockBasedLIFO const&) = delete;
+  LockBasedLIFO& operator=(LockBasedLIFO&&) = delete;
+
+  ~LockBasedLIFO() = default;
+
+  bool empty() const noexcept {
+    // TODO @tasking @memory_order DSH memory order
+    return this->_is_empty();
+  }
+
+  // Pop the most recently pushed node; returns an empty OptionalRef when the
+  // queue is empty (or, if abort_on_locked, when it is observed locked).
+  KOKKOS_INLINE_FUNCTION
+  OptionalRef<T> pop(bool abort_on_locked = false) {
+    // Put this in here to avoid requiring value_type to be
+    // complete until now.
+    static_assert(std::is_base_of<intrusive_node_base_type, value_type>::value,
+                  "Intrusive linked-list value_type must be derived from "
+                  "intrusive_node_base_type");
+
+    // We can't use the static constexpr LockTag directly because
+    // atomic_compare_exchange needs to bind a reference to that, and you
+    // can't do that with static constexpr variables.
+    auto* const lock_tag = (node_type*)base_t::LockTag;
+
+    // TODO @tasking @memory_order DSH shouldn't this be a relaxed atomic load?
+    // start with the return value equal to the head
+    auto* rv = this->m_head;
+
+    // Retry until the lock is acquired or the queue is empty.
+    while (rv != (node_type*)base_t::EndTag) {
+      // The only possible values for the queue are
+      // (1) lock, (2) end, or (3) a valid task.
+      // Thus zero will never appear in the queue.
+      //
+      // If queue is locked then just read by guaranteeing the CAS will fail.
+      KOKKOS_ASSERT(rv != nullptr);
+
+      if (rv == lock_tag) {
+        // TODO @tasking @memory_order DSH this should just be an atomic load
+        // followed by a continue just set rv to nullptr for now, effectively
+        // turning the atomic_compare_exchange below into a load
+        rv = nullptr;
+        if (abort_on_locked) {
+          break;
+        }
+      }
+
+      auto* const old_rv = rv;
+
+      // TODO @tasking @memory_order DSH this should be a weak compare exchange
+      // in a loop
+      rv = Kokkos::atomic_compare_exchange(&(this->m_head), old_rv, lock_tag);
+
+      if (rv == old_rv) {
+        // CAS succeeded and queue is locked
+        //
+        // This thread has locked the queue and removed 'rv' from the queue.
+        // Extract the next entry of the queue from 'rv->m_next'
+        // and mark 'rv' as popped from a queue by setting
+        // 'rv->m_next = nullptr'.
+        //
+        // Place the next entry in the head of the queue,
+        // which also unlocks the queue.
+        //
+        // This thread has exclusive access to
+        // the queue and the popped task's m_next.
+
+        // TODO @tasking @memory_order DSH check whether the volatile is needed
+        // here
+        auto* volatile& next = LinkedListNodeAccess::next_ptr(*rv);  //->m_next;
+
+        // This algorithm is not lockfree because a adversarial scheduler could
+        // context switch this thread at this point and the rest of the threads
+        // calling this method would never make forward progress
+
+        // TODO @tasking @memory_order DSH I think this needs to be a atomic
+        // store release (and the memory fence needs to be removed)
+        // TODO @tasking DSH prove that this doesn't need to be a volatile store
+        // Lock is released here
+        this->m_head = next;
+
+        // Mark rv as popped by assigning nullptr to the next
+        LinkedListNodeAccess::mark_as_not_enqueued(*rv);
+
+        Kokkos::memory_fence();
+
+        return OptionalRef<T>{*static_cast<T*>(rv)};
+      }
+
+      // Otherwise, the CAS got a value that didn't match (either because
+      // another thread locked the queue and we observed the lock tag or because
+      // another thread replaced the head and now we want to try to lock the
+      // queue with that as the popped item. Either way, try again.
+    }
+
+    // Return an empty OptionalRef by calling the default constructor
+    return {};
+  }
+
+  // Non-blocking variant of pop(): gives up instead of spinning when the
+  // queue is observed locked by another thread.
+  KOKKOS_INLINE_FUNCTION
+  OptionalRef<T> steal() {
+    // TODO @tasking @optimization DSH do this with fewer retries
+    return pop(/* abort_on_locked = */ true);
+  }
+
+  // Blocking push: spins on _try_push_node until the CAS succeeds.
+  KOKKOS_INLINE_FUNCTION
+  bool push(node_type& node) {
+    while (!this->_try_push_node(node)) { /* retry until success */
+    }
+    // for consistency with push interface on other queue types:
+    return true;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool push(node_type&& node) {
+    // Just forward to the lvalue version
+    return push(node);
+  }
+};
+
+/** @brief A Multiple Producer, Single Consumer Queue with some special
+ *  semantics
+ *
+ * This multi-producer, single consumer queue has the following semantics:
+ *
+ *   - Any number of threads may call `try_emplace`/`try_push`
+ *       + These operations are lock-free.
+ *   - Exactly one thread calls `consume()`, and the call occurs exactly once
+ *     in the lifetime of the queue.
+ *       + This operation is lock-free (and wait-free w.r.t. producers)
+ *   - Any calls to `try_push` that happen-before the call to
+ *     `consume()` will succeed and return an true, such that the `consume()`
+ *     call will visit that node.
+ *   - Any calls to `try_push` for which the single call to `consume()`
+ *     happens-before those calls will return false and the node given as
+ *     an argument to `try_push` will not be visited by consume()
+ *
+ *
+ * @tparam T The type of items in the queue
+ *
+ */
+template <class T>
+class SingleConsumeOperationLIFO : private LockBasedLIFOCommon<T> {
+ private:
+  using base_t    = LockBasedLIFOCommon<T>;
+  using node_type = typename base_t::node_type;
+
+  // Allows us to reuse the existing infrastructure for
+  // the "consumed" state: the lock sentinel doubles as the consumed marker.
+  static constexpr auto ConsumedTag = base_t::LockTag;
+
+ public:
+  using value_type = typename base_t::value_type;  // = T
+
+  KOKKOS_DEFAULTED_FUNCTION
+  SingleConsumeOperationLIFO() noexcept = default;
+
+  SingleConsumeOperationLIFO(SingleConsumeOperationLIFO const&) = delete;
+  SingleConsumeOperationLIFO(SingleConsumeOperationLIFO&&)      = delete;
+  SingleConsumeOperationLIFO& operator=(SingleConsumeOperationLIFO const&) =
+      delete;
+  SingleConsumeOperationLIFO& operator=(SingleConsumeOperationLIFO&&) = delete;
+
+  KOKKOS_DEFAULTED_FUNCTION
+  ~SingleConsumeOperationLIFO() = default;
+
+  KOKKOS_INLINE_FUNCTION
+  bool empty() const noexcept {
+    // TODO @tasking @memory_order DSH memory order
+    return this->_is_empty();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool is_consumed() const noexcept {
+    // TODO @tasking @memory_order DSH memory order?
+    return this->m_head == (node_type*)ConsumedTag;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool try_push(node_type& node) {
+    return this->_try_push_node(node);
+    // Ensures: (return value is true) || (node.is_enqueued() == false);
+  }
+
+  // One-shot drain: atomically swaps ConsumedTag into the head, then walks
+  // the captured list invoking f(std::move(item)) on each node.  Must be
+  // called at most once per queue (asserted below).
+  template <class Function>
+  KOKKOS_INLINE_FUNCTION void consume(Function&& f) {
+    auto* const consumed_tag = (node_type*)ConsumedTag;
+
+    // Swap the Consumed tag into the head of the queue:
+
+    // (local variable used for assertion only)
+    // TODO @tasking @memory_order DSH this should have memory order release, I
+    // think
+    Kokkos::memory_fence();
+    auto old_head = Kokkos::atomic_exchange(&(this->m_head), consumed_tag);
+
+    // Assert that the queue wasn't consumed before this
+    // This can't be an expects clause because the acquire fence on the read
+    // would be a side-effect
+    KOKKOS_ASSERT(old_head != consumed_tag);
+
+    // We now have exclusive access to the queue; loop over it and call
+    // the user function
+    while (old_head != (node_type*)base_t::EndTag) {
+      // get the Node to make the call with
+      auto* call_arg = old_head;
+
+      // advance the head
+      old_head = LinkedListNodeAccess::next_ptr(*old_head);
+
+      // Mark as popped before proceeding
+      LinkedListNodeAccess::mark_as_not_enqueued(*call_arg);
+
+      // Call the user function
+      auto& arg = *static_cast<T*>(call_arg);
+      f(std::move(arg));
+    }
+  }
+};
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Queue-type bundle selecting the lock-based queues above for the task
+// scheduler.
+struct TaskQueueTraitsLockBased {
+  // TODO @tasking @documentation DSH document what concepts these match
+
+  template <class Task>
+  using ready_queue_type = LockBasedLIFO<Task>;
+
+  template <class Task>
+  using waiting_queue_type = SingleConsumeOperationLIFO<Task>;
+
+  template <class Task>
+  using intrusive_task_base_type =
+      typename ready_queue_type<Task>::intrusive_node_base_type;
+
+  static constexpr auto ready_queue_insertion_may_fail = false;
+};
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* defined KOKKOS_ENABLE_TASKDAG */
+#endif /* #ifndef KOKKOS_IMPL_LIFO_HPP */
diff --git a/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp b/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..79aeca5da0691264c4cb215f62e17bdb8dbe95e1
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
@@ -0,0 +1,184 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 3.0
+// Copyright (2020) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_IMPL_LINKEDLISTNODE_HPP
+#define KOKKOS_IMPL_LINKEDLISTNODE_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_TASKDAG
+
+#include <Kokkos_Core_fwd.hpp>
+
+#include <Kokkos_PointerOwnership.hpp>
+#include <impl/Kokkos_OptionalRef.hpp>
+#include <impl/Kokkos_Error.hpp>  // KOKKOS_EXPECTS
+
+#include <Kokkos_Atomic.hpp>  // atomic_compare_exchange, atomic_fence
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+struct LinkedListNodeAccess;
+
+// Intrusive singly-linked-list node.  A next-pointer equal to
+// NotEnqueuedValue means "not currently in any queue"; all mutation is
+// funneled through the LinkedListNodeAccess attorney so queue internals do
+// not leak into the value type's public interface.
+template <uintptr_t NotEnqueuedValue = 0,
+          template <class> class PointerTemplate = std::add_pointer>
+struct SimpleSinglyLinkedListNode {
+ private:
+  using pointer_type =
+      typename PointerTemplate<SimpleSinglyLinkedListNode>::type;
+
+  pointer_type m_next = reinterpret_cast<pointer_type>(NotEnqueuedValue);
+
+  // These are private because they are an implementation detail of the queue
+  // and should not get added to the value type's interface via the intrusive
+  // wrapper.
+
+  KOKKOS_INLINE_FUNCTION
+  void mark_as_not_enqueued() noexcept {
+    // TODO @tasking @memory_order DSH make this an atomic store with memory
+    // order
+    m_next = (pointer_type)NotEnqueuedValue;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void mark_as_not_enqueued() volatile noexcept {
+    // TODO @tasking @memory_order DSH make this an atomic store with memory
+    // order
+    m_next = (pointer_type)NotEnqueuedValue;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type& _next_ptr() noexcept { return m_next; }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type volatile& _next_ptr() volatile noexcept { return m_next; }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type const& _next_ptr() const noexcept { return m_next; }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type const volatile& _next_ptr() const volatile noexcept {
+    return m_next;
+  }
+
+  friend struct LinkedListNodeAccess;
+
+ public:
+  // KOKKOS_CONSTEXPR_14
+  KOKKOS_INLINE_FUNCTION
+  bool is_enqueued() const noexcept {
+    // TODO @tasking @memory_order DSH make this an atomic load with memory
+    // order
+    return m_next != reinterpret_cast<pointer_type>(NotEnqueuedValue);
+  }
+
+  // KOKKOS_CONSTEXPR_14
+  KOKKOS_INLINE_FUNCTION
+  bool is_enqueued() const volatile noexcept {
+    // TODO @tasking @memory_order DSH make this an atomic load with memory
+    // order
+    return m_next != reinterpret_cast<pointer_type>(NotEnqueuedValue);
+  }
+};
+
+/// Attorney for LinkedListNode, since user types inherit from it
+struct LinkedListNodeAccess {
+  template <class Node>
+  KOKKOS_INLINE_FUNCTION static void mark_as_not_enqueued(Node& node) noexcept {
+    node.mark_as_not_enqueued();
+  }
+
+  template <class Node>
+  KOKKOS_INLINE_FUNCTION static void mark_as_not_enqueued(
+      Node volatile& node) noexcept {
+    node.mark_as_not_enqueued();
+  }
+
+  template <class Node>
+  KOKKOS_INLINE_FUNCTION static typename Node::pointer_type& next_ptr(
+      Node& node) noexcept {
+    return node._next_ptr();
+ } + + template <class Node> + KOKKOS_INLINE_FUNCTION static typename Node::pointer_type& next_ptr( + Node volatile& node) noexcept { + return node._next_ptr(); + } + + template <class Node> + KOKKOS_INLINE_FUNCTION static typename Node::pointer_type& next_ptr( + Node const& node) noexcept { + return node._next_ptr(); + } + + template <class Node> + KOKKOS_INLINE_FUNCTION static typename Node::pointer_type& prev_ptr( + Node& node) noexcept { + return node._prev_ptr(); + } + + template <class Node> + KOKKOS_INLINE_FUNCTION static typename Node::pointer_type& prev_ptr( + Node const& node) noexcept { + return node._prev_ptr(); + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +} // end namespace Impl +} // end namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* defined KOKKOS_ENABLE_TASKDAG */ +#endif /* #ifndef KOKKOS_IMPL_LINKEDLISTNODE_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp b/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..889d821bb1ced124c06f7c41f3604baa9d3bf782 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp @@ -0,0 +1,135 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <impl/Kokkos_Error.hpp> + +#include <ostream> +#include <sstream> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/* Verify size constraints: + * min_block_alloc_size <= max_block_alloc_size + * max_block_alloc_size <= min_superblock_size + * min_superblock_size <= max_superblock_size + * min_superblock_size <= min_total_alloc_size + * min_superblock_size <= min_block_alloc_size * + * max_block_per_superblock + */ +void memory_pool_bounds_verification(size_t min_block_alloc_size, + size_t max_block_alloc_size, + size_t min_superblock_size, + size_t max_superblock_size, + size_t max_block_per_superblock, + size_t min_total_alloc_size) { + const size_t max_superblock = min_block_alloc_size * max_block_per_superblock; + + if ((size_t(max_superblock_size) < min_superblock_size) || + (min_total_alloc_size < min_superblock_size) || + (max_superblock < min_superblock_size) || + (min_superblock_size < max_block_alloc_size) || + (max_block_alloc_size < min_block_alloc_size)) { + std::ostringstream msg; + + msg << "Kokkos::MemoryPool size constraint violation"; + + if (size_t(max_superblock_size) < min_superblock_size) { + msg << " : max_superblock_size(" << max_superblock_size + << ") < min_superblock_size(" << min_superblock_size << ")"; + } + + if (min_total_alloc_size < min_superblock_size) { + msg << " : min_total_alloc_size(" << min_total_alloc_size + << ") < min_superblock_size(" << min_superblock_size << ")"; + } + + if (max_superblock < min_superblock_size) { + msg << " : max_superblock(" << max_superblock + << ") < min_superblock_size(" << min_superblock_size << ")"; + } + + if (min_superblock_size < max_block_alloc_size) { + msg << " : min_superblock_size(" << 
min_superblock_size + << ") < max_block_alloc_size(" << max_block_alloc_size << ")"; + } + + if (max_block_alloc_size < min_block_alloc_size) { + msg << " : max_block_alloc_size(" << max_block_alloc_size + << ") < min_block_alloc_size(" << min_block_alloc_size << ")"; + } + + Kokkos::Impl::throw_runtime_exception(msg.str()); + } +} + +// This has way too many parameters, but it is entirely for moving the iostream +// inclusion out of the header file with as few changes as possible +void _print_memory_pool_state(std::ostream& s, uint32_t const* sb_state_ptr, + int32_t sb_count, uint32_t sb_size_lg2, + uint32_t sb_state_size, uint32_t state_shift, + uint32_t state_used_mask) { + s << "pool_size(" << (size_t(sb_count) << sb_size_lg2) << ")" + << " superblock_size(" << (1LU << sb_size_lg2) << ")" << std::endl; + + for (int32_t i = 0; i < sb_count; ++i, sb_state_ptr += sb_state_size) { + if (*sb_state_ptr) { + const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift; + const uint32_t block_size_lg2 = sb_size_lg2 - block_count_lg2; + const uint32_t block_count = 1u << block_count_lg2; + const uint32_t block_used = (*sb_state_ptr) & state_used_mask; + + s << "Superblock[ " << i << " / " << sb_count << " ] {" + << " block_size(" << (1 << block_size_lg2) << ")" + << " block_count( " << block_used << " / " << block_count << " )" + << std::endl; + } + } +} + +} // namespace Impl +} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp b/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2218405766cacb1215604ff7e7cb749b509be56e --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_MemoryPoolAllocator.hpp @@ -0,0 +1,131 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_MEMORYPOOLALLOCATOR_HPP +#define KOKKOS_IMPL_MEMORYPOOLALLOCATOR_HPP + +#include <Kokkos_Macros.hpp> + +#include <Kokkos_Core_fwd.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +namespace Kokkos { +namespace Impl { + +template <class MemoryPool, class T> +class MemoryPoolAllocator { + public: + using memory_pool = MemoryPool; + + private: + memory_pool m_pool; + + public: + KOKKOS_DEFAULTED_FUNCTION + MemoryPoolAllocator() = default; + KOKKOS_DEFAULTED_FUNCTION + MemoryPoolAllocator(MemoryPoolAllocator const&) = default; + KOKKOS_DEFAULTED_FUNCTION + MemoryPoolAllocator(MemoryPoolAllocator&&) = default; + KOKKOS_DEFAULTED_FUNCTION + MemoryPoolAllocator& operator=(MemoryPoolAllocator const&) = default; + KOKKOS_DEFAULTED_FUNCTION + MemoryPoolAllocator& operator=(MemoryPoolAllocator&&) = default; + KOKKOS_DEFAULTED_FUNCTION + ~MemoryPoolAllocator() = default; + + KOKKOS_INLINE_FUNCTION + explicit MemoryPoolAllocator(memory_pool const& arg_pool) + : m_pool(arg_pool) {} + KOKKOS_INLINE_FUNCTION + explicit MemoryPoolAllocator(memory_pool&& arg_pool) + : m_pool(std::move(arg_pool)) {} + + public: + using value_type = T; + using pointer = T*; + using size_type = typename MemoryPool::memory_space::size_type; + using difference_type = typename std::make_signed<size_type>::type; + + template <class U> + struct rebind { + using other = MemoryPoolAllocator<MemoryPool, U>; + }; + + KOKKOS_INLINE_FUNCTION + pointer allocate(size_t n) { + void* rv = m_pool.allocate(n * sizeof(T)); + if (rv == nullptr) { + Kokkos::abort("Kokkos MemoryPool allocator failed to allocate memory"); + } + return reinterpret_cast<T*>(rv); + } + + KOKKOS_INLINE_FUNCTION + void 
deallocate(T* ptr, size_t n) { m_pool.deallocate(ptr, n * sizeof(T)); } + + KOKKOS_INLINE_FUNCTION + size_type max_size() const { return m_pool.max_block_size(); } + + KOKKOS_INLINE_FUNCTION + bool operator==(MemoryPoolAllocator const& other) const { + return m_pool == other.m_pool; + } + + KOKKOS_INLINE_FUNCTION + bool operator!=(MemoryPoolAllocator const& other) const { + return !(*this == other); + } +}; + +} // end namespace Impl +} // end namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_IMPL_MEMORYPOOLALLOCATOR_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ec2e573c0450c6d81db64334db65102bd59f2ae1 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp @@ -0,0 +1,95 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
/** @file Kokkos_MemorySpace.cpp
 *
 * Operations common to memory space instances, or at least default
 * implementations thereof.
 */

#include <impl/Kokkos_MemorySpace.hpp>

#include <iostream>
#include <string>
#include <sstream>

namespace Kokkos {
namespace Impl {

// Builds a descriptive message for a failed header-prefixed allocation and
// rethrows it as a Kokkos runtime exception.  Lives in this translation
// unit so the header does not need <iostream>/<sstream>.
//
//   space_name : name() of the MemorySpace that failed to allocate
//   label      : user-supplied allocation label
//   failure    : the raw allocation failure being reported
//
// Always throws; never returns normally.
void safe_throw_allocation_with_header_failure(
    std::string const& space_name, std::string const& label,
    Kokkos::Experimental::RawMemoryAllocationFailure const& failure) {
  // The message builder is a lambda so the identical text can target either
  // an ostringstream (normal path) or std::cerr (out-of-memory fallback).
  auto generate_failure_message = [&](std::ostream& o) {
    o << "Kokkos failed to allocate memory for label \"" << label
      << "\". Allocation using MemorySpace named \"" << space_name
      << "\" failed with the following error: ";
    failure.print_error_message(o);
    if (failure.failure_mode() ==
        Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode::
            AllocationNotAligned) {
      // TODO: delete the misaligned memory?
      o << "Warning: Allocation failed due to misalignment; memory may "
           "be leaked.\n";
    }
    o.flush();
  };
  try {
    // Normal path: format into a string, then throw a runtime exception
    // (the exception thrown here is not a bad_alloc, so it propagates past
    // the catch below).
    std::ostringstream sstr;
    generate_failure_message(sstr);
    Kokkos::Impl::throw_runtime_exception(sstr.str());
  } catch (std::bad_alloc const&) {
    // Probably failed to allocate the string because we're so close to out
    // of memory. Try printing to std::cerr instead
    try {
      generate_failure_message(std::cerr);
    } catch (std::bad_alloc const&) {
      // oh well, we tried...
    }
    // Still raise an exception so the caller observes the failure even when
    // the detailed message could not be built.
    Kokkos::Impl::throw_runtime_exception(
        "Kokkos encountered an allocation failure, then another allocation "
        "failure while trying to create the error message.");
  }
}

}  // end namespace Impl
}  // end namespace Kokkos
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/** @file Kokkos_MemorySpace.hpp + * + * Operations common to memory space instances, or at least default + * implementations thereof. 
#ifndef KOKKOS_IMPL_MEMORYSPACE_HPP
#define KOKKOS_IMPL_MEMORYSPACE_HPP

#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>
#include <impl/Kokkos_Error.hpp>

#include <string>

namespace Kokkos {
namespace Impl {

// Defined in implementation file to avoid having to include iostream
void safe_throw_allocation_with_header_failure(
    std::string const &space_name, std::string const &label,
    Kokkos::Experimental::RawMemoryAllocationFailure const &failure);

// Allocates room for a SharedAllocationHeader plus `alloc_size` payload
// bytes from `space`, translating a raw allocation failure into a
// descriptive Kokkos runtime exception.
//
//   space      : memory space instance to allocate from
//   label      : user-facing label recorded with the allocation
//   alloc_size : payload size in bytes (header size is added here)
//
// Returns a pointer to the header at the start of the allocation; on
// failure it throws instead of returning nullptr.
template <class MemorySpace>
SharedAllocationHeader *checked_allocation_with_header(MemorySpace const &space,
                                                       std::string const &label,
                                                       size_t alloc_size) {
  try {
    // NOTE(review): the second argument is the actual byte count requested
    // (payload + header); the third presumably reports the logical payload
    // size for bookkeeping -- confirm against MemorySpace::allocate's
    // contract.
    return reinterpret_cast<SharedAllocationHeader *>(space.allocate(
        label.c_str(), alloc_size + sizeof(SharedAllocationHeader),
        alloc_size));
  } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) {
    // Always throws; control never continues past this call.
    safe_throw_allocation_with_header_failure(space.name(), label, failure);
  }
  return nullptr;  // unreachable
}

}  // end namespace Impl
}  // end namespace Kokkos

#endif  // KOKKOS_IMPL_MEMORYSPACE_HPP
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
#include <Kokkos_Macros.hpp>
// Guard pattern: this header is only usable after Kokkos_Atomic.hpp has
// been included (KOKKOS_ATOMIC_HPP acts as the gate).
#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_MEMORY_FENCE_HPP)
#define KOKKOS_MEMORY_FENCE_HPP
namespace Kokkos {

//----------------------------------------------------------------------------

// Full (load+store) memory fence, dispatched at preprocessing time on the
// active backend / atomic implementation selected at configure time.
KOKKOS_FORCEINLINE_FUNCTION
void memory_fence() {
#if defined(__CUDA_ARCH__)
  // CUDA device code: device-wide fence
  __threadfence();
#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
#pragma omp flush
#elif defined(__HIP_DEVICE_COMPILE__)
  // HIP device code: device-wide fence
  __threadfence();
#elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
  sycl::ONEAPI::atomic_fence(sycl::ONEAPI::memory_order::acq_rel,
                             sycl::ONEAPI::memory_scope::device);
#elif defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
  // x86-64 host with inline assembly enabled: full fence instruction
  asm volatile("mfence" ::: "memory");
#elif defined(KOKKOS_ENABLE_GNU_ATOMICS) || \
    (defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_ENABLE_INTEL_ATOMICS))
  __sync_synchronize();
#elif defined(KOKKOS_ENABLE_INTEL_ATOMICS)
  _mm_mfence();
#elif defined(KOKKOS_ENABLE_OPENMP_ATOMICS)
#pragma omp flush
#elif defined(KOKKOS_ENABLE_WINDOWS_ATOMICS)
  MemoryBarrier();
#elif !defined(KOKKOS_ENABLE_SERIAL_ATOMICS)
// Serial-atomics builds intentionally fall through with no fence; any
// other unmatched configuration is a build error.
#error "Error: memory_fence() not defined"
#endif
}

//////////////////////////////////////////////////////
// store_fence()
//
// If possible use a store fence on the architecture, if not run a full memory
// fence

KOKKOS_FORCEINLINE_FUNCTION
void store_fence() {
#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
  asm volatile("sfence" ::: "memory");
#else
  memory_fence();
#endif
}

//////////////////////////////////////////////////////
// load_fence()
//
// If possible use a load fence on the architecture, if not run a full memory
// fence

KOKKOS_FORCEINLINE_FUNCTION
void load_fence() {
#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64)
  asm volatile("lfence" ::: "memory");
#else
  memory_fence();
#endif
}

}  // namespace Kokkos

#endif
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_MULTIPLETASKQUEUE_HPP +#define KOKKOS_IMPL_MULTIPLETASKQUEUE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_MemoryPool.hpp> + +#include <impl/Kokkos_TaskBase.hpp> +#include <impl/Kokkos_TaskResult.hpp> + +#include <impl/Kokkos_TaskQueueMemoryManager.hpp> +#include <impl/Kokkos_TaskQueueCommon.hpp> +#include <impl/Kokkos_Memory_Fence.hpp> +#include <impl/Kokkos_Atomic_Increment.hpp> +#include <impl/Kokkos_OptionalRef.hpp> +#include <impl/Kokkos_LIFO.hpp> + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +// A *non*-concurrent linked list of tasks that failed to be enqueued +// (We can't reuse the wait queue for this because of the semantics of that +// queue that require it to be popped exactly once, and if a task has failed +// to be enqueued, it has already been marked 
ready) +template <class TaskQueueTraits> +struct FailedQueueInsertionLinkedListSchedulingInfo { + using task_base_type = TaskNode<TaskQueueTraits>; + task_base_type* next = nullptr; +}; + +struct EmptyTaskSchedulingInfo {}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class ExecSpace, class MemorySpace, class TaskQueueTraits, + class MemoryPool> +class MultipleTaskQueue; + +template <class TaskQueueTraits> +struct MultipleTaskQueueTeamEntry { + public: + using task_base_type = TaskNode<TaskQueueTraits>; + using runnable_task_base_type = RunnableTaskBase<TaskQueueTraits>; + using ready_queue_type = + typename TaskQueueTraits::template ready_queue_type<task_base_type>; + using task_queue_traits = TaskQueueTraits; + using task_scheduling_info_type = typename std::conditional< + TaskQueueTraits::ready_queue_insertion_may_fail, + FailedQueueInsertionLinkedListSchedulingInfo<TaskQueueTraits>, + EmptyTaskSchedulingInfo>::type; + + private: + // Number of allowed priorities + static constexpr int NumPriorities = 3; + + ready_queue_type m_ready_queues[NumPriorities][2]; + + task_base_type* m_failed_heads[NumPriorities][2]; + + KOKKOS_INLINE_FUNCTION + task_base_type*& failed_head_for(runnable_task_base_type const& task) { + return m_failed_heads[int(task.get_priority())][int(task.get_task_type())]; + } + + template <class _always_void = void> + KOKKOS_INLINE_FUNCTION OptionalRef<task_base_type> _pop_failed_insertion( + int priority, TaskType type, + typename std::enable_if< + task_queue_traits::ready_queue_insertion_may_fail && + std::is_void<_always_void>::value, + void*>::type = nullptr) { + auto* rv_ptr = m_failed_heads[priority][(int)type]; + if (rv_ptr) { + m_failed_heads[priority][(int)type] = + rv_ptr->as_runnable_task() + .template scheduling_info_as<task_scheduling_info_type>() + .next; + return OptionalRef<task_base_type>{*rv_ptr}; + } 
else { + return OptionalRef<task_base_type>{nullptr}; + } + } + + template <class _always_void = void> + KOKKOS_INLINE_FUNCTION OptionalRef<task_base_type> _pop_failed_insertion( + int /*priority*/, TaskType /*type*/, + typename std::enable_if< + !task_queue_traits::ready_queue_insertion_may_fail && + std::is_void<_always_void>::value, + void*>::type = nullptr) { + return OptionalRef<task_base_type>{nullptr}; + } + + public: + KOKKOS_INLINE_FUNCTION + MultipleTaskQueueTeamEntry() { + for (int iPriority = 0; iPriority < NumPriorities; ++iPriority) { + for (int iType = 0; iType < 2; ++iType) { + m_failed_heads[iPriority][iType] = nullptr; + } + } + } + + KOKKOS_INLINE_FUNCTION + OptionalRef<task_base_type> try_to_steal_ready_task() { + auto return_value = OptionalRef<task_base_type>{}; + // prefer lower priority tasks when stealing + for (int i_priority = NumPriorities - 1; i_priority >= 0; --i_priority) { + // Check for a single task with this priority + return_value = m_ready_queues[i_priority][TaskSingle].steal(); + if (return_value) return return_value; + + // Check for a team task with this priority + return_value = m_ready_queues[i_priority][TaskTeam].steal(); + if (return_value) return return_value; + } + return return_value; + } + + KOKKOS_INLINE_FUNCTION + OptionalRef<task_base_type> pop_ready_task() { + auto return_value = OptionalRef<task_base_type>{}; + for (int i_priority = 0; i_priority < NumPriorities; ++i_priority) { + return_value = _pop_failed_insertion(i_priority, TaskTeam); + if (!return_value) + return_value = m_ready_queues[i_priority][TaskTeam].pop(); + if (return_value) return return_value; + + // Check for a single task with this priority + return_value = _pop_failed_insertion(i_priority, TaskSingle); + if (!return_value) + return_value = m_ready_queues[i_priority][TaskSingle].pop(); + if (return_value) return return_value; + } + return return_value; + } + + KOKKOS_INLINE_FUNCTION + ready_queue_type& team_queue_for(runnable_task_base_type 
const& task) { + return m_ready_queues[int(task.get_priority())][int(task.get_task_type())]; + } + + template <class _always_void = void> + KOKKOS_INLINE_FUNCTION void do_handle_failed_insertion( + runnable_task_base_type&& task, + typename std::enable_if< + task_queue_traits::ready_queue_insertion_may_fail && + std::is_void<_always_void>::value, + void*>::type = nullptr) { + // failed insertions, if they happen, must be from the only thread that + // is allowed to push to m_ready_queues, so this linked-list insertion is + // not concurrent + auto& node = task.template scheduling_info_as<task_scheduling_info_type>(); + auto*& head = failed_head_for(task); + node.next = head; + head = &task; + } + + template <class _always_void = void> + KOKKOS_INLINE_FUNCTION void do_handle_failed_insertion( + runnable_task_base_type&& /*task*/, + typename std::enable_if< + !task_queue_traits::ready_queue_insertion_may_fail && + std::is_void<_always_void>::value, + void*>::type = nullptr) { + Kokkos::abort("should be unreachable!"); + } + + template <class _always_void = void> + KOKKOS_INLINE_FUNCTION void flush_failed_insertions( + int priority, int task_type, + typename std::enable_if< + task_queue_traits::ready_queue_insertion_may_fail && + std::is_void<_always_void>::value, // just to make this dependent + // on template parameter + int>::type = 0) { + // TODO @tasking @minor DSH this somethimes gets some things out of LIFO + // order, which may be undesirable (but not a bug) + + auto*& failed_head = m_failed_heads[priority][task_type]; + auto& team_queue = m_ready_queues[priority][task_type]; + + while (failed_head != nullptr) { + bool success = team_queue.push(*failed_head); + if (success) { + // Step to the next linked list element + failed_head = + failed_head->as_runnable_task() + .template scheduling_info_as<task_scheduling_info_type>() + .next; + } else { + // no more room, stop traversing and leave the head where it is + break; + } + } + } + + template <class 
_always_void = void> + KOKKOS_INLINE_FUNCTION void flush_failed_insertions( + int, int, + typename std::enable_if< + !task_queue_traits::ready_queue_insertion_may_fail && + std::is_void<_always_void>::value, // just to make this dependent + // on template parameter + int>::type = 0) {} + + KOKKOS_INLINE_FUNCTION + void flush_all_failed_insertions() { + for (int iPriority = 0; iPriority < NumPriorities; ++iPriority) { + flush_failed_insertions(iPriority, (int)TaskType::TaskTeam); + flush_failed_insertions(iPriority, (int)TaskType::TaskSingle); + } + } + + template <class TeamSchedulerInfo, class ExecutionSpace, class MemorySpace, + class MemoryPool> + KOKKOS_INLINE_FUNCTION void do_schedule_runnable( + MultipleTaskQueue<ExecutionSpace, MemorySpace, TaskQueueTraits, + MemoryPool>& queue, + RunnableTaskBase<TaskQueueTraits>&& task, TeamSchedulerInfo const& info + + ) { + // Push on any nodes that failed to enqueue + auto& team_queue = team_queue_for(task); + auto priority = task.get_priority(); + auto task_type = task.get_task_type(); + + // First schedule the task + queue.schedule_runnable_to_queue(std::move(task), team_queue, info); + + // Task may be enqueued and may be run at any point; don't touch it (hence + // the use of move semantics) + flush_failed_insertions((int)priority, (int)task_type); + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class ExecSpace, class MemorySpace, class TaskQueueTraits, + class MemoryPool> +class MultipleTaskQueue final + : public TaskQueueMemoryManager<ExecSpace, MemorySpace, MemoryPool>, + public TaskQueueCommonMixin<MultipleTaskQueue< + ExecSpace, MemorySpace, TaskQueueTraits, MemoryPool>>, + private ObjectWithVLAEmulation< + MultipleTaskQueue<ExecSpace, MemorySpace, TaskQueueTraits, + MemoryPool>, + MultipleTaskQueueTeamEntry<TaskQueueTraits>> { + public: + using task_queue_type = 
MultipleTaskQueue; // mark as task_queue concept + using task_queue_traits = TaskQueueTraits; + using task_base_type = TaskNode<TaskQueueTraits>; + using ready_queue_type = + typename TaskQueueTraits::template ready_queue_type<task_base_type>; + + private: + using base_t = TaskQueueMemoryManager<ExecSpace, MemorySpace, MemoryPool>; + using common_mixin_t = TaskQueueCommonMixin<MultipleTaskQueue>; + using vla_emulation_base_t = ObjectWithVLAEmulation< + MultipleTaskQueue<ExecSpace, MemorySpace, TaskQueueTraits, MemoryPool>, + MultipleTaskQueueTeamEntry<TaskQueueTraits>>; + + // Allow private inheritance from ObjectWithVLAEmulation + friend struct VLAEmulationAccess; + + public: + struct SchedulerInfo { + using team_queue_id_t = int32_t; + static constexpr team_queue_id_t NoAssociatedTeam = -1; + team_queue_id_t team_association = NoAssociatedTeam; + + using scheduler_info_type = SchedulerInfo; + + KOKKOS_INLINE_FUNCTION + constexpr explicit SchedulerInfo(team_queue_id_t association) noexcept + : team_association(association) {} + + KOKKOS_DEFAULTED_FUNCTION + SchedulerInfo() = default; + + KOKKOS_DEFAULTED_FUNCTION + SchedulerInfo(SchedulerInfo const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + SchedulerInfo(SchedulerInfo&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + SchedulerInfo& operator=(SchedulerInfo const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + SchedulerInfo& operator=(SchedulerInfo&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + ~SchedulerInfo() = default; + }; + + using task_scheduling_info_type = typename std::conditional< + TaskQueueTraits::ready_queue_insertion_may_fail, + FailedQueueInsertionLinkedListSchedulingInfo<TaskQueueTraits>, + EmptyTaskSchedulingInfo>::type; + using team_scheduler_info_type = SchedulerInfo; + + using runnable_task_base_type = RunnableTaskBase<TaskQueueTraits>; + + template <class Functor, class Scheduler> + // requires TaskScheduler<Scheduler> && TaskFunctor<Functor> + using runnable_task_type = + 
RunnableTask<task_queue_traits, Scheduler, typename Functor::value_type, + Functor>; + + using aggregate_task_type = + AggregateTask<task_queue_traits, task_scheduling_info_type>; + + // Number of allowed priorities + static constexpr int NumPriorities = 3; + + KOKKOS_INLINE_FUNCTION + constexpr typename vla_emulation_base_t::vla_entry_count_type n_queues() const + noexcept { + return this->n_vla_entries(); + } + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="Constructors, destructors, and assignment"> {{{2 + + MultipleTaskQueue() = delete; + MultipleTaskQueue(MultipleTaskQueue const&) = delete; + MultipleTaskQueue(MultipleTaskQueue&&) = delete; + MultipleTaskQueue& operator=(MultipleTaskQueue const&) = delete; + MultipleTaskQueue& operator=(MultipleTaskQueue&&) = delete; + + MultipleTaskQueue(typename base_t::execution_space const& arg_execution_space, + typename base_t::memory_space const&, + typename base_t::memory_pool const& arg_memory_pool) + : base_t(arg_memory_pool), + vla_emulation_base_t( + Impl::TaskQueueSpecialization< + // TODO @tasking @generalization DSH avoid referencing + // SimpleTaskScheduler directly? + SimpleTaskScheduler<typename base_t::execution_space, + MultipleTaskQueue>>:: + get_max_team_count(arg_execution_space)) {} + + // </editor-fold> end Constructors, destructors, and assignment }}}2 + //---------------------------------------------------------------------------- + + KOKKOS_FUNCTION + void schedule_runnable(runnable_task_base_type&& task, + team_scheduler_info_type const& info) { + auto team_association = info.team_association; + // Should only not be assigned if this is a host spawn... 
+ if (team_association == team_scheduler_info_type::NoAssociatedTeam) { + team_association = 0; + } + this->vla_value_at(team_association) + .do_schedule_runnable(*this, std::move(task), info); + // Task may be enqueued and may be run at any point; don't touch it (hence + // the use of move semantics) + } + + KOKKOS_FUNCTION + OptionalRef<task_base_type> pop_ready_task( + team_scheduler_info_type const& info) { + KOKKOS_EXPECTS(info.team_association != + team_scheduler_info_type::NoAssociatedTeam); + + auto return_value = OptionalRef<task_base_type>{}; + auto team_association = info.team_association; + + // always loop in order of priority first, then prefer team tasks over + // single tasks + auto& team_queue_info = this->vla_value_at(team_association); + + if (task_queue_traits::ready_queue_insertion_may_fail) { + team_queue_info.flush_all_failed_insertions(); + } + + return_value = team_queue_info.pop_ready_task(); + + if (!return_value) { + // loop through the rest of the teams and try to steal + for (auto isteal = (team_association + 1) % this->n_queues(); + isteal != team_association; + isteal = (isteal + 1) % this->n_queues()) { + return_value = this->vla_value_at(isteal).try_to_steal_ready_task(); + if (return_value) { + break; + } + } + + // Note that this is where we'd update the task's scheduling info + } + // if nothing was found, return a default-constructed (empty) OptionalRef + return return_value; + } + + // TODO @tasking @generalization DSH make this a property-based customization + // point + KOKKOS_INLINE_FUNCTION + team_scheduler_info_type initial_team_scheduler_info(int rank_in_league) const + noexcept { + return team_scheduler_info_type{ + typename team_scheduler_info_type::team_queue_id_t(rank_in_league % + n_queues())}; + } + + // TODO @tasking @generalization DSH make this a property-based customization + // point + static /* KOKKOS_CONSTEXPR_14 */ size_t task_queue_allocation_size( + typename base_t::execution_space const& exec_space, + 
typename base_t::memory_space const&, + typename base_t::memory_pool const&) { + using specialization = Impl::TaskQueueSpecialization< + // TODO @tasking @generalization DSH avoid referencing + // SimpleTaskScheduler directly? + SimpleTaskScheduler<typename base_t::execution_space, + MultipleTaskQueue>>; + + return vla_emulation_base_t::required_allocation_size( + /* num_vla_entries = */ specialization::get_max_team_count(exec_space)); + } + + // Provide a sensible default that can be overridden + KOKKOS_INLINE_FUNCTION + void update_scheduling_info_from_completed_predecessor( + runnable_task_base_type& /*ready_task*/, + runnable_task_base_type const& /*predecessor*/) const { + // Do nothing; we're using the extra storage for the failure linked list + } + + // Provide a sensible default that can be overridden + KOKKOS_INLINE_FUNCTION + void update_scheduling_info_from_completed_predecessor( + aggregate_task_type& /*aggregate*/, + runnable_task_base_type const& /*predecessor*/) const { + // Do nothing; we're using the extra storage for the failure linked list + } + + // Provide a sensible default that can be overridden + KOKKOS_INLINE_FUNCTION + void update_scheduling_info_from_completed_predecessor( + aggregate_task_type& /*aggregate*/, + aggregate_task_type const& /*predecessor*/) const { + // Do nothing; we're using the extra storage for the failure linked list + } + + // Provide a sensible default that can be overridden + KOKKOS_INLINE_FUNCTION + void update_scheduling_info_from_completed_predecessor( + runnable_task_base_type& /*ready_task*/, + aggregate_task_type const& /*predecessor*/) const { + // Do nothing; we're using the extra storage for the failure linked list + } + + KOKKOS_INLINE_FUNCTION + void handle_failed_ready_queue_insertion( + runnable_task_base_type&& task, ready_queue_type&, + team_scheduler_info_type const& info) { + KOKKOS_EXPECTS(info.team_association != + team_scheduler_info_type::NoAssociatedTeam); + + 
this->vla_value_at(info.team_association) + .do_handle_failed_insertion(std::move(task)); + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_MULTIPLETASKQUEUE_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp b/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e53afe436daff997726be8cb0c880887c32de1a4 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_NumericTraits.cpp @@ -0,0 +1,73 @@ +#include <Kokkos_NumericTraits.hpp> + +// NOTE These out-of class definitions are only required with C++14. Since +// C++17, a static data member declared constrexpr is impllictly inline. + +#if !defined(KOKKOS_ENABLE_CXX17) +namespace Kokkos { +namespace Experimental { +namespace Impl { +#define OUT_OF_CLASS_DEFINTION_FLOATING_POINT(TRAIT) \ + constexpr float TRAIT##_helper<float>::value; \ + constexpr double TRAIT##_helper<double>::value; \ + constexpr long double TRAIT##_helper<long double>::value + +#define OUT_OF_CLASS_DEFINTION_INTEGRAL(TRAIT) \ + constexpr bool TRAIT##_helper<bool>::value; \ + constexpr char TRAIT##_helper<char>::value; \ + constexpr signed char TRAIT##_helper<signed char>::value; \ + constexpr unsigned char TRAIT##_helper<unsigned char>::value; \ + constexpr short TRAIT##_helper<short>::value; \ + constexpr unsigned short TRAIT##_helper<unsigned short>::value; \ + constexpr int TRAIT##_helper<int>::value; \ + constexpr unsigned int TRAIT##_helper<unsigned int>::value; \ + constexpr long int TRAIT##_helper<long int>::value; \ + constexpr unsigned long int TRAIT##_helper<unsigned long int>::value; \ + constexpr long long int TRAIT##_helper<long long int>::value; \ + constexpr unsigned long long int 
TRAIT##_helper<unsigned long long int>::value + +#define OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(TRAIT) \ + constexpr int TRAIT##_helper<float>::value; \ + constexpr int TRAIT##_helper<double>::value; \ + constexpr int TRAIT##_helper<long double>::value + +#define OUT_OF_CLASS_DEFINTION_INTEGRAL_2(TRAIT) \ + constexpr int TRAIT##_helper<bool>::value; \ + constexpr int TRAIT##_helper<char>::value; \ + constexpr int TRAIT##_helper<signed char>::value; \ + constexpr int TRAIT##_helper<unsigned char>::value; \ + constexpr int TRAIT##_helper<short>::value; \ + constexpr int TRAIT##_helper<unsigned short>::value; \ + constexpr int TRAIT##_helper<int>::value; \ + constexpr int TRAIT##_helper<unsigned int>::value; \ + constexpr int TRAIT##_helper<long int>::value; \ + constexpr int TRAIT##_helper<unsigned long int>::value; \ + constexpr int TRAIT##_helper<long long int>::value; \ + constexpr int TRAIT##_helper<unsigned long long int>::value + +OUT_OF_CLASS_DEFINTION_FLOATING_POINT(infinity); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT(epsilon); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT(round_error); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT(norm_min); + +OUT_OF_CLASS_DEFINTION_INTEGRAL(finite_min); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT(finite_min); +OUT_OF_CLASS_DEFINTION_INTEGRAL(finite_max); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT(finite_max); + +OUT_OF_CLASS_DEFINTION_INTEGRAL_2(digits); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(digits); +OUT_OF_CLASS_DEFINTION_INTEGRAL_2(digits10); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(digits10); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(max_digits10); +OUT_OF_CLASS_DEFINTION_INTEGRAL_2(radix); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(radix); + +OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(min_exponent); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(min_exponent10); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(max_exponent); +OUT_OF_CLASS_DEFINTION_FLOATING_POINT_2(max_exponent10); +} // namespace Impl +} // namespace Experimental +} // namespace Kokkos 
+#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_OptionalRef.hpp b/packages/kokkos/core/src/impl/Kokkos_OptionalRef.hpp new file mode 100644 index 0000000000000000000000000000000000000000..12f6c9f5fdb42e8383f3c9b174ea17c28ff04fe7 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_OptionalRef.hpp @@ -0,0 +1,238 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_OPTIONALREF_HPP +#define KOKKOS_IMPL_OPTIONALREF_HPP + +#include <Kokkos_Macros.hpp> + +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_PointerOwnership.hpp> +#include <impl/Kokkos_Error.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +namespace Kokkos { +namespace Impl { + +struct InPlaceTag {}; + +template <class T> +struct OptionalRef { + private: + ObservingRawPtr<T> m_value = nullptr; + + public: + using value_type = T; + + KOKKOS_DEFAULTED_FUNCTION + OptionalRef() = default; + + KOKKOS_DEFAULTED_FUNCTION + OptionalRef(OptionalRef const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + OptionalRef(OptionalRef&&) = default; + + KOKKOS_INLINE_FUNCTION + // MSVC requires that this copy constructor is not defaulted + // if there exists a (non-defaulted) volatile one. 
+ OptionalRef& operator=(OptionalRef const& other) noexcept { + m_value = other.m_value; + return *this; + } + + KOKKOS_INLINE_FUNCTION + // Can't return a reference to volatile OptionalRef, since GCC issues a + // warning about reference to volatile not accessing the underlying value + void operator=(OptionalRef const volatile& other) volatile noexcept { + m_value = other.m_value; + } + + KOKKOS_DEFAULTED_FUNCTION + OptionalRef& operator=(OptionalRef&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + ~OptionalRef() = default; + + KOKKOS_INLINE_FUNCTION + explicit OptionalRef(T& arg_value) : m_value(&arg_value) {} + + KOKKOS_INLINE_FUNCTION + explicit OptionalRef(std::nullptr_t) : m_value(nullptr) {} + + KOKKOS_INLINE_FUNCTION + OptionalRef& operator=(T& arg_value) { + m_value = &arg_value; + return *this; + } + + KOKKOS_INLINE_FUNCTION + OptionalRef& operator=(std::nullptr_t) { + m_value = nullptr; + return *this; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + OptionalRef<typename std::add_volatile<T>::type> + as_volatile() volatile noexcept { + return OptionalRef<typename std::add_volatile<T>::type>(*(*this)); + } + + KOKKOS_INLINE_FUNCTION + OptionalRef< + typename std::add_volatile<typename std::add_const<T>::type>::type> + as_volatile() const volatile noexcept { + return OptionalRef< + typename std::add_volatile<typename std::add_const<T>::type>::type>( + *(*this)); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + T& operator*() & { + KOKKOS_EXPECTS(this->has_value()); + return *m_value; + } + + KOKKOS_INLINE_FUNCTION + T const& operator*() const& { + KOKKOS_EXPECTS(this->has_value()); + return *m_value; + } + + KOKKOS_INLINE_FUNCTION + T volatile& operator*() volatile& { + KOKKOS_EXPECTS(this->has_value()); + return *m_value; + } + + KOKKOS_INLINE_FUNCTION + T const volatile& operator*() const volatile& { + KOKKOS_EXPECTS(this->has_value()); + return *m_value; + } + + KOKKOS_INLINE_FUNCTION + T&& 
operator*() && { + KOKKOS_EXPECTS(this->has_value()); + return std::move(*m_value); + } + + KOKKOS_INLINE_FUNCTION + T* operator->() { + KOKKOS_EXPECTS(this->has_value()); + return m_value; + } + + KOKKOS_INLINE_FUNCTION + T const* operator->() const { + KOKKOS_EXPECTS(this->has_value()); + return m_value; + } + + KOKKOS_INLINE_FUNCTION + T volatile* operator->() volatile { + KOKKOS_EXPECTS(this->has_value()); + return m_value; + } + + KOKKOS_INLINE_FUNCTION + T const volatile* operator->() const volatile { + KOKKOS_EXPECTS(this->has_value()); + return m_value; + } + + KOKKOS_INLINE_FUNCTION + T* get() { return m_value; } + + KOKKOS_INLINE_FUNCTION + T const* get() const { return m_value; } + + KOKKOS_INLINE_FUNCTION + T volatile* get() volatile { return m_value; } + + KOKKOS_INLINE_FUNCTION + T const volatile* get() const volatile { return m_value; } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + operator bool() { return m_value != nullptr; } + + KOKKOS_INLINE_FUNCTION + operator bool() const { return m_value != nullptr; } + + KOKKOS_INLINE_FUNCTION + operator bool() volatile { return m_value != nullptr; } + + KOKKOS_INLINE_FUNCTION + operator bool() const volatile { return m_value != nullptr; } + + KOKKOS_INLINE_FUNCTION + bool has_value() { return m_value != nullptr; } + + KOKKOS_INLINE_FUNCTION + bool has_value() const { return m_value != nullptr; } + + KOKKOS_INLINE_FUNCTION + bool has_value() volatile { return m_value != nullptr; } + + KOKKOS_INLINE_FUNCTION + bool has_value() const volatile { return m_value != nullptr; } +}; + +} // end namespace Impl +} // end namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_IMPL_OPTIONALREF_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp b/packages/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp new file mode 
100644 index 0000000000000000000000000000000000000000..bc0a7df3e03969a37f70c71ce5497fd0f864fb2d --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp @@ -0,0 +1,77 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP +#define KOKKOS_PHYSICAL_LAYOUT_HPP + +#include <Kokkos_View.hpp> + +namespace Kokkos { +namespace Impl { + +struct PhysicalLayout { + enum LayoutType { Left, Right, Scalar, Error }; + LayoutType layout_type; + int rank; + long long int stride[9]; // distance between two neighboring elements in a + // given dimension + + template <class T, class L, class D, class M> + PhysicalLayout(const View<T, L, D, M>& view) + : layout_type( + is_same<typename View<T, L, D, M>::array_layout, LayoutLeft>::value + ? Left + : (is_same<typename View<T, L, D, M>::array_layout, + LayoutRight>::value + ? Right + : Error)), + rank(view.Rank) { + for (int i = 0; i < 9; i++) stride[i] = 0; + view.stride(stride); + } +}; + +} // namespace Impl +} // namespace Kokkos +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..94ea6e1a2b10c33a81e4f2c6b7a932577ce6144b --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp @@ -0,0 +1,1119 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Tuners.hpp> +#include <impl/Kokkos_Profiling.hpp> +#if defined(KOKKOS_ENABLE_LIBDL) +#include <dlfcn.h> +#endif + +#include <algorithm> +#include <array> +#include <cstring> +#include <iostream> +#include <stack> +#include <unordered_map> +#include <unordered_set> +#include <vector> +namespace Kokkos { + +namespace Tools { + +namespace Experimental { + +namespace Impl { +void tool_invoked_fence(const uint32_t /* devID */) { + /** + * Currently the function ignores the device ID, + * Eventually we want to support fencing only + * a given stream/resource + */ + Kokkos::fence(); +} +} // namespace Impl +#ifdef KOKKOS_ENABLE_TUNING +static size_t kernel_name_context_variable_id; +static size_t kernel_type_context_variable_id; +static std::unordered_map<size_t, std::unordered_set<size_t>> + features_per_context; +static std::unordered_set<size_t> active_features; +static std::unordered_map<size_t, VariableValue> feature_values; +static std::unordered_map<size_t, VariableInfo> variable_metadata; +#endif +static EventSet current_callbacks; +static EventSet backup_callbacks; +static EventSet no_profiling; +static Kokkos::Tools::Experimental::ToolSettings tool_requirements; +bool eventSetsEqual(const EventSet& l, const EventSet& r) { + return l.init == r.init && l.finalize == r.finalize && + l.parse_args == r.parse_args && l.print_help == r.print_help && + l.begin_parallel_for == r.begin_parallel_for && + l.end_parallel_for == r.end_parallel_for && + l.begin_parallel_reduce == r.begin_parallel_reduce && + l.end_parallel_reduce == r.end_parallel_reduce && + l.begin_parallel_scan == r.begin_parallel_scan && + l.end_parallel_scan == r.end_parallel_scan && + l.push_region == r.push_region && l.pop_region == r.pop_region && + l.allocate_data == r.allocate_data && + l.deallocate_data == r.deallocate_data && + 
l.create_profile_section == r.create_profile_section && + l.start_profile_section == r.start_profile_section && + l.stop_profile_section == r.stop_profile_section && + l.destroy_profile_section == r.destroy_profile_section && + l.profile_event == r.profile_event && + l.begin_deep_copy == r.begin_deep_copy && + l.end_deep_copy == r.end_deep_copy && l.begin_fence == r.begin_fence && + l.end_fence == r.end_fence && l.sync_dual_view == r.sync_dual_view && + l.modify_dual_view == r.modify_dual_view && + l.declare_metadata == r.declare_metadata && + l.request_tool_settings == r.request_tool_settings && + l.provide_tool_programming_interface == + r.provide_tool_programming_interface && + l.declare_input_type == r.declare_input_type && + l.declare_output_type == r.declare_output_type && + l.end_tuning_context == r.end_tuning_context && + l.begin_tuning_context == r.begin_tuning_context && + l.request_output_values == r.request_output_values && + l.declare_optimization_goal == r.declare_optimization_goal; +} +enum class MayRequireGlobalFencing : bool { No, Yes }; +template <typename Callback, typename... Args> +inline void invoke_kokkosp_callback( + MayRequireGlobalFencing may_require_global_fencing, + const Callback& callback, Args&&... 
args) { + if (callback != nullptr) { + // two clause if statement + // may_require_global_fencing: "if this callback ever needs a fence", AND + // if the tool requires global fencing (default true, but tools can + // overwrite) + if (may_require_global_fencing == MayRequireGlobalFencing::Yes && + (Kokkos::Tools::Experimental::tool_requirements + .requires_global_fencing)) { + Kokkos::fence(); + } + (*callback)(std::forward<Args>(args)...); + } +} +} // namespace Experimental +bool profileLibraryLoaded() { + return !Experimental::eventSetsEqual(Experimental::current_callbacks, + Experimental::no_profiling); +} + +void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, + uint64_t* kernelID) { + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::Yes, + Experimental::current_callbacks.begin_parallel_for, kernelPrefix.c_str(), + devID, kernelID); +#ifdef KOKKOS_ENABLE_TUNING + if (Kokkos::tune_internals()) { + auto context_id = Experimental::get_new_context_id(); + Experimental::begin_context(context_id); + Experimental::VariableValue contextValues[] = { + Experimental::make_variable_value( + Experimental::kernel_name_context_variable_id, kernelPrefix), + Experimental::make_variable_value( + Experimental::kernel_type_context_variable_id, "parallel_for")}; + Experimental::set_input_values(context_id, 2, contextValues); + } +#endif +} + +void endParallelFor(const uint64_t kernelID) { + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::Yes, + Experimental::current_callbacks.end_parallel_for, kernelID); +#ifdef KOKKOS_ENABLE_TUNING + if (Kokkos::tune_internals()) { + Experimental::end_context(Experimental::get_current_context_id()); + } +#endif +} + +void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, + uint64_t* kernelID) { + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::Yes, + Experimental::current_callbacks.begin_parallel_scan, 
kernelPrefix.c_str(), + devID, kernelID); +#ifdef KOKKOS_ENABLE_TUNING + if (Kokkos::tune_internals()) { + auto context_id = Experimental::get_new_context_id(); + Experimental::begin_context(context_id); + Experimental::VariableValue contextValues[] = { + Experimental::make_variable_value( + Experimental::kernel_name_context_variable_id, kernelPrefix), + Experimental::make_variable_value( + Experimental::kernel_type_context_variable_id, "parallel_for")}; + Experimental::set_input_values(context_id, 2, contextValues); + } +#endif +} + +void endParallelScan(const uint64_t kernelID) { + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::Yes, + Experimental::current_callbacks.end_parallel_scan, kernelID); +#ifdef KOKKOS_ENABLE_TUNING + if (Kokkos::tune_internals()) { + Experimental::end_context(Experimental::get_current_context_id()); + } +#endif +} + +void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, + uint64_t* kernelID) { + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::Yes, + Experimental::current_callbacks.begin_parallel_reduce, + kernelPrefix.c_str(), devID, kernelID); +#ifdef KOKKOS_ENABLE_TUNING + if (Kokkos::tune_internals()) { + auto context_id = Experimental::get_new_context_id(); + Experimental::begin_context(context_id); + Experimental::VariableValue contextValues[] = { + Experimental::make_variable_value( + Experimental::kernel_name_context_variable_id, kernelPrefix), + Experimental::make_variable_value( + Experimental::kernel_type_context_variable_id, "parallel_for")}; + Experimental::set_input_values(context_id, 2, contextValues); + } +#endif +} + +void endParallelReduce(const uint64_t kernelID) { + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::Yes, + Experimental::current_callbacks.end_parallel_reduce, kernelID); +#ifdef KOKKOS_ENABLE_TUNING + if (Kokkos::tune_internals()) { + 
Experimental::end_context(Experimental::get_current_context_id());
  }
#endif
}

// Notify tools that a named region has been pushed onto the region stack.
void pushRegion(const std::string& kName) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::Yes,
      Experimental::current_callbacks.push_region, kName.c_str());
}

void popRegion() {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::Yes,
      Experimental::current_callbacks.pop_region);
}

// Report an allocation to the tool (no global fence requested here).
void allocateData(const SpaceHandle space, const std::string label,
                  const void* ptr, const uint64_t size) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.allocate_data, space, label.c_str(), ptr,
      size);
}

void deallocateData(const SpaceHandle space, const std::string label,
                    const void* ptr, const uint64_t size) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.deallocate_data, space, label.c_str(),
      ptr, size);
}

// Report the start of a deep copy; with tuning on, also opens a tuning
// context tagged as a "deep_copy" kernel.
void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label,
                   const void* dst_ptr, const SpaceHandle src_space,
                   const std::string src_label, const void* src_ptr,
                   const uint64_t size) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.begin_deep_copy, dst_space,
      dst_label.c_str(), dst_ptr, src_space, src_label.c_str(), src_ptr, size);
#ifdef KOKKOS_ENABLE_TUNING
  // only open a tuning context when a tool actually subscribed to deep copies
  if (Experimental::current_callbacks.begin_deep_copy != nullptr) {
    if (Kokkos::tune_internals()) {
      auto context_id = Experimental::get_new_context_id();
      Experimental::begin_context(context_id);
      Experimental::VariableValue contextValues[] = {
          Experimental::make_variable_value(
              Experimental::kernel_name_context_variable_id,
              "deep_copy_kernel"),
          Experimental::make_variable_value(
              Experimental::kernel_type_context_variable_id, "deep_copy")};
      Experimental::set_input_values(context_id, 2, contextValues);
    }
  }
#endif
}

void endDeepCopy() {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.end_deep_copy);
#ifdef KOKKOS_ENABLE_TUNING
  if (Experimental::current_callbacks.end_deep_copy != nullptr) {
    if (Kokkos::tune_internals()) {
      Experimental::end_context(Experimental::get_current_context_id());
    }
  }
#endif
}

void beginFence(const std::string name, const uint32_t deviceId,
                uint64_t* handle) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.begin_fence, name.c_str(), deviceId,
      handle);
}

void endFence(const uint64_t handle) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.end_fence, handle);
}

// Named profile sections: created once, then started/stopped/destroyed by id.
void createProfileSection(const std::string& sectionName, uint32_t* secID) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.create_profile_section,
      sectionName.c_str(), secID);
}

void startSection(const uint32_t secID) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.start_profile_section, secID);
}

void stopSection(const uint32_t secID) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.stop_profile_section, secID);
}

void destroyProfileSection(const uint32_t secID) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.destroy_profile_section, secID);
}

void markEvent(const std::string& eventName) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.profile_event, eventName.c_str());
}

// Forward the first whitespace-delimited token of `args` to the tool's help
// hook; returns false when no tool provides one.
bool printHelp(const std::string& args) {
  if (Experimental::current_callbacks.print_help == nullptr) {
    return false;
  }
  std::string arg0 = args.substr(0, args.find_first_of(' '));
  const char* carg0 = arg0.c_str();
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.print_help, const_cast<char*>(carg0));
  return true;
}

void parseArgs(int _argc, char** _argv) {
  if (Experimental::current_callbacks.parse_args != nullptr && _argc > 0) {
    Experimental::invoke_kokkosp_callback(
        Experimental::MayRequireGlobalFencing::No,
        Experimental::current_callbacks.parse_args, _argc, _argv);
  }
}

// Tokenize `args` on spaces/tabs and hand the tool an argv-style deep copy
// (freed again after the call in the continuation below).
void parseArgs(const std::string& args) {
  if (Experimental::current_callbacks.parse_args == nullptr) {
    return;
  }
  using strvec_t = std::vector<std::string>;
  auto tokenize = [](const std::string& line, const std::string& delimiters) {
    strvec_t _result{};
    std::size_t _bidx = 0;  // position that is the beginning of the new string
    std::size_t _didx = 0;  // position of the delimiter in the string
    while (_bidx < line.length() && _didx < line.length()) {
      // find the first character (starting at _didx) that is not a delimiter
      _bidx = line.find_first_not_of(delimiters, _didx);
      // if no more non-delimiter chars, done
      if (_bidx == std::string::npos) break;
      // starting at the position of the new string, find the next delimiter
      _didx = line.find_first_of(delimiters, _bidx);
      // starting at the position of the new string, get the characters
      // between this position and the next delimiter
      std::string _tmp = line.substr(_bidx, _didx - _bidx);
      // don't add empty strings
      if (!_tmp.empty()) _result.emplace_back(_tmp);
    }
    return _result;
  };
  auto vargs = tokenize(args, " \t");
  if (vargs.size() == 0) return;
  auto _argc = static_cast<int>(vargs.size());
  char** _argv = new char*[_argc + 1];
  _argv[vargs.size()] = nullptr;
  for (int i = 0; i < _argc; ++i) {
    auto& _str = vargs.at(i);
    _argv[i] = new
char[_str.length() + 1]; + std::memcpy(_argv[i], _str.c_str(), _str.length() * sizeof(char)); + _argv[i][_str.length()] = '\0'; + } + parseArgs(_argc, _argv); + for (int i = 0; i < _argc; ++i) { + delete[] _argv[i]; + } + delete[] _argv; +} + +SpaceHandle make_space_handle(const char* space_name) { + SpaceHandle handle; + strncpy(handle.name, space_name, 63); + return handle; +} + +template <typename Callback> +void lookup_function(void* dlopen_handle, const std::string& basename, + Callback& callback) { +#ifdef KOKKOS_ENABLE_LIBDL + // dlsym returns a pointer to an object, while we want to assign to + // pointer to function A direct cast will give warnings hence, we have to + // workaround the issue by casting pointer to pointers. + void* p = dlsym(dlopen_handle, basename.c_str()); + callback = *reinterpret_cast<Callback*>(&p); +#endif +} + +void initialize(const std::string& profileLibrary) { + // Make sure initialize calls happens only once + static int is_initialized = 0; + if (is_initialized) return; + is_initialized = 1; + +#ifdef KOKKOS_ENABLE_LIBDL + void* firstProfileLibrary = nullptr; + + if (profileLibrary.empty()) return; + + char* envProfileLibrary = const_cast<char*>(profileLibrary.c_str()); + + char* envProfileCopy = + (char*)malloc(sizeof(char) * (strlen(envProfileLibrary) + 1)); + sprintf(envProfileCopy, "%s", envProfileLibrary); + + char* profileLibraryName = strtok(envProfileCopy, ";"); + + if ((profileLibraryName != nullptr) && + (strcmp(profileLibraryName, "") != 0)) { + firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL); + + if (firstProfileLibrary == nullptr) { + std::cerr << "Error: Unable to load KokkosP library: " + << profileLibraryName << std::endl; + std::cerr << "dlopen(" << profileLibraryName + << ", RTLD_NOW | RTLD_GLOBAL) failed with " << dlerror() + << '\n'; + } else { +#ifdef KOKKOS_ENABLE_PROFILING_LOAD_PRINT + std::cout << "KokkosP: Library Loaded: " << profileLibraryName + << std::endl; +#endif + 
lookup_function(
          firstProfileLibrary, "kokkosp_begin_parallel_scan",
          Kokkos::Tools::Experimental::current_callbacks.begin_parallel_scan);
      lookup_function(
          firstProfileLibrary, "kokkosp_begin_parallel_for",
          Kokkos::Tools::Experimental::current_callbacks.begin_parallel_for);
      lookup_function(
          firstProfileLibrary, "kokkosp_begin_parallel_reduce",
          Kokkos::Tools::Experimental::current_callbacks.begin_parallel_reduce);
      lookup_function(
          firstProfileLibrary, "kokkosp_end_parallel_scan",
          Kokkos::Tools::Experimental::current_callbacks.end_parallel_scan);
      lookup_function(
          firstProfileLibrary, "kokkosp_end_parallel_for",
          Kokkos::Tools::Experimental::current_callbacks.end_parallel_for);
      lookup_function(
          firstProfileLibrary, "kokkosp_end_parallel_reduce",
          Kokkos::Tools::Experimental::current_callbacks.end_parallel_reduce);

      lookup_function(firstProfileLibrary, "kokkosp_init_library",
                      Kokkos::Tools::Experimental::current_callbacks.init);
      lookup_function(firstProfileLibrary, "kokkosp_finalize_library",
                      Kokkos::Tools::Experimental::current_callbacks.finalize);

      lookup_function(
          firstProfileLibrary, "kokkosp_push_profile_region",
          Kokkos::Tools::Experimental::current_callbacks.push_region);
      lookup_function(
          firstProfileLibrary, "kokkosp_pop_profile_region",
          Kokkos::Tools::Experimental::current_callbacks.pop_region);
      lookup_function(
          firstProfileLibrary, "kokkosp_allocate_data",
          Kokkos::Tools::Experimental::current_callbacks.allocate_data);
      lookup_function(
          firstProfileLibrary, "kokkosp_deallocate_data",
          Kokkos::Tools::Experimental::current_callbacks.deallocate_data);

      lookup_function(
          firstProfileLibrary, "kokkosp_begin_deep_copy",
          Kokkos::Tools::Experimental::current_callbacks.begin_deep_copy);
      lookup_function(
          firstProfileLibrary, "kokkosp_end_deep_copy",
          Kokkos::Tools::Experimental::current_callbacks.end_deep_copy);
      lookup_function(
          firstProfileLibrary, "kokkosp_begin_fence",
          Kokkos::Tools::Experimental::current_callbacks.begin_fence);
      lookup_function(firstProfileLibrary, "kokkosp_end_fence",
                      Kokkos::Tools::Experimental::current_callbacks.end_fence);
      lookup_function(
          firstProfileLibrary, "kokkosp_dual_view_sync",
          Kokkos::Tools::Experimental::current_callbacks.sync_dual_view);
      lookup_function(
          firstProfileLibrary, "kokkosp_dual_view_modify",
          Kokkos::Tools::Experimental::current_callbacks.modify_dual_view);

      lookup_function(
          firstProfileLibrary, "kokkosp_declare_metadata",
          Kokkos::Tools::Experimental::current_callbacks.declare_metadata);
      lookup_function(firstProfileLibrary, "kokkosp_create_profile_section",
                      Kokkos::Tools::Experimental::current_callbacks
                          .create_profile_section);
      lookup_function(
          firstProfileLibrary, "kokkosp_start_profile_section",
          Kokkos::Tools::Experimental::current_callbacks.start_profile_section);
      lookup_function(
          firstProfileLibrary, "kokkosp_stop_profile_section",
          Kokkos::Tools::Experimental::current_callbacks.stop_profile_section);
      lookup_function(firstProfileLibrary, "kokkosp_destroy_profile_section",
                      Kokkos::Tools::Experimental::current_callbacks
                          .destroy_profile_section);

      lookup_function(
          firstProfileLibrary, "kokkosp_profile_event",
          Kokkos::Tools::Experimental::current_callbacks.profile_event);
#ifdef KOKKOS_ENABLE_TUNING
      // tuning hooks are only bound when tuning support is compiled in
      lookup_function(
          firstProfileLibrary, "kokkosp_declare_output_type",
          Kokkos::Tools::Experimental::current_callbacks.declare_output_type);

      lookup_function(
          firstProfileLibrary, "kokkosp_declare_input_type",
          Kokkos::Tools::Experimental::current_callbacks.declare_input_type);
      lookup_function(
          firstProfileLibrary, "kokkosp_request_values",
          Kokkos::Tools::Experimental::current_callbacks.request_output_values);
      lookup_function(
          firstProfileLibrary, "kokkosp_end_context",
          Kokkos::Tools::Experimental::current_callbacks.end_tuning_context);
      lookup_function(
          firstProfileLibrary, "kokkosp_begin_context",
          Kokkos::Tools::Experimental::current_callbacks.begin_tuning_context);
      lookup_function(firstProfileLibrary, "kokkosp_declare_optimization_goal",
                      Kokkos::Tools::Experimental::current_callbacks
                          .declare_optimization_goal);
#endif  // KOKKOS_ENABLE_TUNING

      lookup_function(
          firstProfileLibrary, "kokkosp_print_help",
          Kokkos::Tools::Experimental::current_callbacks.print_help);
      lookup_function(
          firstProfileLibrary, "kokkosp_parse_args",
          Kokkos::Tools::Experimental::current_callbacks.parse_args);
      lookup_function(firstProfileLibrary,
                      "kokkosp_provide_tool_programming_interface",
                      Kokkos::Tools::Experimental::current_callbacks
                          .provide_tool_programming_interface);
      lookup_function(
          firstProfileLibrary, "kokkosp_request_tool_settings",
          Kokkos::Tools::Experimental::current_callbacks.request_tool_settings);
    }
  }
#else
  (void)profileLibrary;
#endif  // KOKKOS_ENABLE_LIBDL
  // tell the tool which interface version it is being driven with
  Experimental::invoke_kokkosp_callback(
      Kokkos::Tools::Experimental::MayRequireGlobalFencing::No,
      Kokkos::Tools::Experimental::current_callbacks.init, 0,
      (uint64_t)KOKKOSP_INTERFACE_VERSION, (uint32_t)0, nullptr);

  // default to global fencing; the tool may relax this in the settings query
  Experimental::tool_requirements.requires_global_fencing = true;

  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.request_tool_settings, 1,
      &Experimental::tool_requirements);

  // hand the tool a way to request fences itself
  Experimental::ToolProgrammingInterface actions;
  actions.fence = &Experimental::Impl::tool_invoked_fence;

  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.provide_tool_programming_interface, 1,
      actions);

#ifdef KOKKOS_ENABLE_TUNING
  // declare the built-in context variables: kernel name (unbounded string)
  // and kernel type (string drawn from a fixed candidate set)
  Experimental::VariableInfo kernel_name;
  kernel_name.type = Experimental::ValueType::kokkos_value_string;
  kernel_name.category =
      Experimental::StatisticalCategory::kokkos_value_categorical;
  kernel_name.valueQuantity =
      Experimental::CandidateValueType::kokkos_value_unbounded;

  std::array<std::string, 4> candidate_values = {
      "parallel_for",
      "parallel_reduce",
      "parallel_scan",
      "parallel_copy",
  };

  Experimental::SetOrRange kernel_type_variable_candidates =
      Experimental::make_candidate_set(4, candidate_values.data());

  Experimental::kernel_name_context_variable_id =
      Experimental::declare_input_type("kokkos.kernel_name", kernel_name);

  Experimental::VariableInfo kernel_type;
  kernel_type.type = Experimental::ValueType::kokkos_value_string;
  kernel_type.category =
      Experimental::StatisticalCategory::kokkos_value_categorical;
  kernel_type.valueQuantity =
      Experimental::CandidateValueType::kokkos_value_set;
  kernel_type.candidates = kernel_type_variable_candidates;
  Experimental::kernel_type_context_variable_id =
      Experimental::declare_input_type("kokkos.kernel_type", kernel_type);

#endif

  // reset the "no tool" sentinel so eventSetsEqual() compares against an
  // all-null event set
  Experimental::no_profiling.init = nullptr;
  Experimental::no_profiling.finalize = nullptr;

  Experimental::no_profiling.begin_parallel_for = nullptr;
  Experimental::no_profiling.begin_parallel_scan = nullptr;
  Experimental::no_profiling.begin_parallel_reduce = nullptr;
  Experimental::no_profiling.end_parallel_scan = nullptr;
  Experimental::no_profiling.end_parallel_for = nullptr;
  Experimental::no_profiling.end_parallel_reduce = nullptr;

  Experimental::no_profiling.push_region = nullptr;
  Experimental::no_profiling.pop_region = nullptr;
  Experimental::no_profiling.allocate_data = nullptr;
  Experimental::no_profiling.deallocate_data = nullptr;

  Experimental::no_profiling.begin_deep_copy = nullptr;
  Experimental::no_profiling.end_deep_copy = nullptr;

  Experimental::no_profiling.create_profile_section = nullptr;
  Experimental::no_profiling.start_profile_section = nullptr;
  Experimental::no_profiling.stop_profile_section = nullptr;
  Experimental::no_profiling.destroy_profile_section = nullptr;

  Experimental::no_profiling.profile_event = nullptr;

  Experimental::no_profiling.declare_input_type = nullptr;
  Experimental::no_profiling.declare_output_type = nullptr;
  Experimental::no_profiling.request_output_values = nullptr;
  Experimental::no_profiling.end_tuning_context = nullptr;
#ifdef KOKKOS_ENABLE_LIBDL
  free(envProfileCopy);
#endif
}

// Invoke the tool's finalize hook (once) and then detach all callbacks.
void finalize() {
  // Make sure finalize calls happens only once
  static int is_finalized = 0;
  if (is_finalized) return;
  is_finalized = 1;

  if (Experimental::current_callbacks.finalize != nullptr) {
    Experimental::invoke_kokkosp_callback(
        Experimental::MayRequireGlobalFencing::No,
        Experimental::current_callbacks.finalize);

    Experimental::pause_tools();
  }
#ifdef KOKKOS_ENABLE_TUNING
  // clean up string candidate set
  for (auto& metadata_pair : Experimental::variable_metadata) {
    auto metadata = metadata_pair.second;
    if ((metadata.type == Experimental::ValueType::kokkos_value_string) &&
        (metadata.valueQuantity ==
         Experimental::CandidateValueType::kokkos_value_set)) {
      auto candidate_set = metadata.candidates.set;
      delete[] candidate_set.values.string_value;
    }
  }
#endif
}

void syncDualView(const std::string& label, const void* const ptr,
                  bool to_device) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.sync_dual_view, label.c_str(), ptr,
      to_device);
}
void modifyDualView(const std::string& label, const void* const ptr,
                    bool on_device) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.modify_dual_view, label.c_str(), ptr,
      on_device);
}

void declareMetadata(const std::string& key, const std::string& value) {
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.declare_metadata, key.c_str(),
      value.c_str());
}

}  // namespace Tools

namespace Tools {
namespace Experimental {
// Setters used by tools/tests to install individual callbacks directly.
void set_init_callback(initFunction callback) {
  current_callbacks.init =
callback;
}
void set_finalize_callback(finalizeFunction callback) {
  current_callbacks.finalize = callback;
}
void set_parse_args_callback(parseArgsFunction callback) {
  current_callbacks.parse_args = callback;
}
void set_print_help_callback(printHelpFunction callback) {
  current_callbacks.print_help = callback;
}
void set_begin_parallel_for_callback(beginFunction callback) {
  current_callbacks.begin_parallel_for = callback;
}
void set_end_parallel_for_callback(endFunction callback) {
  current_callbacks.end_parallel_for = callback;
}
void set_begin_parallel_reduce_callback(beginFunction callback) {
  current_callbacks.begin_parallel_reduce = callback;
}
void set_end_parallel_reduce_callback(endFunction callback) {
  current_callbacks.end_parallel_reduce = callback;
}
void set_begin_parallel_scan_callback(beginFunction callback) {
  current_callbacks.begin_parallel_scan = callback;
}
void set_end_parallel_scan_callback(endFunction callback) {
  current_callbacks.end_parallel_scan = callback;
}
void set_push_region_callback(pushFunction callback) {
  current_callbacks.push_region = callback;
}
void set_pop_region_callback(popFunction callback) {
  current_callbacks.pop_region = callback;
}
void set_allocate_data_callback(allocateDataFunction callback) {
  current_callbacks.allocate_data = callback;
}
void set_deallocate_data_callback(deallocateDataFunction callback) {
  current_callbacks.deallocate_data = callback;
}
void set_create_profile_section_callback(
    createProfileSectionFunction callback) {
  current_callbacks.create_profile_section = callback;
}
void set_start_profile_section_callback(startProfileSectionFunction callback) {
  current_callbacks.start_profile_section = callback;
}
void set_stop_profile_section_callback(stopProfileSectionFunction callback) {
  current_callbacks.stop_profile_section = callback;
}
void set_destroy_profile_section_callback(
    destroyProfileSectionFunction callback) {
  current_callbacks.destroy_profile_section = callback;
}
void set_profile_event_callback(profileEventFunction callback) {
  current_callbacks.profile_event = callback;
}
void set_begin_deep_copy_callback(beginDeepCopyFunction callback) {
  current_callbacks.begin_deep_copy = callback;
}
void set_end_deep_copy_callback(endDeepCopyFunction callback) {
  current_callbacks.end_deep_copy = callback;
}
void set_begin_fence_callback(beginFenceFunction callback) {
  current_callbacks.begin_fence = callback;
}
void set_end_fence_callback(endFenceFunction callback) {
  current_callbacks.end_fence = callback;
}

void set_dual_view_sync_callback(dualViewSyncFunction callback) {
  current_callbacks.sync_dual_view = callback;
}
void set_dual_view_modify_callback(dualViewModifyFunction callback) {
  current_callbacks.modify_dual_view = callback;
}
void set_declare_metadata_callback(declareMetadataFunction callback) {
  current_callbacks.declare_metadata = callback;
}

// Tuning-related callback setters.
void set_declare_output_type_callback(outputTypeDeclarationFunction callback) {
  current_callbacks.declare_output_type = callback;
}
void set_declare_input_type_callback(inputTypeDeclarationFunction callback) {
  current_callbacks.declare_input_type = callback;
}
void set_request_output_values_callback(requestValueFunction callback) {
  current_callbacks.request_output_values = callback;
}
void set_end_context_callback(contextEndFunction callback) {
  current_callbacks.end_tuning_context = callback;
}
void set_begin_context_callback(contextBeginFunction callback) {
  current_callbacks.begin_tuning_context = callback;
}
void set_declare_optimization_goal_callback(
    optimizationGoalDeclarationFunction callback) {
  current_callbacks.declare_optimization_goal = callback;
}

// Swap the active callbacks for the all-null set, saving the current ones so
// resume_tools() can restore them.
void pause_tools() {
  backup_callbacks = current_callbacks;
  current_callbacks = no_profiling;
}

void resume_tools() { current_callbacks = backup_callbacks; }

EventSet get_callbacks() { return
current_callbacks; }
void set_callbacks(EventSet new_events) { current_callbacks = new_events; }
}  // namespace Experimental
}  // namespace Tools

// Kokkos::Profiling shims: thin forwarders to the Kokkos::Tools
// implementations above.
namespace Profiling {
bool profileLibraryLoaded() { return Kokkos::Tools::profileLibraryLoaded(); }

void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID,
                      uint64_t* kernelID) {
  Kokkos::Tools::beginParallelFor(kernelPrefix, devID, kernelID);
}
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID,
                         uint64_t* kernelID) {
  Kokkos::Tools::beginParallelReduce(kernelPrefix, devID, kernelID);
}
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID,
                       uint64_t* kernelID) {
  Kokkos::Tools::beginParallelScan(kernelPrefix, devID, kernelID);
}
void endParallelFor(const uint64_t kernelID) {
  Kokkos::Tools::endParallelFor(kernelID);
}
void endParallelReduce(const uint64_t kernelID) {
  Kokkos::Tools::endParallelReduce(kernelID);
}
void endParallelScan(const uint64_t kernelID) {
  Kokkos::Tools::endParallelScan(kernelID);
}

void pushRegion(const std::string& kName) { Kokkos::Tools::pushRegion(kName); }
void popRegion() { Kokkos::Tools::popRegion(); }

void createProfileSection(const std::string& sectionName, uint32_t* secID) {
  Kokkos::Tools::createProfileSection(sectionName, secID);
}
void destroyProfileSection(const uint32_t secID) {
  Kokkos::Tools::destroyProfileSection(secID);
}

void startSection(const uint32_t secID) { Kokkos::Tools::startSection(secID); }

void stopSection(const uint32_t secID) { Kokkos::Tools::stopSection(secID); }

void markEvent(const std::string& eventName) {
  Kokkos::Tools::markEvent(eventName);
}
void allocateData(const SpaceHandle handle, const std::string name,
                  const void* data, const uint64_t size) {
  Kokkos::Tools::allocateData(handle, name, data, size);
}
void deallocateData(const SpaceHandle space, const std::string label,
                    const void* ptr, const uint64_t size) {
  Kokkos::Tools::deallocateData(space, label, ptr, size);
}

void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label,
                   const void* dst_ptr, const SpaceHandle src_space,
                   const std::string src_label, const void* src_ptr,
                   const uint64_t size) {
  Kokkos::Tools::beginDeepCopy(dst_space, dst_label, dst_ptr, src_space,
                               src_label, src_ptr, size);
}
void endDeepCopy() { Kokkos::Tools::endDeepCopy(); }

void finalize() { Kokkos::Tools::finalize(); }
void initialize(const std::string& profileLibrary) {
  Kokkos::Tools::initialize(profileLibrary);
}

bool printHelp(const std::string& args) {
  return Kokkos::Tools::printHelp(args);
}
void parseArgs(const std::string& args) { Kokkos::Tools::parseArgs(args); }
void parseArgs(int _argc, char** _argv) {
  Kokkos::Tools::parseArgs(_argc, _argv);
}

SpaceHandle make_space_handle(const char* space_name) {
  return Kokkos::Tools::make_space_handle(space_name);
}
}  // namespace Profiling

}  // namespace Kokkos

// Tuning

namespace Kokkos {
namespace Tools {
namespace Experimental {
// Monotonic counter backing tuning context ids.
static size_t& get_context_counter() {
  static size_t x;
  return x;
}
// NOTE: unlike the context counter, this one increments on every access;
// get_new_variable_id() relies on that side effect.
static size_t& get_variable_counter() {
  static size_t x;
  return ++x;
}

size_t get_new_context_id() { return ++get_context_counter(); }
size_t get_current_context_id() { return get_context_counter(); }
void decrement_current_context_id() { --get_context_counter(); }
size_t get_new_variable_id() { return get_variable_counter(); }

// Register an output (tunable) variable with the tool and remember its
// metadata for later value requests.
size_t declare_output_type(const std::string& variableName, VariableInfo info) {
  size_t variableId = get_new_variable_id();
#ifdef KOKKOS_ENABLE_TUNING
  Experimental::invoke_kokkosp_callback(
      Experimental::MayRequireGlobalFencing::No,
      Experimental::current_callbacks.declare_output_type, variableName.c_str(),
      variableId, &info);
  variable_metadata[variableId] = info;
#else
  (void)variableName;
  (void)info;
#endif
  return variableId;
}

size_t declare_input_type(const
std::string& variableName, VariableInfo info) { + size_t variableId = get_new_variable_id(); +#ifdef KOKKOS_ENABLE_TUNING + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::No, + Experimental::current_callbacks.declare_input_type, variableName.c_str(), + variableId, &info); + variable_metadata[variableId] = info; +#else + (void)variableName; + (void)info; +#endif + return variableId; +} + +void set_input_values(size_t contextId, size_t count, VariableValue* values) { +#ifdef KOKKOS_ENABLE_TUNING + if (features_per_context.find(contextId) == features_per_context.end()) { + features_per_context[contextId] = std::unordered_set<size_t>(); + } + for (size_t x = 0; x < count; ++x) { + values[x].metadata = &variable_metadata[values[x].type_id]; + features_per_context[contextId].insert(values[x].type_id); + active_features.insert(values[x].type_id); + feature_values[values[x].type_id] = values[x]; + } +#else + (void)contextId; + (void)count; + (void)values; +#endif +} +#include <iostream> +void request_output_values(size_t contextId, size_t count, + VariableValue* values) { +#ifdef KOKKOS_ENABLE_TUNING + std::vector<size_t> context_ids; + std::vector<VariableValue> context_values; + for (auto id : active_features) { + context_values.push_back(feature_values[id]); + } + if (Experimental::current_callbacks.request_output_values != nullptr) { + for (size_t x = 0; x < count; ++x) { + values[x].metadata = &variable_metadata[values[x].type_id]; + } + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::No, + Experimental::current_callbacks.request_output_values, contextId, + context_values.size(), context_values.data(), count, values); + } +#else + (void)contextId; + (void)count; + (void)values; +#endif +} + +#ifdef KOKKOS_ENABLE_TUNING +static std::unordered_map<size_t, size_t> optimization_goals; +#endif + +void begin_context(size_t contextId) { + Experimental::invoke_kokkosp_callback( + 
Experimental::MayRequireGlobalFencing::No, + Experimental::current_callbacks.begin_tuning_context, contextId); +} +void end_context(size_t contextId) { +#ifdef KOKKOS_ENABLE_TUNING + for (auto id : features_per_context[contextId]) { + active_features.erase(id); + } + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::No, + Experimental::current_callbacks.end_tuning_context, contextId, + feature_values[optimization_goals[contextId]]); + optimization_goals.erase(contextId); + decrement_current_context_id(); +#else + (void)contextId; +#endif +} + +bool have_tuning_tool() { +#ifdef KOKKOS_ENABLE_TUNING + return (Experimental::current_callbacks.request_output_values != nullptr); +#else + return false; +#endif +} + +VariableValue make_variable_value(size_t id, int64_t val) { + VariableValue variable_value; + variable_value.type_id = id; + variable_value.value.int_value = val; + return variable_value; +} +VariableValue make_variable_value(size_t id, double val) { + VariableValue variable_value; + variable_value.type_id = id; + variable_value.value.double_value = val; + return variable_value; +} +VariableValue make_variable_value(size_t id, const std::string& val) { + VariableValue variable_value; + variable_value.type_id = id; + strncpy(variable_value.value.string_value, val.c_str(), + KOKKOS_TOOLS_TUNING_STRING_LENGTH - 1); + return variable_value; +} +SetOrRange make_candidate_set(size_t size, std::string* data) { + SetOrRange value_set; + value_set.set.values.string_value = new TuningString[size]; + for (size_t x = 0; x < size; ++x) { + strncpy(value_set.set.values.string_value[x], data[x].c_str(), + KOKKOS_TOOLS_TUNING_STRING_LENGTH - 1); + } + value_set.set.size = size; + return value_set; +} +SetOrRange make_candidate_set(size_t size, int64_t* data) { + SetOrRange value_set; + value_set.set.size = size; + value_set.set.values.int_value = data; + return value_set; +} +SetOrRange make_candidate_set(size_t size, double* data) { + SetOrRange 
value_set; + value_set.set.size = size; + value_set.set.values.double_value = data; + return value_set; +} +SetOrRange make_candidate_range(double lower, double upper, double step, + bool openLower = false, + bool openUpper = false) { + SetOrRange value_range; + value_range.range.lower.double_value = lower; + value_range.range.upper.double_value = upper; + value_range.range.step.double_value = step; + value_range.range.openLower = openLower; + value_range.range.openUpper = openUpper; + return value_range; +} + +SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t step, + bool openLower = false, + bool openUpper = false) { + SetOrRange value_range; + value_range.range.lower.int_value = lower; + value_range.range.upper.int_value = upper; + value_range.range.step.int_value = step; + value_range.range.openLower = openLower; + value_range.range.openUpper = openUpper; + return value_range; +} + +size_t get_new_context_id(); +size_t get_current_context_id(); +void decrement_current_context_id(); +size_t get_new_variable_id(); +void declare_optimization_goal(const size_t context, + const OptimizationGoal& goal) { +#ifdef KOKKOS_ENABLE_TUNING + Experimental::invoke_kokkosp_callback( + Experimental::MayRequireGlobalFencing::No, + Experimental::current_callbacks.declare_optimization_goal, context, goal); + optimization_goals[context] = goal.type_id; +#else + (void)context; + (void)goal; +#endif +} +} // end namespace Experimental +} // end namespace Tools + +} // end namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1ff6a36c3bc3c934e787af30c5bd6568046f15f1 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp @@ -0,0 +1,721 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_IMPL_KOKKOS_PROFILING_HPP
#define KOKKOS_IMPL_KOKKOS_PROFILING_HPP

#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Macros.hpp>
#include <Kokkos_Tuners.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <map>
#include <string>
#include <type_traits>
namespace Kokkos {

// forward declaration
bool tune_internals() noexcept;

namespace Tools {

bool profileLibraryLoaded();

// Kernel-dispatch hooks: each begin* assigns a kernel id through kernelID
// which the matching end* takes back.
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID,
                      uint64_t* kernelID);
void endParallelFor(const uint64_t kernelID);
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID,
                       uint64_t* kernelID);
void endParallelScan(const uint64_t kernelID);
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID,
                         uint64_t* kernelID);
void endParallelReduce(const uint64_t kernelID);

// User-visible profiling regions (push/pop nest like a stack).
void pushRegion(const std::string& kName);
void popRegion();

// Named profiling sections: created once, started/stopped repeatedly.
void createProfileSection(const std::string& sectionName, uint32_t* secID);
void startSection(const uint32_t secID);
void stopSection(const uint32_t secID);
void destroyProfileSection(const uint32_t secID);

void markEvent(const std::string& evName);

// Allocation / deep-copy / fence notifications forwarded to the tool.
void allocateData(const SpaceHandle space, const std::string label,
                  const void* ptr, const uint64_t size);
void deallocateData(const SpaceHandle space, const std::string label,
                    const void* ptr, const uint64_t size);

void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label,
                   const void* dst_ptr, const SpaceHandle src_space,
                   const std::string src_label, const void* src_ptr,
                   const uint64_t size);
void endDeepCopy();
void beginFence(const std::string name, const uint32_t deviceId,
                uint64_t* handle);
void endFence(const uint64_t handle);

/**
 * syncDualView declares to the tool that a given DualView
 * has been synced.
 *
 * Arguments:
 *
 * label: name of the View within the DualView
 * ptr: that View's data ptr
 * to_device: true if the data is being synchronized to the device
 * false otherwise
 */
void syncDualView(const std::string& label, const void* const ptr,
                  bool to_device);
/**
 * modifyDualView declares to the tool that a given DualView
 * has been modified. Note: this means that somebody *called*
 * modify on the DualView, this doesn't get called any time
 * somebody touches the data
 *
 * Arguments:
 *
 * label: name of the View within the DualView
 * ptr: that View's data ptr
 * on_device: true if the data is being modified on the device
 * false otherwise
 */
void modifyDualView(const std::string& label, const void* const ptr,
                    bool on_device);

void declareMetadata(const std::string& key, const std::string& value);
void initialize(const std::string& = {});
void finalize();
bool printHelp(const std::string&);
void parseArgs(const std::string&);

Kokkos_Profiling_SpaceHandle make_space_handle(const char* space_name);

namespace Experimental {

// Setters installing individual kokkosp callbacks into the active event set.
void set_init_callback(initFunction callback);
void set_finalize_callback(finalizeFunction callback);
void set_parse_args_callback(parseArgsFunction callback);
void set_print_help_callback(printHelpFunction callback);
void set_begin_parallel_for_callback(beginFunction callback);
void set_end_parallel_for_callback(endFunction callback);
void set_begin_parallel_reduce_callback(beginFunction callback);
void set_end_parallel_reduce_callback(endFunction callback);
void set_begin_parallel_scan_callback(beginFunction callback);
void set_end_parallel_scan_callback(endFunction callback);
void set_push_region_callback(pushFunction callback);
void set_pop_region_callback(popFunction callback);
void set_allocate_data_callback(allocateDataFunction callback);
void set_deallocate_data_callback(deallocateDataFunction callback);
void set_create_profile_section_callback(createProfileSectionFunction callback);
void set_start_profile_section_callback(startProfileSectionFunction callback);
void set_stop_profile_section_callback(stopProfileSectionFunction callback);
void set_destroy_profile_section_callback(
    destroyProfileSectionFunction callback);
void set_profile_event_callback(profileEventFunction callback);
void set_begin_deep_copy_callback(beginDeepCopyFunction callback);
void set_end_deep_copy_callback(endDeepCopyFunction callback);
void set_begin_fence_callback(beginFenceFunction callback);
void set_end_fence_callback(endFenceFunction callback);
void set_dual_view_sync_callback(dualViewSyncFunction callback);
void set_dual_view_modify_callback(dualViewModifyFunction callback);
void set_declare_metadata_callback(declareMetadataFunction callback);

// Tuning-interface callbacks.
void set_declare_output_type_callback(outputTypeDeclarationFunction callback);
void set_declare_input_type_callback(inputTypeDeclarationFunction callback);
void set_request_output_values_callback(requestValueFunction callback);
void set_declare_optimization_goal_callback(
    optimizationGoalDeclarationFunction callback);
void set_end_context_callback(contextEndFunction callback);
void set_begin_context_callback(contextBeginFunction callback);

void pause_tools();
void resume_tools();

EventSet get_callbacks();
void set_callbacks(EventSet new_events);
} // namespace Experimental

namespace Experimental {
// forward declarations
size_t get_new_context_id();
size_t get_current_context_id();
} // namespace Experimental

namespace Impl {

// NOTE(review): file-scope `static` maps in a header give every translation
// unit its own tuner instances — presumably intentional; verify.
static std::map<std::string, Kokkos::Tools::Experimental::TeamSizeTuner>
    team_tuners;

template <int Rank>
using MDRangeTuningMap =
    std::map<std::string, Kokkos::Tools::Experimental::MDRangeTuner<Rank>>;

template <int Rank>
static MDRangeTuningMap<Rank> mdrange_tuners;

// For any policies without a tuning implementation, with a reducer
template <class ReducerType, class ExecPolicy, class Functor, typename TagType>
void tune_policy(const size_t, const std::string&, ExecPolicy&, const Functor&,
                 TagType) {}

// For any policies without a tuning implementation, without a reducer
template <class ExecPolicy, class Functor, typename TagType>
void tune_policy(const size_t, const std::string&, ExecPolicy&, const Functor&,
                 const TagType&) {}

/**
 * Tuning for parallel_fors and parallel_scans is a fairly simple process.
 *
 * Tuning for a parallel_reduce turns out to be a little more complicated.
 *
 * If you're tuning a reducer, it might be a complex or a simple reducer
 * (an example of simple would be one where the join is just "+".
 *
 * Unfortunately these two paths are very different in terms of which classes
 * get instantiated. Thankfully, all of this complexity is encoded in the
 * ReducerType. If it's a "simple" reducer, this will be Kokkos::InvalidType,
 * otherwise it'll be something else.
 *
 * If the type is complex, for the code to be generally right you _must_
 * pass an instance of that ReducerType to functions that determine
 * eligible team sizes. If the type is simple, you can't construct one,
 * you use the simpler 2-arg formulation of team_size_recommended/max.
 */

namespace Impl {

// Size calculator used for reducer-less (or InvalidType-reducer) dispatch:
// queries the policy/driver directly with the 2-argument overloads.
struct SimpleTeamSizeCalculator {
  template <typename Policy, typename Functor, typename Tag>
  int get_max_team_size(const Policy& policy, const Functor& functor,
                        const Tag tag) {
    auto max = policy.team_size_max(functor, tag);
    return max;
  }
  template <typename Policy, typename Functor, typename Tag>
  int get_recommended_team_size(const Policy& policy, const Functor& functor,
                                const Tag tag) {
    auto max = policy.team_size_recommended(functor, tag);
    return max;
  }
  template <typename Policy, typename Functor>
  int get_mdrange_max_tile_size_product(const Policy& policy,
                                        const Functor& functor,
                                        const Kokkos::ParallelForTag&) {
    using exec_space = typename Policy::execution_space;
    using driver = Kokkos::Impl::ParallelFor<Functor, Policy, exec_space>;
    return driver::max_tile_size_product(policy, functor);
  }
  template <typename Policy, typename Functor>
  int get_mdrange_max_tile_size_product(const Policy& policy,
                                        const Functor& functor,
                                        const Kokkos::ParallelReduceTag&) {
    using exec_space = typename Policy::execution_space;
    using driver =
        Kokkos::Impl::ParallelReduce<Functor, Policy, Kokkos::InvalidType,
                                     exec_space>;
    return driver::max_tile_size_product(policy, functor);
  }
};

// when we have a complex reducer, we need to pass an
// instance to team_size_recommended/max. Reducers
// aren't default constructible, but they are
// constructible from a reference to an
// instance of their value_type so we construct
// a value_type and temporary reducer here
template <typename ReducerType>
struct ComplexReducerSizeCalculator {
  template <typename Policy, typename Functor, typename Tag>
  int get_max_team_size(const Policy& policy, const Functor& functor,
                        const Tag tag) {
    using value_type = typename ReducerType::value_type;
    value_type value;
    ReducerType reducer_example = ReducerType(value);
    return policy.team_size_max(functor, reducer_example, tag);
  }
  template <typename Policy, typename Functor, typename Tag>
  int get_recommended_team_size(const Policy& policy, const Functor& functor,
                                const Tag tag) {
    using value_type = typename ReducerType::value_type;
    value_type value;
    ReducerType reducer_example = ReducerType(value);
    return policy.team_size_recommended(functor, reducer_example, tag);
  }
  template <typename Policy, typename Functor>
  int get_mdrange_max_tile_size_product(const Policy& policy,
                                        const Functor& functor,
                                        const Kokkos::ParallelReduceTag&) {
    using exec_space = typename Policy::execution_space;
    using driver =
        Kokkos::Impl::ParallelReduce<Functor, Policy, ReducerType, exec_space>;
    return driver::max_tile_size_product(policy, functor);
  }
};

} // namespace Impl

// Looks up (or lazily creates) the tuner registered under the kernel label
// and lets it adjust the policy; `should_tune` gates on whether the policy
// actually left anything to Kokkos::AUTO. Simple-reducer variant.
template <class Tuner, class Functor, class TagType,
          class TuningPermissionFunctor, class Map, class Policy>
void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy,
                         const Functor& functor, const TagType& tag,
                         const TuningPermissionFunctor& should_tune) {
  if (should_tune(policy)) {
    std::string label = label_in;
    if (label_in.empty()) {
      // Empty label: fall back to the generated kernel construct name.
      using policy_type =
          typename std::remove_reference<decltype(policy)>::type;
      using work_tag = typename policy_type::work_tag;
      Kokkos::Impl::ParallelConstructName<Functor, work_tag> name(label);
      label = name.get();
    }
    auto tuner_iter = [&]() {
      auto my_tuner = map.find(label);
      if (my_tuner == map.end()) {
        return (map.emplace(label, Tuner(label, policy, functor, tag,
                                         Impl::SimpleTeamSizeCalculator{}))
                    .first);
      }
      return my_tuner;
    }();
    tuner_iter->second.tune(policy);
  }
}
// Same as above, but constructs the tuner with a ComplexReducerSizeCalculator
// so team-size queries can pass a ReducerType instance.
template <class Tuner, class ReducerType, class Functor, class TagType,
          class TuningPermissionFunctor, class Map, class Policy>
void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy,
                         const Functor& functor, const TagType& tag,
                         const TuningPermissionFunctor& should_tune) {
  if (should_tune(policy)) {
    std::string label = label_in;
    if (label_in.empty()) {
      using policy_type =
          typename std::remove_reference<decltype(policy)>::type;
      using work_tag = typename policy_type::work_tag;
      Kokkos::Impl::ParallelConstructName<Functor, work_tag> name(label);
      label = name.get();
    }
    auto tuner_iter = [&]() {
      auto my_tuner = map.find(label);
      if (my_tuner == map.end()) {
        return (map.emplace(
                       label,
                       Tuner(label, policy, functor, tag,
                             Impl::ComplexReducerSizeCalculator<ReducerType>{}))
                    .first);
      }
      return my_tuner;
    }();
    tuner_iter->second.tune(policy);
  }
}

// tune a TeamPolicy, without reducer
template <class Functor, class TagType, class... Properties>
void tune_policy(const size_t /**tuning_context*/, const std::string& label_in,
                 Kokkos::TeamPolicy<Properties...>& policy,
                 const Functor& functor, const TagType& tag) {
  generic_tune_policy<Experimental::TeamSizeTuner>(
      label_in, team_tuners, policy, functor, tag,
      [](const Kokkos::TeamPolicy<Properties...>& candidate_policy) {
        return (candidate_policy.impl_auto_team_size() ||
                candidate_policy.impl_auto_vector_length());
      });
}

// tune a TeamPolicy, with reducer
template <class ReducerType, class Functor, class TagType, class...
Properties> +void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, + Kokkos::TeamPolicy<Properties...>& policy, + const Functor& functor, const TagType& tag) { + generic_tune_policy<Experimental::TeamSizeTuner, ReducerType>( + label_in, team_tuners, policy, functor, tag, + [](const Kokkos::TeamPolicy<Properties...>& candidate_policy) { + return (candidate_policy.impl_auto_team_size() || + candidate_policy.impl_auto_vector_length()); + }); +} + +// tune a MDRangePolicy, without reducer +template <class Functor, class TagType, class... Properties> +void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, + Kokkos::MDRangePolicy<Properties...>& policy, + const Functor& functor, const TagType& tag) { + using Policy = Kokkos::MDRangePolicy<Properties...>; + static constexpr int rank = Policy::rank; + generic_tune_policy<Experimental::MDRangeTuner<rank>>( + label_in, mdrange_tuners<rank>, policy, functor, tag, + [](const Policy& candidate_policy) { + return candidate_policy.impl_tune_tile_size(); + }); +} + +// tune a MDRangePolicy, with reducer +template <class ReducerType, class Functor, class TagType, class... 
Properties> +void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, + Kokkos::MDRangePolicy<Properties...>& policy, + const Functor& functor, const TagType& tag) { + using Policy = Kokkos::MDRangePolicy<Properties...>; + static constexpr int rank = Policy::rank; + generic_tune_policy<Experimental::MDRangeTuner<rank>, ReducerType>( + label_in, mdrange_tuners<rank>, policy, functor, tag, + [](const Policy& candidate_policy) { + return candidate_policy.impl_tune_tile_size(); + }); +} + +template <class ReducerType> +struct ReductionSwitcher { + template <class Functor, class TagType, class ExecPolicy> + static void tune(const size_t tuning_context, const std::string& label, + ExecPolicy& policy, const Functor& functor, + const TagType& tag) { + if (Kokkos::tune_internals()) { + tune_policy<ReducerType>(tuning_context, label, policy, functor, tag); + } + } +}; + +template <> +struct ReductionSwitcher<Kokkos::InvalidType> { + template <class Functor, class TagType, class ExecPolicy> + static void tune(const size_t tuning_context, const std::string& label, + ExecPolicy& policy, const Functor& functor, + const TagType& tag) { + if (Kokkos::tune_internals()) { + tune_policy(tuning_context, label, policy, functor, tag); + } + } +}; + +template <class Tuner, class Functor, class TagType, + class TuningPermissionFunctor, class Map, class Policy> +void generic_report_results(const std::string& label_in, Map& map, + Policy& policy, const Functor&, const TagType&, + const TuningPermissionFunctor& should_tune) { + if (should_tune(policy)) { + std::string label = label_in; + if (label_in.empty()) { + using policy_type = + typename std::remove_reference<decltype(policy)>::type; + using work_tag = typename policy_type::work_tag; + Kokkos::Impl::ParallelConstructName<Functor, work_tag> name(label); + label = name.get(); + } + auto tuner_iter = map[label]; + tuner_iter.end(); + } +} + +// report results for a policy type we don't tune (do nothing) +template 
<class ExecPolicy, class Functor, typename TagType>
void report_policy_results(const size_t, const std::string&, ExecPolicy&,
                           const Functor&, const TagType&) {}

// report results for a TeamPolicy
template <class Functor, class TagType, class... Properties>
void report_policy_results(const size_t /**tuning_context*/,
                           const std::string& label_in,
                           Kokkos::TeamPolicy<Properties...>& policy,
                           const Functor& functor, const TagType& tag) {
  generic_report_results<Experimental::TeamSizeTuner>(
      label_in, team_tuners, policy, functor, tag,
      [](const Kokkos::TeamPolicy<Properties...>& candidate_policy) {
        return (candidate_policy.impl_auto_team_size() ||
                candidate_policy.impl_auto_vector_length());
      });
}

// report results for an MDRangePolicy
template <class Functor, class TagType, class... Properties>
void report_policy_results(const size_t /**tuning_context*/,
                           const std::string& label_in,
                           Kokkos::MDRangePolicy<Properties...>& policy,
                           const Functor& functor, const TagType& tag) {
  using Policy              = Kokkos::MDRangePolicy<Properties...>;
  static constexpr int rank = Policy::rank;
  generic_report_results<Experimental::MDRangeTuner<rank>>(
      label_in, mdrange_tuners<rank>, policy, functor, tag,
      [](const Policy& candidate_policy) {
        return candidate_policy.impl_tune_tile_size();
      });
}

// Entry points called by the parallel_* dispatch layer: notify a loaded
// profiling tool and, in tuning builds, open a tuning context and let the
// tuner adjust the policy before launch.
template <class ExecPolicy, class FunctorType>
void begin_parallel_for(ExecPolicy& policy, FunctorType& functor,
                        const std::string& label, uint64_t& kpID) {
  if (Kokkos::Tools::profileLibraryLoaded()) {
    Kokkos::Impl::ParallelConstructName<FunctorType,
                                        typename ExecPolicy::work_tag>
        name(label);
    Kokkos::Tools::beginParallelFor(
        name.get(), Kokkos::Profiling::Experimental::device_id(policy.space()),
        &kpID);
  }
#ifdef KOKKOS_ENABLE_TUNING
  size_t context_id = Kokkos::Tools::Experimental::get_new_context_id();
  if (Kokkos::tune_internals()) {
    tune_policy(context_id, label, policy, functor, Kokkos::ParallelForTag{});
  }
#else
  (void)functor;
#endif
}

template <class ExecPolicy, class FunctorType>
void end_parallel_for(ExecPolicy& policy, FunctorType& functor,
                      const std::string& label, uint64_t& kpID) {
  if (Kokkos::Tools::profileLibraryLoaded()) {
    Kokkos::Tools::endParallelFor(kpID);
  }
#ifdef KOKKOS_ENABLE_TUNING
  size_t context_id = Kokkos::Tools::Experimental::get_current_context_id();
  if (Kokkos::tune_internals()) {
    report_policy_results(context_id, label, policy, functor,
                          Kokkos::ParallelForTag{});
  }
#else
  (void)policy;
  (void)functor;
  (void)label;
#endif
}

template <class ExecPolicy, class FunctorType>
void begin_parallel_scan(ExecPolicy& policy, FunctorType& functor,
                         const std::string& label, uint64_t& kpID) {
  if (Kokkos::Tools::profileLibraryLoaded()) {
    Kokkos::Impl::ParallelConstructName<FunctorType,
                                        typename ExecPolicy::work_tag>
        name(label);
    Kokkos::Tools::beginParallelScan(
        name.get(), Kokkos::Profiling::Experimental::device_id(policy.space()),
        &kpID);
  }
#ifdef KOKKOS_ENABLE_TUNING
  size_t context_id = Kokkos::Tools::Experimental::get_new_context_id();
  if (Kokkos::tune_internals()) {
    tune_policy(context_id, label, policy, functor, Kokkos::ParallelScanTag{});
  }
#else
  (void)functor;
#endif
}

template <class ExecPolicy, class FunctorType>
void end_parallel_scan(ExecPolicy& policy, FunctorType& functor,
                       const std::string& label, uint64_t& kpID) {
  if (Kokkos::Tools::profileLibraryLoaded()) {
    Kokkos::Tools::endParallelScan(kpID);
  }
#ifdef KOKKOS_ENABLE_TUNING
  size_t context_id = Kokkos::Tools::Experimental::get_current_context_id();
  if (Kokkos::tune_internals()) {
    report_policy_results(context_id, label, policy, functor,
                          Kokkos::ParallelScanTag{});
  }
#else
  (void)policy;
  (void)functor;
  (void)label;
#endif
}

// Reduce variant: tuning is routed through ReductionSwitcher so complex
// reducers reach the ReducerType-aware tune_policy overload.
template <class ReducerType, class ExecPolicy, class FunctorType>
void begin_parallel_reduce(ExecPolicy& policy, FunctorType& functor,
                           const std::string& label, uint64_t& kpID) {
  if (Kokkos::Tools::profileLibraryLoaded()) {
    Kokkos::Impl::ParallelConstructName<FunctorType,
                                        typename ExecPolicy::work_tag>
        name(label);
    Kokkos::Tools::beginParallelReduce(
        name.get(), Kokkos::Profiling::Experimental::device_id(policy.space()),
        &kpID);
  }
#ifdef KOKKOS_ENABLE_TUNING
  size_t context_id = Kokkos::Tools::Experimental::get_new_context_id();
  ReductionSwitcher<ReducerType>::tune(context_id, label, policy, functor,
                                       Kokkos::ParallelReduceTag{});
#else
  (void)functor;
#endif
}

template <class ReducerType, class ExecPolicy, class FunctorType>
void end_parallel_reduce(ExecPolicy& policy, FunctorType& functor,
                         const std::string& label, uint64_t& kpID) {
  if (Kokkos::Tools::profileLibraryLoaded()) {
    Kokkos::Tools::endParallelReduce(kpID);
  }
#ifdef KOKKOS_ENABLE_TUNING
  size_t context_id = Kokkos::Tools::Experimental::get_current_context_id();
  if (Kokkos::tune_internals()) {
    report_policy_results(context_id, label, policy, functor,
                          Kokkos::ParallelReduceTag{});
  }
#else
  (void)policy;
  (void)functor;
  (void)label;
#endif
}

} // namespace Impl

} // namespace Tools
namespace Profiling {

// Legacy Kokkos::Profiling surface; mirrors Kokkos::Tools declarations above.
bool profileLibraryLoaded();

void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID,
                      uint64_t* kernelID);
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID,
                         uint64_t* kernelID);
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID,
                       uint64_t* kernelID);
void endParallelFor(const uint64_t kernelID);
void endParallelReduce(const uint64_t kernelID);
void endParallelScan(const uint64_t kernelID);
void pushRegion(const std::string& kName);
void popRegion();

void createProfileSection(const std::string& sectionName, uint32_t* secID);
void destroyProfileSection(const uint32_t secID);
void startSection(const uint32_t secID);

void stopSection(const uint32_t secID);

void markEvent(const std::string& eventName);
void
allocateData(const SpaceHandle handle, const std::string name,
             const void* data, const uint64_t size);
void deallocateData(const SpaceHandle space, const std::string label,
                    const void* ptr, const uint64_t size);
void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label,
                   const void* dst_ptr, const SpaceHandle src_space,
                   const std::string src_label, const void* src_ptr,
                   const uint64_t size);
void endDeepCopy();
void finalize();
void initialize(const std::string& = {});

SpaceHandle make_space_handle(const char* space_name);

namespace Experimental {
// Re-export the Tools::Experimental callback setters under the legacy
// Profiling::Experimental namespace.
using Kokkos::Tools::Experimental::set_allocate_data_callback;
using Kokkos::Tools::Experimental::set_begin_deep_copy_callback;
using Kokkos::Tools::Experimental::set_begin_parallel_for_callback;
using Kokkos::Tools::Experimental::set_begin_parallel_reduce_callback;
using Kokkos::Tools::Experimental::set_begin_parallel_scan_callback;
using Kokkos::Tools::Experimental::set_create_profile_section_callback;
using Kokkos::Tools::Experimental::set_deallocate_data_callback;
using Kokkos::Tools::Experimental::set_destroy_profile_section_callback;
using Kokkos::Tools::Experimental::set_end_deep_copy_callback;
using Kokkos::Tools::Experimental::set_end_parallel_for_callback;
using Kokkos::Tools::Experimental::set_end_parallel_reduce_callback;
using Kokkos::Tools::Experimental::set_end_parallel_scan_callback;
using Kokkos::Tools::Experimental::set_finalize_callback;
using Kokkos::Tools::Experimental::set_init_callback;
using Kokkos::Tools::Experimental::set_parse_args_callback;
using Kokkos::Tools::Experimental::set_pop_region_callback;
using Kokkos::Tools::Experimental::set_print_help_callback;
using Kokkos::Tools::Experimental::set_profile_event_callback;
using Kokkos::Tools::Experimental::set_push_region_callback;
using Kokkos::Tools::Experimental::set_start_profile_section_callback;
using Kokkos::Tools::Experimental::set_stop_profile_section_callback;

using Kokkos::Tools::Experimental::EventSet;

using Kokkos::Tools::Experimental::pause_tools;
using Kokkos::Tools::Experimental::resume_tools;

using Kokkos::Tools::Experimental::get_callbacks;
using Kokkos::Tools::Experimental::set_callbacks;

} // namespace Experimental
} // namespace Profiling

namespace Tools {
namespace Experimental {

// Declarations for the tuning helpers defined in Kokkos_Profiling.cpp.
VariableValue make_variable_value(size_t id, int64_t val);
VariableValue make_variable_value(size_t id, double val);
VariableValue make_variable_value(size_t id, const std::string& val);

SetOrRange make_candidate_set(size_t size, std::string* data);
SetOrRange make_candidate_set(size_t size, int64_t* data);
SetOrRange make_candidate_set(size_t size, double* data);
SetOrRange make_candidate_range(double lower, double upper, double step,
                                bool openLower, bool openUpper);

SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t step,
                                bool openLower, bool openUpper);

void declare_optimization_goal(const size_t context,
                               const OptimizationGoal& goal);

size_t declare_output_type(const std::string& typeName, VariableInfo info);

size_t declare_input_type(const std::string& typeName, VariableInfo info);

void set_input_values(size_t contextId, size_t count, VariableValue* values);

void end_context(size_t contextId);
void begin_context(size_t contextId);

void request_output_values(size_t contextId, size_t count,
                           VariableValue* values);

bool have_tuning_tool();

size_t get_new_context_id();
size_t get_current_context_id();

size_t get_new_variable_id();
} // namespace Experimental
} // namespace Tools

} // namespace Kokkos

#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed8751c50cc04d915b7b3c371a6ec05756ff6087
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
@@ -0,0 +1,296 @@
+/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_PROFILING_C_INTERFACE_HPP
#define KOKKOS_PROFILING_C_INTERFACE_HPP

#ifdef __cplusplus
#include <cstddef>
#include <cstdint>
#else
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#endif

#define KOKKOSP_INTERFACE_VERSION 20210225

// Profiling

struct Kokkos_Profiling_KokkosPDeviceInfo {
  size_t deviceID;
};

struct Kokkos_Profiling_SpaceHandle {
  char name[64];
};

// Callback signatures a kokkosp tool library may implement. `typedef` (not
// `using`) is required throughout: this header is consumed by plain C tools.
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_initFunction)(
    const int, const uint64_t, const uint32_t,
    struct Kokkos_Profiling_KokkosPDeviceInfo*);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_finalizeFunction)();
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_parseArgsFunction)(int, char**);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_printHelpFunction)(char*);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_beginFunction)(const char*, const uint32_t,
                                               uint64_t*);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_endFunction)(uint64_t);

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_pushFunction)(const char*);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_popFunction)();

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_allocateDataFunction)(
    const struct Kokkos_Profiling_SpaceHandle, const char*, const void*,
    const uint64_t);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_deallocateDataFunction)(
    const struct Kokkos_Profiling_SpaceHandle, const char*, const void*,
    const uint64_t);

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_createProfileSectionFunction)(const char*,
                                                              uint32_t*);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_startProfileSectionFunction)(const uint32_t);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_stopProfileSectionFunction)(const uint32_t);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_destroyProfileSectionFunction)(const uint32_t);

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_profileEventFunction)(const char*);

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_beginDeepCopyFunction)(
    struct Kokkos_Profiling_SpaceHandle, const char*, const void*,
    struct Kokkos_Profiling_SpaceHandle, const char*, const void*, uint64_t);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_endDeepCopyFunction)();
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_beginFenceFunction)(const char*, const uint32_t,
                                                    uint64_t*);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_endFenceFunction)(uint64_t);

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_dualViewSyncFunction)(const char*,
                                                      const void* const, bool);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_dualViewModifyFunction)(const char*,
                                                        const void* const,
                                                        bool);

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Profiling_declareMetadataFunction)(const char*,
                                                         const char*);

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Tools_toolInvokedFenceFunction)(const uint32_t);

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Tools_functionPointer)();
struct Kokkos_Tools_ToolProgrammingInterface {
  Kokkos_Tools_toolInvokedFenceFunction fence;
  // allow addition of more actions
  Kokkos_Tools_functionPointer padding[31];
};

struct Kokkos_Tools_ToolSettings {
  bool requires_global_fencing;
  bool padding[255];
};

// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Tools_provideToolProgrammingInterfaceFunction)(
    const uint32_t, struct Kokkos_Tools_ToolProgrammingInterface);
// NOLINTNEXTLINE(modernize-use-using): C compatibility
typedef void (*Kokkos_Tools_requestToolSettingsFunction)(
    const uint32_t, struct Kokkos_Tools_ToolSettings*);

// Tuning

#define KOKKOS_TOOLS_TUNING_STRING_LENGTH 64
typedef char Kokkos_Tools_Tuning_String[KOKKOS_TOOLS_TUNING_STRING_LENGTH];
union Kokkos_Tools_VariableValue_ValueUnion {
  int64_t int_value;
  double double_value;
  Kokkos_Tools_Tuning_String string_value;
};

union Kokkos_Tools_VariableValue_ValueUnionSet {
  int64_t* int_value;
  double* double_value;
  Kokkos_Tools_Tuning_String* string_value;
};

struct Kokkos_Tools_ValueSet {
  size_t size;
  union Kokkos_Tools_VariableValue_ValueUnionSet values;
};

enum Kokkos_Tools_OptimizationType {
  Kokkos_Tools_Minimize,
  Kokkos_Tools_Maximize
};

// NOTE(review): "Optimzation" is misspelled but this name is part of the
// exported C tool ABI — verify all consumers before ever renaming.
struct Kokkos_Tools_OptimzationGoal {
  size_t type_id;
  enum Kokkos_Tools_OptimizationType goal;
};

struct Kokkos_Tools_ValueRange {
  union Kokkos_Tools_VariableValue_ValueUnion lower;
  union Kokkos_Tools_VariableValue_ValueUnion upper;
  union Kokkos_Tools_VariableValue_ValueUnion step;
  bool openLower;
  bool openUpper;
};

enum Kokkos_Tools_VariableInfo_ValueType {
  kokkos_value_double,
  kokkos_value_int64,
  kokkos_value_string,
};

enum Kokkos_Tools_VariableInfo_StatisticalCategory {
  kokkos_value_categorical,  // unordered distinct objects
  kokkos_value_ordinal,      // ordered distinct objects
  kokkos_value_interval,  // ordered distinct objects for which distance matters
  kokkos_value_ratio  // ordered distinct objects for which distance matters,
                      // division matters, and the concept of
zero exists +}; + +enum Kokkos_Tools_VariableInfo_CandidateValueType { + kokkos_value_set, // I am one of [2,3,4,5] + kokkos_value_range, // I am somewhere in [2,12) + kokkos_value_unbounded // I am [text/int/float], but we don't know at + // declaration time what values are appropriate. Only + // valid for Context Variables +}; + +union Kokkos_Tools_VariableInfo_SetOrRange { + struct Kokkos_Tools_ValueSet set; + struct Kokkos_Tools_ValueRange range; +}; + +struct Kokkos_Tools_VariableInfo { + enum Kokkos_Tools_VariableInfo_ValueType type; + enum Kokkos_Tools_VariableInfo_StatisticalCategory category; + enum Kokkos_Tools_VariableInfo_CandidateValueType valueQuantity; + union Kokkos_Tools_VariableInfo_SetOrRange candidates; + void* toolProvidedInfo; +}; + +struct Kokkos_Tools_VariableValue { + size_t type_id; + union Kokkos_Tools_VariableValue_ValueUnion value; + struct Kokkos_Tools_VariableInfo* metadata; +}; + +typedef void (*Kokkos_Tools_outputTypeDeclarationFunction)( + const char*, const size_t, struct Kokkos_Tools_VariableInfo* info); +typedef void (*Kokkos_Tools_inputTypeDeclarationFunction)( + const char*, const size_t, struct Kokkos_Tools_VariableInfo* info); + +typedef void (*Kokkos_Tools_requestValueFunction)( + const size_t, const size_t, const struct Kokkos_Tools_VariableValue*, + const size_t count, struct Kokkos_Tools_VariableValue*); +typedef void (*Kokkos_Tools_contextBeginFunction)(const size_t); +typedef void (*Kokkos_Tools_contextEndFunction)( + const size_t, struct Kokkos_Tools_VariableValue); +typedef void (*Kokkos_Tools_optimizationGoalDeclarationFunction)( + const size_t, const struct Kokkos_Tools_OptimzationGoal goal); + +struct Kokkos_Profiling_EventSet { + Kokkos_Profiling_initFunction init; + Kokkos_Profiling_finalizeFunction finalize; + Kokkos_Profiling_parseArgsFunction parse_args; + Kokkos_Profiling_printHelpFunction print_help; + Kokkos_Profiling_beginFunction begin_parallel_for; + Kokkos_Profiling_endFunction end_parallel_for; + 
Kokkos_Profiling_beginFunction begin_parallel_reduce; + Kokkos_Profiling_endFunction end_parallel_reduce; + Kokkos_Profiling_beginFunction begin_parallel_scan; + Kokkos_Profiling_endFunction end_parallel_scan; + Kokkos_Profiling_pushFunction push_region; + Kokkos_Profiling_popFunction pop_region; + Kokkos_Profiling_allocateDataFunction allocate_data; + Kokkos_Profiling_deallocateDataFunction deallocate_data; + Kokkos_Profiling_createProfileSectionFunction create_profile_section; + Kokkos_Profiling_startProfileSectionFunction start_profile_section; + Kokkos_Profiling_stopProfileSectionFunction stop_profile_section; + Kokkos_Profiling_destroyProfileSectionFunction destroy_profile_section; + Kokkos_Profiling_profileEventFunction profile_event; + Kokkos_Profiling_beginDeepCopyFunction begin_deep_copy; + Kokkos_Profiling_endDeepCopyFunction end_deep_copy; + Kokkos_Profiling_beginFenceFunction begin_fence; + Kokkos_Profiling_endFenceFunction end_fence; + Kokkos_Profiling_dualViewSyncFunction sync_dual_view; + Kokkos_Profiling_dualViewModifyFunction modify_dual_view; + Kokkos_Profiling_declareMetadataFunction declare_metadata; + Kokkos_Tools_provideToolProgrammingInterfaceFunction + provide_tool_programming_interface; + Kokkos_Tools_requestToolSettingsFunction request_tool_settings; + char profiling_padding[9 * sizeof(Kokkos_Tools_functionPointer)]; + Kokkos_Tools_outputTypeDeclarationFunction declare_output_type; + Kokkos_Tools_inputTypeDeclarationFunction declare_input_type; + Kokkos_Tools_requestValueFunction request_output_values; + Kokkos_Tools_contextBeginFunction begin_tuning_context; + Kokkos_Tools_contextEndFunction end_tuning_context; + Kokkos_Tools_optimizationGoalDeclarationFunction declare_optimization_goal; + char padding[232 * + sizeof( + Kokkos_Tools_functionPointer)]; // allows us to add another + // 256 events to the Tools + // interface without + // changing struct layout +}; + +#endif // KOKKOS_PROFILING_C_INTERFACE_HPP diff --git 
a/packages/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp new file mode 100644 index 0000000000000000000000000000000000000000..be6f756d0c2fc71976d48df2a76570b58ea8ee1b --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp @@ -0,0 +1,56 @@ +/* + //@HEADER + // ************************************************************************ + // + // Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). + // + // Under the terms of Contract DE-NA0003525 with NTESS, + // the U.S. Government retains certain rights in this software. + // + // Redistribution and use in source and binary forms, with or without + // modification, are permitted provided that the following conditions are + // met: + // + // 1. Redistributions of source code must retain the above copyright + // notice, this list of conditions and the following disclaimer. + // + // 2. Redistributions in binary form must reproduce the above copyright + // notice, this list of conditions and the following disclaimer in the + // documentation and/or other materials provided with the distribution. + // + // 3. Neither the name of the Corporation nor the names of the + // contributors may be used to endorse or promote products derived from + // this software without specific prior written permission. + // + // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE + // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // + // Questions? Contact Christian R. Trott (crtrott@sandia.gov) + // + // ************************************************************************ + //@HEADER +*/ + +#ifndef KOKKOSP_DEVICE_INFO_HPP +#define KOKKOSP_DEVICE_INFO_HPP + +#include <cstdint> +#include <impl/Kokkos_Profiling_C_Interface.h> +namespace Kokkos { +namespace Profiling { +using KokkosPDeviceInfo = Kokkos_Profiling_KokkosPDeviceInfo; +} // namespace Profiling +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7809632f78ddf33d8429b353723736b68e3b7536 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -0,0 +1,223 @@ +/* + //@HEADER + // ************************************************************************ + // + // Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). + // + // Under the terms of Contract DE-NA0003525 with NTESS, + // the U.S. Government retains certain rights in this software. + // + // Redistribution and use in source and binary forms, with or without + // modification, are permitted provided that the following conditions are + // met: + // + // 1. 
Redistributions of source code must retain the above copyright + // notice, this list of conditions and the following disclaimer. + // + // 2. Redistributions in binary form must reproduce the above copyright + // notice, this list of conditions and the following disclaimer in the + // documentation and/or other materials provided with the distribution. + // + // 3. Neither the name of the Corporation nor the names of the + // contributors may be used to endorse or promote products derived from + // this software without specific prior written permission. + // + // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY + // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE + // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // + // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) + // + // ************************************************************************ + //@HEADER + */ + +#ifndef KOKKOSP_INTERFACE_HPP +#define KOKKOSP_INTERFACE_HPP + +#include <cinttypes> +#include <cstddef> + +#include <cstdlib> + +// NOTE: in this Kokkos::Profiling block, do not define anything that shouldn't +// exist should Profiling be disabled + +namespace Kokkos { +namespace Tools { +namespace Experimental { +enum struct DeviceType { + Serial, + OpenMP, + Cuda, + HIP, + OpenMPTarget, + HPX, + Threads, + SYCL, + Unknown +}; + +template <typename ExecutionSpace> +struct DeviceTypeTraits; + +constexpr const size_t device_type_bits = 8; +constexpr const size_t instance_bits = 24; +template <typename ExecutionSpace> +inline uint32_t device_id(ExecutionSpace const& space) noexcept { + auto device_id = static_cast<uint32_t>(DeviceTypeTraits<ExecutionSpace>::id); + return (device_id << instance_bits) + space.impl_instance_id(); +} +} // namespace Experimental +} // namespace Tools +} // end namespace Kokkos + +#if defined(KOKKOS_ENABLE_LIBDL) +// We check at configure time that libdl is available. 
+#include <dlfcn.h> +#endif + +#include <impl/Kokkos_Profiling_DeviceInfo.hpp> +#include <impl/Kokkos_Profiling_C_Interface.h> + +namespace Kokkos { +namespace Tools { + +using SpaceHandle = Kokkos_Profiling_SpaceHandle; + +} // namespace Tools + +namespace Tools { + +namespace Experimental { +using EventSet = Kokkos_Profiling_EventSet; +static_assert(sizeof(EventSet) / sizeof(Kokkos_Tools_functionPointer) == 275, + "sizeof EventSet has changed, this is an error on the part of a " + "Kokkos developer"); +static_assert(sizeof(Kokkos_Tools_ToolSettings) / sizeof(bool) == 256, + "sizeof EventSet has changed, this is an error on the part of a " + "Kokkos developer"); +static_assert(sizeof(Kokkos_Tools_ToolProgrammingInterface) / + sizeof(Kokkos_Tools_functionPointer) == + 32, + "sizeof EventSet has changed, this is an error on the part of a " + "Kokkos developer"); + +using toolInvokedFenceFunction = Kokkos_Tools_toolInvokedFenceFunction; +using provideToolProgrammingInterfaceFunction = + Kokkos_Tools_provideToolProgrammingInterfaceFunction; +using requestToolSettingsFunction = Kokkos_Tools_requestToolSettingsFunction; +using ToolSettings = Kokkos_Tools_ToolSettings; +using ToolProgrammingInterface = Kokkos_Tools_ToolProgrammingInterface; +} // namespace Experimental +using initFunction = Kokkos_Profiling_initFunction; +using finalizeFunction = Kokkos_Profiling_finalizeFunction; +using parseArgsFunction = Kokkos_Profiling_parseArgsFunction; +using printHelpFunction = Kokkos_Profiling_printHelpFunction; +using beginFunction = Kokkos_Profiling_beginFunction; +using endFunction = Kokkos_Profiling_endFunction; +using pushFunction = Kokkos_Profiling_pushFunction; +using popFunction = Kokkos_Profiling_popFunction; +using allocateDataFunction = Kokkos_Profiling_allocateDataFunction; +using deallocateDataFunction = Kokkos_Profiling_deallocateDataFunction; +using createProfileSectionFunction = + Kokkos_Profiling_createProfileSectionFunction; +using startProfileSectionFunction = 
+ Kokkos_Profiling_startProfileSectionFunction; +using stopProfileSectionFunction = Kokkos_Profiling_stopProfileSectionFunction; +using destroyProfileSectionFunction = + Kokkos_Profiling_destroyProfileSectionFunction; +using profileEventFunction = Kokkos_Profiling_profileEventFunction; +using beginDeepCopyFunction = Kokkos_Profiling_beginDeepCopyFunction; +using endDeepCopyFunction = Kokkos_Profiling_endDeepCopyFunction; +using beginFenceFunction = Kokkos_Profiling_beginFenceFunction; +using endFenceFunction = Kokkos_Profiling_endFenceFunction; +using dualViewSyncFunction = Kokkos_Profiling_dualViewSyncFunction; +using dualViewModifyFunction = Kokkos_Profiling_dualViewModifyFunction; +using declareMetadataFunction = Kokkos_Profiling_declareMetadataFunction; + +} // namespace Tools + +} // namespace Kokkos + +// Profiling + +namespace Kokkos { + +namespace Profiling { + +/** The Profiling namespace is being renamed to Tools. + * This is reexposing the contents of what used to be the Profiling + * Interface with their original names, to avoid breaking old code + */ + +namespace Experimental { + +using Kokkos::Tools::Experimental::device_id; +using Kokkos::Tools::Experimental::DeviceType; +using Kokkos::Tools::Experimental::DeviceTypeTraits; + +} // namespace Experimental + +using Kokkos::Tools::allocateDataFunction; +using Kokkos::Tools::beginDeepCopyFunction; +using Kokkos::Tools::beginFunction; +using Kokkos::Tools::createProfileSectionFunction; +using Kokkos::Tools::deallocateDataFunction; +using Kokkos::Tools::destroyProfileSectionFunction; +using Kokkos::Tools::endDeepCopyFunction; +using Kokkos::Tools::endFunction; +using Kokkos::Tools::finalizeFunction; +using Kokkos::Tools::initFunction; +using Kokkos::Tools::parseArgsFunction; +using Kokkos::Tools::popFunction; +using Kokkos::Tools::printHelpFunction; +using Kokkos::Tools::profileEventFunction; +using Kokkos::Tools::pushFunction; +using Kokkos::Tools::SpaceHandle; +using 
Kokkos::Tools::startProfileSectionFunction; +using Kokkos::Tools::stopProfileSectionFunction; + +} // namespace Profiling +} // namespace Kokkos + +// Tuning + +namespace Kokkos { +namespace Tools { +namespace Experimental { +using ValueSet = Kokkos_Tools_ValueSet; +using ValueRange = Kokkos_Tools_ValueRange; +using StatisticalCategory = Kokkos_Tools_VariableInfo_StatisticalCategory; +using ValueType = Kokkos_Tools_VariableInfo_ValueType; +using CandidateValueType = Kokkos_Tools_VariableInfo_CandidateValueType; +using SetOrRange = Kokkos_Tools_VariableInfo_SetOrRange; +using VariableInfo = Kokkos_Tools_VariableInfo; +using OptimizationGoal = Kokkos_Tools_OptimzationGoal; +using TuningString = Kokkos_Tools_Tuning_String; +using VariableValue = Kokkos_Tools_VariableValue; + +using outputTypeDeclarationFunction = + Kokkos_Tools_outputTypeDeclarationFunction; +using inputTypeDeclarationFunction = Kokkos_Tools_inputTypeDeclarationFunction; +using requestValueFunction = Kokkos_Tools_requestValueFunction; +using contextBeginFunction = Kokkos_Tools_contextBeginFunction; +using contextEndFunction = Kokkos_Tools_contextEndFunction; +using optimizationGoalDeclarationFunction = + Kokkos_Tools_optimizationGoalDeclarationFunction; +} // end namespace Experimental +} // end namespace Tools + +} // end namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4bd037906506bd27654067d5c2fda99fb59684ca --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp @@ -0,0 +1,225 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#if defined(KOKKOS_ENABLE_SERIAL) + +#include <cstdlib> +#include <sstream> +#include <Kokkos_Serial.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Error.hpp> + +#include <impl/Kokkos_SharedAlloc.hpp> +#include <sstream> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +namespace { + +HostThreadTeamData g_serial_thread_team_data; + +bool g_serial_is_initialized = false; + +} // namespace + +// Resize thread team data scratch memory +void serial_resize_thread_team_data(size_t pool_reduce_bytes, + size_t team_reduce_bytes, + size_t team_shared_bytes, + size_t thread_local_bytes) { + if (pool_reduce_bytes < 512) pool_reduce_bytes = 512; + if (team_reduce_bytes < 512) team_reduce_bytes = 512; + + const size_t old_pool_reduce = g_serial_thread_team_data.pool_reduce_bytes(); + const size_t old_team_reduce = g_serial_thread_team_data.team_reduce_bytes(); + const size_t old_team_shared = g_serial_thread_team_data.team_shared_bytes(); + const size_t old_thread_local = + g_serial_thread_team_data.thread_local_bytes(); + const size_t old_alloc_bytes = g_serial_thread_team_data.scratch_bytes(); + + // Allocate if any of the old allocation is tool small: + + const bool allocate = (old_pool_reduce < pool_reduce_bytes) || + (old_team_reduce < team_reduce_bytes) || + (old_team_shared < team_shared_bytes) || + (old_thread_local < thread_local_bytes); + + if (allocate) { + Kokkos::HostSpace space; + + if (old_alloc_bytes) { + g_serial_thread_team_data.disband_team(); + g_serial_thread_team_data.disband_pool(); + + space.deallocate("Kokkos::Serial::scratch_mem", + g_serial_thread_team_data.scratch_buffer(), + g_serial_thread_team_data.scratch_bytes()); + } + + if (pool_reduce_bytes < old_pool_reduce) { + pool_reduce_bytes = old_pool_reduce; + } 
+ if (team_reduce_bytes < old_team_reduce) { + team_reduce_bytes = old_team_reduce; + } + if (team_shared_bytes < old_team_shared) { + team_shared_bytes = old_team_shared; + } + if (thread_local_bytes < old_thread_local) { + thread_local_bytes = old_thread_local; + } + + const size_t alloc_bytes = + HostThreadTeamData::scratch_size(pool_reduce_bytes, team_reduce_bytes, + team_shared_bytes, thread_local_bytes); + + void* ptr = nullptr; + try { + ptr = space.allocate("Kokkos::Serial::scratch_mem", alloc_bytes); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + // For now, just rethrow the error message the existing way + Kokkos::Impl::throw_runtime_exception(failure.get_error_message()); + } + + g_serial_thread_team_data.scratch_assign( + ((char*)ptr), alloc_bytes, pool_reduce_bytes, team_reduce_bytes, + team_shared_bytes, thread_local_bytes); + + HostThreadTeamData* pool[1] = {&g_serial_thread_team_data}; + + g_serial_thread_team_data.organize_pool(pool, 1); + g_serial_thread_team_data.organize_team(1); + } +} + +HostThreadTeamData* serial_get_thread_team_data() { + return &g_serial_thread_team_data; +} + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +bool Serial::impl_is_initialized() { return Impl::g_serial_is_initialized; } + +void Serial::impl_initialize() { + Impl::SharedAllocationRecord<void, void>::tracking_enable(); + + // Init the array of locks used for arbitrarily sized atomics + Impl::init_lock_array_host_space(); + + Impl::g_serial_is_initialized = true; +} + +void Serial::impl_finalize() { + if (Impl::g_serial_thread_team_data.scratch_buffer()) { + Impl::g_serial_thread_team_data.disband_team(); + Impl::g_serial_thread_team_data.disband_pool(); + + Kokkos::HostSpace space; + + space.deallocate(Impl::g_serial_thread_team_data.scratch_buffer(), + Impl::g_serial_thread_team_data.scratch_bytes()); + + 
Impl::g_serial_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0); + } + + Kokkos::Profiling::finalize(); + + Impl::g_serial_is_initialized = false; +} + +const char* Serial::name() { return "Serial"; } + +namespace Impl { + +int g_serial_space_factory_initialized = + initialize_space_factory<SerialSpaceInitializer>("100_Serial"); + +void SerialSpaceInitializer::initialize(const InitArguments& args) { + // Prevent "unused variable" warning for 'args' input struct. If + // Serial::initialize() ever needs to take arguments from the input + // struct, you may remove this line of code. + (void)args; + + // Always initialize Serial if it is configure time enabled + Kokkos::Serial::impl_initialize(); +} + +void SerialSpaceInitializer::finalize(const bool) { + if (Kokkos::Serial::impl_is_initialized()) Kokkos::Serial::impl_finalize(); +} + +void SerialSpaceInitializer::fence() { Kokkos::Serial::impl_static_fence(); } + +void SerialSpaceInitializer::print_configuration(std::ostream& msg, + const bool detail) { + msg << "Host Serial Execution Space:" << std::endl; + msg << " KOKKOS_ENABLE_SERIAL: "; + msg << "yes" << std::endl; + + msg << "Serial Atomics:" << std::endl; + msg << " KOKKOS_ENABLE_SERIAL_ATOMICS: "; +#ifdef KOKKOS_ENABLE_SERIAL_ATOMICS + msg << "yes" << std::endl; +#else + msg << "no" << std::endl; +#endif + + msg << "\nSerial Runtime Configuration:" << std::endl; + Serial::print_configuration(msg, detail); +} + +} // namespace Impl +} // namespace Kokkos + +#else +void KOKKOS_CORE_SRC_IMPL_SERIAL_PREVENT_LINK_ERROR() {} +#endif // defined( KOKKOS_ENABLE_SERIAL ) diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.cpp new file mode 100644 index 0000000000000000000000000000000000000000..179c55b10dd5357fecc1016dfa239ebd2813149f --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.cpp @@ -0,0 +1,67 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_SERIAL) && defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core.hpp> + +#include <impl/Kokkos_Serial_Task.hpp> +#include <impl/Kokkos_TaskQueue_impl.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template class TaskQueue<Kokkos::Serial, typename Kokkos::Serial::memory_space>; + +} +} // namespace Kokkos + +#else +void KOKKOS_CORE_SRC_IMPL_SERIAL_TASK_PREVENT_LINK_ERROR() {} +#endif /* #if defined( KOKKOS_ENABLE_SERIAL ) && defined( \ + KOKKOS_ENABLE_TASKDAG ) */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3ac3899acaf9f3695025472fc5f02cb0708f64fb --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp @@ -0,0 +1,250 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_IMPL_SERIAL_TASK_HPP
#define KOKKOS_IMPL_SERIAL_TASK_HPP

#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_TASKDAG)

#include <Kokkos_TaskScheduler_fwd.hpp>

#include <impl/Kokkos_TaskQueue.hpp>
#include <Kokkos_Serial.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

// Serial-backend driver for the "simple" task scheduler: the single host
// thread repeatedly pops ready tasks off the queue and runs each to
// completion.
template <class QueueType>
class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Serial, QueueType> > {
 public:
  // Note: Scheduler may be an incomplete type at class scope (but not inside
  // of the methods, obviously)

  using execution_space = Kokkos::Serial;
  using memory_space    = Kokkos::HostSpace;
  using scheduler_type  = SimpleTaskScheduler<Kokkos::Serial, QueueType>;
  using member_type =
      TaskTeamMemberAdapter<HostThreadTeamMember<Kokkos::Serial>,
                            scheduler_type>;

  // Run tasks until the queue reports completion.  Single-threaded: the
  // calling thread acts as the one (and only) team of size one.
  static void execute(scheduler_type const& scheduler) {
    using task_base_type = typename scheduler_type::task_base_type;

    // Set default buffers
    serial_resize_thread_team_data(0,   /* global reduce buffer */
                                   512, /* team reduce buffer */
                                   0,   /* team shared buffer */
                                   0    /* thread local buffer */
    );

    Impl::HostThreadTeamData& self = *Impl::serial_get_thread_team_data();

    auto& queue         = scheduler.queue();
    auto team_scheduler = scheduler.get_team_scheduler(0);

    member_type member(scheduler, self);

    auto current_task = OptionalRef<task_base_type>(nullptr);

    while (!queue.is_done()) {
      // Each team lead attempts to acquire either a thread team task
      // or a single thread task for the team.

      // pop a task off
      current_task = queue.pop_ready_task(team_scheduler.team_scheduler_info());

      // run the task
      if (current_task) {
        current_task->as_runnable_task().run(member);
        // Respawns are handled in the complete function
        queue.complete((*std::move(current_task)).as_runnable_task(),
                       team_scheduler.team_scheduler_info());
      }
    }
  }

  // The Serial backend can only ever run one team at a time.
  static constexpr uint32_t get_max_team_count(
      execution_space const&) noexcept {
    return 1;
  }

  // Expose the task type's apply/destroy entry points to the scheduler.
  template <typename TaskType>
  static void get_function_pointer(typename TaskType::function_type& ptr,
                                   typename TaskType::destroy_type& dtor) {
    ptr  = TaskType::apply;
    dtor = TaskType::destroy;
  }
};

//----------------------------------------------------------------------------

// Serial-backend driver for the older, constrained TaskQueue scheduler
// interface; enabled only when the scheduler's execution space is Serial.
template <class Scheduler>
class TaskQueueSpecializationConstrained<
    Scheduler,
    typename std::enable_if<std::is_same<typename Scheduler::execution_space,
                                         Kokkos::Serial>::value>::type> {
 public:
  // Note: Scheduler may be an incomplete type at class scope (but not inside
  // of the methods, obviously)

  using execution_space = Kokkos::Serial;
  using memory_space    = Kokkos::HostSpace;
  using scheduler_type  = Scheduler;
  using member_type =
      TaskTeamMemberAdapter<HostThreadTeamMember<Kokkos::Serial>,
                            scheduler_type>;

  // Drain every currently-ready task; returns once a full scan of the
  // ready sub-queues yields nothing (the EndTag sentinel).
  static void iff_single_thread_recursive_execute(
      scheduler_type const& scheduler) {
    using task_base_type = TaskBase;
    using queue_type     = typename scheduler_type::queue_type;

    // Sentinel meaning "no task available".
    task_base_type* const end = (task_base_type*)task_base_type::EndTag;

    Impl::HostThreadTeamData* const data = Impl::serial_get_thread_team_data();

    member_type exec(scheduler, *data);

    // Loop until no runnable task

    task_base_type* task = end;

    auto* const queue = scheduler.m_queue;

    do {
      task = end;

      // Scan the NumQueue x 2 ready sub-queues until one yields a task.
      for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
        for (int j = 0; j < 2 && end == task; ++j) {
          task = queue_type::pop_ready_task(&queue->m_ready[i][j]);
        }
      }

      if (end == task) break;

      (*task->m_apply)(task, &exec);

      queue->complete(task);

    } while (1);
  }

  // Run tasks until the queue's ready count drops to zero.
  static void execute(scheduler_type const& scheduler) {
    using task_base_type = TaskBase;
    using queue_type     = typename scheduler_type::queue_type;

    // Sentinel meaning "no task available".
    task_base_type* const end = (task_base_type*)task_base_type::EndTag;

    // Set default buffers
    serial_resize_thread_team_data(0,   /* global reduce buffer */
                                   512, /* team reduce buffer */
                                   0,   /* team shared buffer */
                                   0    /* thread local buffer */
    );

    auto* const queue = scheduler.m_queue;

    Impl::HostThreadTeamData* const data = Impl::serial_get_thread_team_data();

    member_type exec(scheduler, *data);

    // Loop until all queues are empty
    while (0 < queue->m_ready_count) {
      task_base_type* task = end;

      // Scan the NumQueue x 2 ready sub-queues until one yields a task.
      for (int i = 0; i < queue_type::NumQueue && end == task; ++i) {
        for (int j = 0; j < 2 && end == task; ++j) {
          task = queue_type::pop_ready_task(&queue->m_ready[i][j]);
        }
      }

      if (end != task) {
        // pop_ready_task resulted in lock == task->m_next
        // In the executing state

        (*task->m_apply)(task, &exec);

#if 0
        printf( "TaskQueue<Serial>::executed: 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
              , uintptr_t(task)
              , uintptr_t(task->m_wait)
              , uintptr_t(task->m_next)
              , task->m_task_type
              , task->m_priority
              , task->m_ref_count );
#endif

        // If a respawn then re-enqueue otherwise the task is complete
        // and all tasks waiting on this task are updated.
        queue->complete(task);
      } else if (0 != queue->m_ready_count) {
        // Tasks remain but none are ready: report a scheduling error.
        Kokkos::abort("TaskQueue<Serial>::execute ERROR: ready_count");
      }
    }
  }

  // Expose the task type's apply/destroy entry points to the scheduler.
  template <typename TaskType>
  static void get_function_pointer(typename TaskType::function_type& ptr,
                                   typename TaskType::destroy_type& dtor) {
    ptr  = TaskType::apply;
    dtor = TaskType::destroy;
  }
};

extern template class TaskQueue<Kokkos::Serial,
                                typename Kokkos::Serial::memory_space>;

}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f6ad5cb0354a3d467a686603f72869990f8380c
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp
@@ -0,0 +1,94 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3.
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP +#define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { + +template <class FunctorType, class... Traits> +class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>, + Kokkos::Serial> { + private: + using Policy = Kokkos::WorkGraphPolicy<Traits...>; + + Policy m_policy; + FunctorType m_functor; + + template <class TagType> + typename std::enable_if<std::is_same<TagType, void>::value>::type exec_one( + const std::int32_t w) const noexcept { + m_functor(w); + } + + template <class TagType> + typename std::enable_if<!std::is_same<TagType, void>::value>::type exec_one( + const std::int32_t w) const noexcept { + const TagType t{}; + m_functor(t, w); + } + + public: + inline void execute() const noexcept { + // Spin until COMPLETED_TOKEN. + // END_TOKEN indicates no work is currently available. 
+ + for (std::int32_t w = Policy::END_TOKEN; + Policy::COMPLETED_TOKEN != (w = m_policy.pop_work());) { + if (Policy::END_TOKEN != w) { + exec_one<typename Policy::work_tag>(w); + m_policy.completed_work(w); + } + } + } + + inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_policy(arg_policy), m_functor(arg_functor) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..917ae72081c6a5eee98b4e02827097446cb29e0b --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp @@ -0,0 +1,369 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Core.hpp>

namespace Kokkos {
namespace Impl {

// Per-thread flag: nonzero while shared-allocation tracking is enabled on
// the calling host thread.
KOKKOS_THREAD_LOCAL int SharedAllocationRecord<void, void>::t_tracking_enabled =
    1;

#ifdef KOKKOS_ENABLE_DEBUG
// Walk the doubly-linked tracking list rooted at arg_record->m_root and
// verify the structural invariants of every record.  The list's
// insert/erase lock is held (root->m_next swapped to nullptr) for the
// duration of the walk.
bool SharedAllocationRecord<void, void>::is_sane(
    SharedAllocationRecord<void, void>* arg_record) {
  SharedAllocationRecord* const root =
      arg_record ? arg_record->m_root : nullptr;

  // The root record must exist and carry a zero use count.
  bool ok = root != nullptr && root->use_count() == 0;

  if (ok) {
    SharedAllocationRecord* root_next             = nullptr;
    static constexpr SharedAllocationRecord* zero = nullptr;
    // Lock the list: spin until m_next is non-null (null means another
    // thread currently holds the lock), then take it by storing null.
    while ((root_next = Kokkos::atomic_exchange(&root->m_next, zero)) ==
           nullptr)
      ;

    for (SharedAllocationRecord* rec = root_next; ok && rec != root;
         rec = rec->m_next) {
      // Every record must have non-null links, point back at this root,
      // be properly cross-linked with its neighbors, and have a
      // non-negative use count.
      const bool ok_non_null =
          rec && rec->m_prev && (rec == root || rec->m_next);
      const bool ok_root = ok_non_null && rec->m_root == root;
      const bool ok_prev_next =
          ok_non_null &&
          (rec->m_prev != root ? rec->m_prev->m_next == rec : root_next == rec);
      const bool ok_next_prev = ok_non_null && rec->m_next->m_prev == rec;
      const bool ok_count     = ok_non_null && 0 <= rec->use_count();

      ok = ok_root && ok_prev_next && ok_next_prev && ok_count;

      if (!ok) {
        // Formatting dependent on sizeof(uintptr_t)
        // NOTE(review): format_string remains uninitialized if uintptr_t
        // matches neither unsigned long nor unsigned long long — confirm
        // no supported platform hits this before relying on it.
        const char* format_string;

        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
          format_string =
              "Kokkos::Impl::SharedAllocationRecord failed is_sane: "
              "rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) "
              "m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) "
              "m_prev->m_next(0x%.12lx) }\n";
        } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
          format_string =
              "Kokkos::Impl::SharedAllocationRecord failed is_sane: "
              "rec(0x%.12llx){ m_count(%d) m_root(0x%.12llx) m_next(0x%.12llx) "
              "m_prev(0x%.12llx) m_next->m_prev(0x%.12llx) "
              "m_prev->m_next(0x%.12llx) }\n";
        }

        fprintf(stderr, format_string, reinterpret_cast<uintptr_t>(rec),
                rec->use_count(), reinterpret_cast<uintptr_t>(rec->m_root),
                reinterpret_cast<uintptr_t>(rec->m_next),
                reinterpret_cast<uintptr_t>(rec->m_prev),
                reinterpret_cast<uintptr_t>(
                    rec->m_next != nullptr ? rec->m_next->m_prev : nullptr),
                reinterpret_cast<uintptr_t>(rec->m_prev != rec->m_root
                                                ? rec->m_prev->m_next
                                                : root_next));
      }
    }

    // Unlock the list by restoring the saved head; the slot must still
    // hold the null "locked" marker or locking discipline was violated.
    if (nullptr != Kokkos::atomic_exchange(&root->m_next, root_next)) {
      Kokkos::Impl::throw_runtime_exception(
          "Kokkos::Impl::SharedAllocationRecord failed is_sane unlocking");
    }
  }
  return ok;
}

#else

// Without KOKKOS_ENABLE_DEBUG the tracking list does not exist, so
// calling is_sane is itself an error.
bool SharedAllocationRecord<void, void>::is_sane(
    SharedAllocationRecord<void, void>*) {
  Kokkos::Impl::throw_runtime_exception(
      "Kokkos::Impl::SharedAllocationRecord::is_sane only works with "
      "KOKKOS_ENABLE_DEBUG enabled");
  return false;
}
#endif  //#ifdef KOKKOS_ENABLE_DEBUG

#ifdef KOKKOS_ENABLE_DEBUG
// Return the record whose user-data pointer equals arg_data_ptr, or
// nullptr.  Holds the list lock (see is_sane) during the linear search.
SharedAllocationRecord<void, void>* SharedAllocationRecord<void, void>::find(
    SharedAllocationRecord<void, void>* const arg_root,
    void* const arg_data_ptr) {
  SharedAllocationRecord* root_next             = nullptr;
  static constexpr SharedAllocationRecord* zero = nullptr;

  // Lock the list:
  while ((root_next = Kokkos::atomic_exchange(&arg_root->m_next, zero)) ==
         nullptr)
    ;

  // Iterate searching for the record with this data pointer

  SharedAllocationRecord* r = root_next;

  while ((r != arg_root) && (r->data() != arg_data_ptr)) {
    r = r->m_next;
  }

  // Wrapped around to the root: no record owns this pointer.
  if (r == arg_root) {
    r = nullptr;
  }

  // Unlock the list:
  if (nullptr != Kokkos::atomic_exchange(&arg_root->m_next, root_next)) {
    Kokkos::Impl::throw_runtime_exception(
        "Kokkos::Impl::SharedAllocationRecord failed locking/unlocking");
  }
  return r;
}
#else
SharedAllocationRecord<void, void>* SharedAllocationRecord<void, void>::find(
    SharedAllocationRecord<void, void>* const, void* const) {
  Kokkos::Impl::throw_runtime_exception(
      "Kokkos::Impl::SharedAllocationRecord::find only works with "
      "KOKKOS_ENABLE_DEBUG "
      "enabled");
  return nullptr;
}
#endif

/**\brief Construct and insert into 'arg_root' tracking set.
 *        use_count is zero.
 */
SharedAllocationRecord<void, void>::SharedAllocationRecord(
#ifdef KOKKOS_ENABLE_DEBUG
    SharedAllocationRecord<void, void>* arg_root,
#endif
    SharedAllocationHeader* arg_alloc_ptr, size_t arg_alloc_size,
    SharedAllocationRecord<void, void>::function_type arg_dealloc)
    : m_alloc_ptr(arg_alloc_ptr),
      m_alloc_size(arg_alloc_size),
      m_dealloc(arg_dealloc)
#ifdef KOKKOS_ENABLE_DEBUG
      ,
      m_root(arg_root),
      m_prev(nullptr),
      m_next(nullptr)
#endif
      ,
      m_count(0) {
  if (nullptr != arg_alloc_ptr) {
#ifdef KOKKOS_ENABLE_DEBUG
    // Insert into the root double-linked list for tracking
    //
    // before: arg_root->m_next == next ; next->m_prev == arg_root
    // after:  arg_root->m_next == this ; this->m_prev == arg_root ;
    //         this->m_next == next ; next->m_prev == this

    m_prev                                        = m_root;
    static constexpr SharedAllocationRecord* zero = nullptr;

    // Read root->m_next and lock by setting to nullptr
    while ((m_next = Kokkos::atomic_exchange(&m_root->m_next, zero)) == nullptr)
      ;

    m_next->m_prev = this;

    // memory fence before completing insertion into linked list
    Kokkos::memory_fence();

    // Unlock by publishing this record as the new list head; the slot
    // must still hold the null "locked" marker.
    if (nullptr != Kokkos::atomic_exchange(&m_root->m_next, this)) {
      Kokkos::Impl::throw_runtime_exception(
          "Kokkos::Impl::SharedAllocationRecord failed locking/unlocking");
    }
#endif

  } else {
    Kokkos::Impl::throw_runtime_exception(
        "Kokkos::Impl::SharedAllocationRecord given nullptr allocation");
  }
}

// Atomically increment the use count; a previously negative count
// indicates a corrupted record.
void SharedAllocationRecord<void, void>::increment(
    SharedAllocationRecord<void, void>* arg_record) {
  const int old_count = Kokkos::atomic_fetch_add(&arg_record->m_count, 1);

  if (old_count < 0) {  // Error
    Kokkos::Impl::throw_runtime_exception(
        "Kokkos::Impl::SharedAllocationRecord failed increment");
  }
}

// Atomically decrement the use count.  On the 1 -> 0 transition the
// record is removed from the tracking list (debug builds) and
// deallocated via m_dealloc; nullptr is returned in that case, otherwise
// the record itself.
SharedAllocationRecord<void, void>* SharedAllocationRecord<
    void, void>::decrement(SharedAllocationRecord<void, void>* arg_record) {
  const int old_count = Kokkos::atomic_fetch_sub(&arg_record->m_count, 1);

  if (old_count == 1) {
    // Deallocating after Kokkos::finalize is an error.
    if (!Kokkos::is_initialized()) {
      std::stringstream ss;
      ss << "Kokkos allocation \"";
      ss << arg_record->get_label();
      ss << "\" is being deallocated after Kokkos::finalize was called\n";
      auto s = ss.str();
      Kokkos::Impl::throw_runtime_exception(s);
    }

#ifdef KOKKOS_ENABLE_DEBUG
    // before: arg_record->m_prev->m_next == arg_record &&
    //         arg_record->m_next->m_prev == arg_record
    //
    // after:  arg_record->m_prev->m_next == arg_record->m_next &&
    //         arg_record->m_next->m_prev == arg_record->m_prev

    SharedAllocationRecord* root_next             = nullptr;
    static constexpr SharedAllocationRecord* zero = nullptr;

    // Lock the list:
    while ((root_next = Kokkos::atomic_exchange(&arg_record->m_root->m_next,
                                                zero)) == nullptr)
      ;

    arg_record->m_next->m_prev = arg_record->m_prev;

    if (root_next != arg_record) {
      arg_record->m_prev->m_next = arg_record->m_next;
    } else {
      // before: arg_record->m_root == arg_record->m_prev
      // after:  arg_record->m_root == arg_record->m_next
      root_next = arg_record->m_next;
    }

    Kokkos::memory_fence();

    // Unlock the list:
    if (nullptr !=
        Kokkos::atomic_exchange(&arg_record->m_root->m_next, root_next)) {
      Kokkos::Impl::throw_runtime_exception(
          "Kokkos::Impl::SharedAllocationRecord failed decrement unlocking");
    }

    arg_record->m_next = nullptr;
    arg_record->m_prev = nullptr;
#endif

    // Invoke the deallocation callback registered at construction.
    function_type d = arg_record->m_dealloc;
    (*d)(arg_record);
    arg_record = nullptr;
  } else if (old_count < 1) {  // Error
    fprintf(stderr,
            "Kokkos::Impl::SharedAllocationRecord '%s' failed decrement count "
            "= %d\n",
            arg_record->m_alloc_ptr->m_label, old_count);
    fflush(stderr);
    Kokkos::Impl::throw_runtime_exception(
        "Kokkos::Impl::SharedAllocationRecord failed decrement count");
  }

  return arg_record;
}

#ifdef KOKKOS_ENABLE_DEBUG
// Dump every record in the list rooted at 'root' to stream 's'.  With
// 'detail' set, prints the full link/count information for each record;
// otherwise just address, size, and label.
void SharedAllocationRecord<void, void>::print_host_accessible_records(
    std::ostream& s, const char* const space_name,
    const SharedAllocationRecord* const root, const bool detail) {
  const SharedAllocationRecord<void, void>* r = root;

  char buffer[256];

  if (detail) {
    do {
      // Formatting dependent on sizeof(uintptr_t)
      // NOTE(review): format_string remains uninitialized if uintptr_t
      // matches neither unsigned long nor unsigned long long.
      const char* format_string;

      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
        format_string =
            "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + "
            "%.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
      } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
        format_string =
            "%s addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ "
            "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
      }

      snprintf(buffer, 256, format_string, space_name,
               reinterpret_cast<uintptr_t>(r),
               reinterpret_cast<uintptr_t>(r->m_prev),
               reinterpret_cast<uintptr_t>(r->m_next),
               reinterpret_cast<uintptr_t>(r->m_alloc_ptr), r->m_alloc_size,
               r->use_count(), reinterpret_cast<uintptr_t>(r->m_dealloc),
               r->m_alloc_ptr->m_label);
      s << buffer;
      r = r->m_next;
    } while (r != root);
  } else {
    do {
      if (r->m_alloc_ptr) {
        // Formatting dependent on sizeof(uintptr_t)
        const char* format_string;

        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
          format_string = "%s [ 0x%.12lx + %ld ] %s\n";
        } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
          format_string = "%s [ 0x%.12llx + %ld ] %s\n";
        }

        snprintf(buffer, 256, format_string, space_name,
                 reinterpret_cast<uintptr_t>(r->data()), r->size(),
                 r->m_alloc_ptr->m_label);
      } else {
        snprintf(buffer, 256, "%s [ 0 + 0 ]\n", space_name);
      }
      s << buffer;
      r = r->m_next;
    } while (r != root);
  }
}
#else
void SharedAllocationRecord<void, void>::print_host_accessible_records(
    std::ostream&, const char* const, const SharedAllocationRecord* const,
    const bool) {
  Kokkos::Impl::throw_runtime_exception(
      "Kokkos::Impl::SharedAllocationRecord::print_host_accessible_records"
      " only works with KOKKOS_ENABLE_DEBUG enabled");
}
#endif

} /* namespace Impl */
} /* namespace Kokkos */
diff --git
a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp new file mode 100644 index 0000000000000000000000000000000000000000..64dfd5d33fb8576b1cb5446843edefaaf6d67422 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp @@ -0,0 +1,587 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_SHARED_ALLOC_HPP
#define KOKKOS_SHARED_ALLOC_HPP

#include <Kokkos_Macros.hpp>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Error.hpp>  // Impl::throw_runtime_exception

#include <cstdint>
#include <string>

#if defined(KOKKOS_ENABLE_OPENMPTARGET)
// Base function.
static constexpr bool kokkos_omp_on_host() { return true; }
#if defined(KOKKOS_COMPILER_PGI)
#define KOKKOS_IMPL_IF_ON_HOST if (!__builtin_is_device_code())
#else
// Note: OpenMPTarget enforces C++17 at configure time
#pragma omp begin declare variant match(device = {kind(host)})
static constexpr bool kokkos_omp_on_host() { return true; }
#pragma omp end declare variant

#pragma omp begin declare variant match(device = {kind(nohost)})
static constexpr bool kokkos_omp_on_host() { return false; }
#pragma omp end declare variant

#define KOKKOS_IMPL_IF_ON_HOST if constexpr (kokkos_omp_on_host())
#endif
#else
#define KOKKOS_IMPL_IF_ON_HOST if (true)
#endif

namespace Kokkos {
namespace Impl {

template <class MemorySpace = void, class DestroyFunctor = void>
class SharedAllocationRecord;

template <class MemorySpace>
class SharedAllocationRecordCommon;

// 128-byte header placed immediately before every tracked allocation:
// a back-pointer to the owning record followed by the user-visible label.
class SharedAllocationHeader {
 private:
  using Record = SharedAllocationRecord<void, void>;

  // Label storage fills the header out to 128 bytes total.
  static constexpr unsigned maximum_label_length =
      (1u << 7 /* 128 */) - sizeof(Record*);

  template <class, class>
  friend class SharedAllocationRecord;
  template <class>
  friend class SharedAllocationRecordCommon;
  template <class>
  friend class HostInaccessibleSharedAllocationRecordCommon;

  Record* m_record;
  char m_label[maximum_label_length];

 public:
  /* Given user memory get pointer to the header */
  KOKKOS_INLINE_FUNCTION static const SharedAllocationHeader* get_header(
      void* alloc_ptr) {
    return reinterpret_cast<SharedAllocationHeader*>(
        reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader));
  }

  KOKKOS_INLINE_FUNCTION
  const char* label() const { return m_label; }
};

// Type-erased base record: reference count, deallocation callback, and
// (debug builds only) membership in a doubly-linked tracking list.
template <>
class SharedAllocationRecord<void, void> {
 protected:
  static_assert(sizeof(SharedAllocationHeader) == (1u << 7 /* 128 */),
                "sizeof(SharedAllocationHeader) != 128");

  template <class, class>
  friend class SharedAllocationRecord;
  template <class>
  friend class SharedAllocationRecordCommon;
  template <class>
  friend class HostInaccessibleSharedAllocationRecordCommon;

  using function_type = void (*)(SharedAllocationRecord<void, void>*);

  SharedAllocationHeader* const m_alloc_ptr;
  size_t const m_alloc_size;
  function_type const m_dealloc;
#ifdef KOKKOS_ENABLE_DEBUG
  SharedAllocationRecord* const m_root;
  SharedAllocationRecord* m_prev;
  SharedAllocationRecord* m_next;
#endif
  int m_count;

  SharedAllocationRecord(SharedAllocationRecord&&)      = delete;
  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
  SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete;
  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;

  /**\brief Construct and insert into 'arg_root' tracking set.
   *        use_count is zero.
   */
  SharedAllocationRecord(
#ifdef KOKKOS_ENABLE_DEBUG
      SharedAllocationRecord* arg_root,
#endif
      SharedAllocationHeader* arg_alloc_ptr, size_t arg_alloc_size,
      function_type arg_dealloc);

 private:
  // Per-thread tracking flag; see tracking_enable()/tracking_disable().
  static KOKKOS_THREAD_LOCAL int t_tracking_enabled;

 public:
  virtual std::string get_label() const { return std::string("Unmanaged"); }

#ifdef KOKKOS_IMPL_ENABLE_OVERLOAD_HOST_DEVICE
  /* Device tracking_enabled -- always disabled */
  KOKKOS_IMPL_DEVICE_FUNCTION
  static int tracking_enabled() { return 0; }
#endif

  KOKKOS_IMPL_HOST_FUNCTION
  static int tracking_enabled() {
    KOKKOS_IMPL_IF_ON_HOST { return t_tracking_enabled; }
    else {
      return 0;
    }
  }

  /**\brief A host process thread claims and disables the
   *        shared allocation tracking flag.
   */
  static void tracking_disable() {
    KOKKOS_IMPL_IF_ON_HOST { t_tracking_enabled = 0; }
  }

  /**\brief A host process thread releases and enables the
   *        shared allocation tracking flag.
   */
  static void tracking_enable() {
    KOKKOS_IMPL_IF_ON_HOST { t_tracking_enabled = 1; }
  }

  virtual ~SharedAllocationRecord() = default;

  // Default-constructed record owns no allocation and (debug builds)
  // is its own circularly-linked root.
  SharedAllocationRecord()
      : m_alloc_ptr(nullptr),
        m_alloc_size(0),
        m_dealloc(nullptr)
#ifdef KOKKOS_ENABLE_DEBUG
        ,
        m_root(this),
        m_prev(this),
        m_next(this)
#endif
        ,
        m_count(0) {
  }

  static constexpr unsigned maximum_label_length =
      SharedAllocationHeader::maximum_label_length;

  KOKKOS_INLINE_FUNCTION
  const SharedAllocationHeader* head() const { return m_alloc_ptr; }

  /* User's memory begins at the end of the header */
  KOKKOS_INLINE_FUNCTION
  void* data() const { return reinterpret_cast<void*>(m_alloc_ptr + 1); }

  /* User's memory begins at the end of the header */
  size_t size() const { return m_alloc_size - sizeof(SharedAllocationHeader); }

  /* Cannot be 'constexpr' because 'm_count' is volatile */
  int use_count() const { return *static_cast<const volatile int*>(&m_count); }

#ifdef KOKKOS_IMPL_ENABLE_OVERLOAD_HOST_DEVICE
  /* Device tracking_enabled -- always disabled */
  KOKKOS_IMPL_DEVICE_FUNCTION
  static void increment(SharedAllocationRecord*){};
#endif

  /* Increment use count */
  KOKKOS_IMPL_HOST_FUNCTION
  static void increment(SharedAllocationRecord*);

#ifdef KOKKOS_IMPL_ENABLE_OVERLOAD_HOST_DEVICE
  /* Device tracking_enabled -- always disabled */
  KOKKOS_IMPL_DEVICE_FUNCTION
  static void decrement(SharedAllocationRecord*){};
#endif

  /* Decrement use count. If 1->0 then remove from the tracking list and invoke
   * m_dealloc */
  KOKKOS_IMPL_HOST_FUNCTION
  static SharedAllocationRecord* decrement(SharedAllocationRecord*);

  /* Given a root record and data pointer find the record */
  static SharedAllocationRecord* find(SharedAllocationRecord* const,
                                      void* const);

  /* Sanity check for the whole set of records to which the input record
   * belongs. Locks the set's insert/erase operations until the sanity check is
   * complete.
   */
  static bool is_sane(SharedAllocationRecord*);

  /* Print host-accessible records */
  static void print_host_accessible_records(
      std::ostream&, const char* const space_name,
      const SharedAllocationRecord* const root, const bool detail);
};

// Shared implementation base for the per-memory-space record
// specializations (allocate / deallocate / lookup / printing).
template <class MemorySpace>
class SharedAllocationRecordCommon : public SharedAllocationRecord<void, void> {
 private:
  using derived_t     = SharedAllocationRecord<MemorySpace, void>;
  using record_base_t = SharedAllocationRecord<void, void>;
  derived_t& self() { return *static_cast<derived_t*>(this); }
  derived_t const& self() const { return *static_cast<derived_t const*>(this); }

 protected:
  using record_base_t::record_base_t;

  void _fill_host_accessible_header_info(SharedAllocationHeader& arg_header,
                                         std::string const& arg_label);

  static void deallocate(record_base_t* arg_rec);

 public:
  static auto allocate(MemorySpace const& arg_space,
                       std::string const& arg_label, size_t arg_alloc_size)
      -> derived_t*;
  /**\brief Allocate tracked memory in the space */
  static void* allocate_tracked(MemorySpace const& arg_space,
                                std::string const& arg_alloc_label,
                                size_t arg_alloc_size);
  /**\brief Deallocate tracked memory in the space */
  static void deallocate_tracked(void* arg_alloc_ptr);
  /**\brief Reallocate tracked memory in the space */
  static void* reallocate_tracked(void* arg_alloc_ptr, size_t arg_alloc_size);
  static auto get_record(void* alloc_ptr) -> derived_t*;
  std::string get_label() const;
  static void print_records(std::ostream& s, MemorySpace const&,
                            bool detail = false);
};

// Variant of the above for memory spaces whose allocation headers the
// host cannot dereference directly.
template <class MemorySpace>
class HostInaccessibleSharedAllocationRecordCommon
    : public SharedAllocationRecordCommon<MemorySpace> {
 private:
  using base_t        = SharedAllocationRecordCommon<MemorySpace>;
  using derived_t     = SharedAllocationRecord<MemorySpace, void>;
  using record_base_t = SharedAllocationRecord<void, void>;

 protected:
  using base_t::base_t;

 public:
  static void print_records(std::ostream& s, MemorySpace const&,
                            bool detail = false);
  static auto get_record(void* alloc_ptr) -> derived_t*;
  std::string get_label() const;
};

namespace {

/* Taking the address of this function so make sure it is unique */
template <class MemorySpace, class DestroyFunctor>
void deallocate(SharedAllocationRecord<void, void>* record_ptr) {
  using base_type = SharedAllocationRecord<MemorySpace, void>;
  using this_type = SharedAllocationRecord<MemorySpace, DestroyFunctor>;

  this_type* const ptr =
      static_cast<this_type*>(static_cast<base_type*>(record_ptr));

  // Run the user's destroy functor before releasing the record itself.
  ptr->m_destroy.destroy_shared_allocation();

  delete ptr;
}

}  // namespace

/*
 * Memory space specialization of SharedAllocationRecord< Space , void >
 * requires :
 *
 * SharedAllocationRecord< Space , void > : public SharedAllocationRecord< void
 * , void >
 * {
 *   // delete allocated user memory via static_cast to this type.
+ * static void deallocate( const SharedAllocationRecord<void,void> * ); + * Space m_space ; + * } + */ +template <class MemorySpace, class DestroyFunctor> +class SharedAllocationRecord + : public SharedAllocationRecord<MemorySpace, void> { + private: + SharedAllocationRecord(const MemorySpace& arg_space, + const std::string& arg_label, const size_t arg_alloc) + /* Allocate user memory as [ SharedAllocationHeader , user_memory ] */ + : SharedAllocationRecord<MemorySpace, void>( + arg_space, arg_label, arg_alloc, + &Kokkos::Impl::deallocate<MemorySpace, DestroyFunctor>), + m_destroy() {} + + SharedAllocationRecord() = delete; + SharedAllocationRecord(const SharedAllocationRecord&) = delete; + SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + + public: + DestroyFunctor m_destroy; + + // Allocate with a zero use count. Incrementing the use count from zero to + // one inserts the record into the tracking list. Decrementing the count from + // one to zero removes from the trakcing list and deallocates. + KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( + const MemorySpace& arg_space, const std::string& arg_label, + const size_t arg_alloc) { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + return new SharedAllocationRecord(arg_space, arg_label, arg_alloc); +#else + (void)arg_space; + (void)arg_label; + (void)arg_alloc; + return (SharedAllocationRecord*)0; +#endif + } +}; + +template <class MemorySpace> +class SharedAllocationRecord<MemorySpace, void> + : public SharedAllocationRecord<void, void> {}; + +union SharedAllocationTracker { + private: + using Record = SharedAllocationRecord<void, void>; + + enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul }; + + // The allocation record resides in Host memory space + uintptr_t m_record_bits; + Record* m_record; + + public: + // Use macros instead of inline functions to reduce + // pressure on compiler optimization by reducing + // number of symbols and inline functions. 
+ +#if defined(KOKKOS_IMPL_ENABLE_OVERLOAD_HOST_DEVICE) + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED Record::tracking_enabled() + +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_CONDITION \ + (!(m_record_bits & DO_NOT_DEREF_FLAG)) +#else +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_CONDITION (0) +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT \ + if (KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_CONDITION) \ + KOKKOS_IMPL_IF_ON_HOST Record::increment(m_record); + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT \ + if (KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_CONDITION) \ + KOKKOS_IMPL_IF_ON_HOST Record::decrement(m_record); + +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED Record::tracking_enabled() + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT \ + if (!(m_record_bits & DO_NOT_DEREF_FLAG)) \ + KOKKOS_IMPL_IF_ON_HOST Record::increment(m_record); + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT \ + if (!(m_record_bits & DO_NOT_DEREF_FLAG)) \ + KOKKOS_IMPL_IF_ON_HOST Record::decrement(m_record); + +#else + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED 0 + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT /* */ + +#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT /* */ + +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, \ + override_tracking) \ + (((!override_tracking) || (rhs.m_record_bits & DO_NOT_DEREF_FLAG) || \ + (!KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED)) \ + ? 
rhs.m_record_bits | DO_NOT_DEREF_FLAG \ + : rhs.m_record_bits) + + /** \brief Assign a specialized record */ + inline void assign_allocated_record_to_uninitialized(Record* arg_record) { + if (arg_record) { + Record::increment(m_record = arg_record); + } else { + m_record_bits = DO_NOT_DEREF_FLAG; + } + } + + template <class MemorySpace> + constexpr SharedAllocationRecord<MemorySpace, void>* get_record() const + noexcept { + return (m_record_bits & DO_NOT_DEREF_FLAG) + ? nullptr + : static_cast<SharedAllocationRecord<MemorySpace, void>*>( + m_record); + } + + template <class MemorySpace> + std::string get_label() const { + return (m_record_bits == DO_NOT_DEREF_FLAG) + ? std::string() + : reinterpret_cast<SharedAllocationRecord<MemorySpace, void>*>( + m_record_bits & ~DO_NOT_DEREF_FLAG) + ->get_label(); + } + + KOKKOS_INLINE_FUNCTION + int use_count() const { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + Record* const tmp = + reinterpret_cast<Record*>(m_record_bits & ~DO_NOT_DEREF_FLAG); + return (tmp ? tmp->use_count() : 0); +#else + return 0; +#endif + } + + KOKKOS_INLINE_FUNCTION + bool has_record() const { + return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0; + } + + KOKKOS_FORCEINLINE_FUNCTION + void clear() { + // If this is tracking then must decrement + KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT + // Reset to default constructed value. 
+ m_record_bits = DO_NOT_DEREF_FLAG; + } + + // Copy: + KOKKOS_FORCEINLINE_FUNCTION + ~SharedAllocationTracker(){KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT} + + KOKKOS_FORCEINLINE_FUNCTION constexpr SharedAllocationTracker() + : m_record_bits(DO_NOT_DEREF_FLAG) {} + + // Move: + + KOKKOS_FORCEINLINE_FUNCTION + SharedAllocationTracker(SharedAllocationTracker&& rhs) + : m_record_bits(rhs.m_record_bits) { + rhs.m_record_bits = DO_NOT_DEREF_FLAG; + } + + KOKKOS_FORCEINLINE_FUNCTION + SharedAllocationTracker& operator=(SharedAllocationTracker&& rhs) { + auto swap_tmp = m_record_bits; + m_record_bits = rhs.m_record_bits; + rhs.m_record_bits = swap_tmp; + return *this; + } + + // Copy: + + KOKKOS_FORCEINLINE_FUNCTION + SharedAllocationTracker(const SharedAllocationTracker& rhs) + : m_record_bits(KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS( + rhs, true)){KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT} + + /** \brief Copy construction may disable tracking. */ + KOKKOS_FORCEINLINE_FUNCTION SharedAllocationTracker( + const SharedAllocationTracker& rhs, const bool enable_tracking) + : m_record_bits(KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS( + rhs, + enable_tracking)){KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT} + + KOKKOS_FORCEINLINE_FUNCTION SharedAllocationTracker + & + operator=(const SharedAllocationTracker& rhs) { + // If this is tracking then must decrement + KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT + m_record_bits = KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, true); + KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT + return *this; + } + + /* The following functions (assign_direct and assign_force_disable) + * are the result of deconstructing the + * KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS macro. 
This + * allows the caller to do the check for tracking enabled and managed + * apart from the assignement of the record because the tracking + * enabled / managed question may be important for other tasks as well + */ + + /** \brief Copy assignment without the carry bits logic + * This assumes that externally defined tracking is explicitly enabled + */ + KOKKOS_FORCEINLINE_FUNCTION + void assign_direct(const SharedAllocationTracker& rhs) { + KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT + m_record_bits = rhs.m_record_bits; + KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT + } + + /** \brief Copy assignment without the increment + * we cannot assume that current record is unmanaged + * but with externally defined tracking explicitly disabled + * we can go straight to the do not deref flag */ + KOKKOS_FORCEINLINE_FUNCTION + void assign_force_disable(const SharedAllocationTracker& rhs) { + KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT + m_record_bits = rhs.m_record_bits | DO_NOT_DEREF_FLAG; + } + + // report if record is tracking or not + KOKKOS_FORCEINLINE_FUNCTION + bool tracking_enabled() { return (!(m_record_bits & DO_NOT_DEREF_FLAG)); } + + /** \brief Copy assignment may disable tracking */ + KOKKOS_FORCEINLINE_FUNCTION + void assign(const SharedAllocationTracker& rhs, const bool enable_tracking) { + KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT + m_record_bits = + KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, enable_tracking); + KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT + } + +#undef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED +#undef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT +#undef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a6ee1b3f9eb11ddfbfd2c1ce5dd7a213bd25dda9 
--- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp @@ -0,0 +1,287 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_SHAREDALLOC_TIMPL_HPP +#define KOKKOS_IMPL_SHAREDALLOC_TIMPL_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <impl/Kokkos_SharedAlloc.hpp> + +#include <Kokkos_HostSpace.hpp> // used with HostInaccessible specializations + +#include <string> // std::string +#include <cstring> // strncpy +#include <iostream> // ostream + +namespace Kokkos { +namespace Impl { + +template <class MemorySpace> +auto SharedAllocationRecordCommon<MemorySpace>::allocate( + MemorySpace const& arg_space, std::string const& arg_label, + size_t arg_alloc_size) -> derived_t* { + return new derived_t(arg_space, arg_label, arg_alloc_size); +} + +template <class MemorySpace> +void* SharedAllocationRecordCommon<MemorySpace>::allocate_tracked( + const MemorySpace& arg_space, const std::string& arg_alloc_label, + size_t arg_alloc_size) { + if (!arg_alloc_size) return nullptr; + + SharedAllocationRecord* const r = + allocate(arg_space, arg_alloc_label, arg_alloc_size); + + record_base_t::increment(r); + + return r->data(); +} + +template <class MemorySpace> +void SharedAllocationRecordCommon<MemorySpace>::deallocate( + SharedAllocationRecordCommon::record_base_t* arg_rec) { + delete static_cast<derived_t*>(arg_rec); +} + +template <class MemorySpace> +void SharedAllocationRecordCommon<MemorySpace>::deallocate_tracked( + void* arg_alloc_ptr) { + if (arg_alloc_ptr != nullptr) { + SharedAllocationRecord* const r = derived_t::get_record(arg_alloc_ptr); + record_base_t::decrement(r); + } +} + +template <class MemorySpace> +void* SharedAllocationRecordCommon<MemorySpace>::reallocate_tracked( + void* arg_alloc_ptr, size_t arg_alloc_size) { + derived_t* const r_old = derived_t::get_record(arg_alloc_ptr); + derived_t* const r_new = + allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); + + Kokkos::Impl::DeepCopy<MemorySpace, 
MemorySpace>( + r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); + + record_base_t::increment(r_new); + record_base_t::decrement(r_old); + + return r_new->data(); +} + +template <class MemorySpace> +auto SharedAllocationRecordCommon<MemorySpace>::get_record(void* alloc_ptr) + -> derived_t* { + using Header = SharedAllocationHeader; + + Header const* const h = alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; + + if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::SharedAllocationRecordCommon<") + + std::string(MemorySpace::name()) + + std::string(">::get_record() ERROR")); + } + + return static_cast<derived_t*>(h->m_record); +} + +template <class MemorySpace> +std::string SharedAllocationRecordCommon<MemorySpace>::get_label() const { + return std::string(record_base_t::head()->m_label); +} + +template <class MemorySpace> +void SharedAllocationRecordCommon<MemorySpace>:: + _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, + std::string const& arg_label) { + // Fill in the Header information, directly accessible on the host + + arg_header.m_record = &self(); + + strncpy(arg_header.m_label, arg_label.c_str(), + SharedAllocationHeader::maximum_label_length); + // Set last element zero, in case c_str is too long + arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; +} + +template <class MemorySpace> +void SharedAllocationRecordCommon<MemorySpace>::print_records( + std::ostream& s, const MemorySpace&, bool detail) { + (void)s; + (void)detail; +#ifdef KOKKOS_ENABLE_DEBUG + SharedAllocationRecord<void, void>::print_host_accessible_records( + s, MemorySpace::name(), &derived_t::s_root_record, detail); +#else + Kokkos::Impl::throw_runtime_exception( + std::string("SharedAllocationHeader<") + + std::string(MemorySpace::name()) + + std::string( + ">::print_records only works with KOKKOS_ENABLE_DEBUG enabled")); +#endif +} + +template <class 
MemorySpace> +void HostInaccessibleSharedAllocationRecordCommon<MemorySpace>::print_records( + std::ostream& s, const MemorySpace&, bool detail) { + (void)s; + (void)detail; +#ifdef KOKKOS_ENABLE_DEBUG + SharedAllocationRecord<void, void>* r = &derived_t::s_root_record; + + char buffer[256]; + + SharedAllocationHeader head; + + if (detail) { + do { + if (r->m_alloc_ptr) { + Kokkos::Impl::DeepCopy<HostSpace, MemorySpace>( + &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); + } else { + head.m_label[0] = 0; + } + + // Formatting dependent on sizeof(uintptr_t) + const char* format_string; + + if (sizeof(uintptr_t) == sizeof(unsigned long)) { + format_string = + "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx " + "+ %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"; + } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { + format_string = + "%s addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ " + "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n"; + } + + snprintf(buffer, 256, format_string, MemorySpace::execution_space::name(), + reinterpret_cast<uintptr_t>(r), + reinterpret_cast<uintptr_t>(r->m_prev), + reinterpret_cast<uintptr_t>(r->m_next), + reinterpret_cast<uintptr_t>(r->m_alloc_ptr), r->m_alloc_size, + r->m_count, reinterpret_cast<uintptr_t>(r->m_dealloc), + head.m_label); + s << buffer; + r = r->m_next; + } while (r != &derived_t::s_root_record); + } else { + do { + if (r->m_alloc_ptr) { + Kokkos::Impl::DeepCopy<HostSpace, MemorySpace>( + &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); + + // Formatting dependent on sizeof(uintptr_t) + const char* format_string; + + if (sizeof(uintptr_t) == sizeof(unsigned long)) { + format_string = "%s [ 0x%.12lx + %ld ] %s\n"; + } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { + format_string = "%s [ 0x%.12llx + %ld ] %s\n"; + } + + snprintf( + buffer, 256, format_string, MemorySpace::execution_space::name(), + reinterpret_cast<uintptr_t>(r->data()), r->size(), 
head.m_label); + } else { + snprintf(buffer, 256, "%s [ 0 + 0 ]\n", + MemorySpace::execution_space::name()); + } + s << buffer; + r = r->m_next; + } while (r != &derived_t::s_root_record); + } +#else + Kokkos::Impl::throw_runtime_exception( + std::string("SharedAllocationHeader<") + + std::string(MemorySpace::name()) + + std::string( + ">::print_records only works with KOKKOS_ENABLE_DEBUG enabled")); +#endif +} + +template <class MemorySpace> +auto HostInaccessibleSharedAllocationRecordCommon<MemorySpace>::get_record( + void* alloc_ptr) -> derived_t* { + // Copy the header from the allocation + SharedAllocationHeader head; + + SharedAllocationHeader const* const head_cuda = + alloc_ptr ? SharedAllocationHeader::get_header(alloc_ptr) : nullptr; + + if (alloc_ptr) { + Kokkos::Impl::DeepCopy<HostSpace, MemorySpace>( + &head, head_cuda, sizeof(SharedAllocationHeader)); + } + + derived_t* const record = + alloc_ptr ? static_cast<derived_t*>(head.m_record) : nullptr; + + if (!alloc_ptr || record->m_alloc_ptr != head_cuda) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::SharedAllocationRecord<") + + std::string(MemorySpace::name()) + + std::string(", void>::get_record ERROR")); + } + + return record; +} + +template <class MemorySpace> +std::string +HostInaccessibleSharedAllocationRecordCommon<MemorySpace>::get_label() const { + SharedAllocationHeader header; + + Kokkos::Impl::DeepCopy<Kokkos::HostSpace, MemorySpace>( + &header, this->record_base_t::head(), sizeof(SharedAllocationHeader)); + + return std::string(header.m_label); +} + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_IMPL_SHAREDALLOC_TIMPL_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0773a0914befe4e9db3b3b79ae3c446bcb0f3ad1 --- /dev/null +++ 
b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp @@ -0,0 +1,496 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SIMPLETASKSCHEDULER_HPP +#define KOKKOS_SIMPLETASKSCHEDULER_HPP + +//---------------------------------------------------------------------------- + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_TaskScheduler_fwd.hpp> +//---------------------------------------------------------------------------- + +#include <Kokkos_MemoryPool.hpp> +#include <impl/Kokkos_Tags.hpp> + +#include <Kokkos_Future.hpp> +#include <impl/Kokkos_TaskQueue.hpp> +#include <impl/Kokkos_SingleTaskQueue.hpp> +#include <impl/Kokkos_MultipleTaskQueue.hpp> +#include <impl/Kokkos_TaskQueueMultiple.hpp> +#include <impl/Kokkos_TaskPolicyData.hpp> +#include <impl/Kokkos_TaskTeamMember.hpp> +#include <impl/Kokkos_EBO.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +namespace Impl { + +// TODO @tasking @cleanup move this +template <class T> +struct DefaultDestroy { + T* managed_object; + KOKKOS_FUNCTION + void destroy_shared_allocation() { managed_object->~T(); } +}; + +} // namespace Impl + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class ExecSpace, class QueueType> +// requires ExecutionSpace<ExecSpace> && TaskQueue<QueueType> +class SimpleTaskScheduler + : public Impl::TaskSchedulerBase, + private Impl::ExecutionSpaceInstanceStorage<ExecSpace>, + private Impl::MemorySpaceInstanceStorage< + typename QueueType::memory_space>, + private Impl::NoUniqueAddressMemberEmulation< + typename QueueType::team_scheduler_info_type> { + public: + // TODO @tasking @generalization (maybe?) 
don't force QueueType to be complete + // here + + using scheduler_type = SimpleTaskScheduler; // tag as scheduler concept + using execution_space = ExecSpace; + using task_queue_type = QueueType; + using memory_space = typename task_queue_type::memory_space; + using memory_pool = typename task_queue_type::memory_pool; + + using team_scheduler_info_type = + typename task_queue_type::team_scheduler_info_type; + using task_scheduling_info_type = + typename task_queue_type::task_scheduling_info_type; + using specialization = Impl::TaskQueueSpecialization<SimpleTaskScheduler>; + using member_type = typename specialization::member_type; + + template <class Functor> + using runnable_task_type = + typename QueueType::template runnable_task_type<Functor, + SimpleTaskScheduler>; + + using task_base_type = typename task_queue_type::task_base_type; + using runnable_task_base_type = + typename task_queue_type::runnable_task_base_type; + + using task_queue_traits = typename QueueType::task_queue_traits; + + template <class ValueType> + using future_type = Kokkos::BasicFuture<ValueType, SimpleTaskScheduler>; + template <class FunctorType> + using future_type_for_functor = future_type<typename FunctorType::value_type>; + + private: + template <typename, typename> + friend class BasicFuture; + + using track_type = Kokkos::Impl::SharedAllocationTracker; + using execution_space_storage = + Impl::ExecutionSpaceInstanceStorage<execution_space>; + using memory_space_storage = Impl::MemorySpaceInstanceStorage<memory_space>; + using team_scheduler_info_storage = + Impl::NoUniqueAddressMemberEmulation<team_scheduler_info_type>; + + track_type m_track; + task_queue_type* m_queue = nullptr; + + KOKKOS_INLINE_FUNCTION + static constexpr task_base_type* _get_task_ptr(std::nullptr_t) { + return nullptr; + } + + template <class ValueType> + KOKKOS_INLINE_FUNCTION static constexpr task_base_type* _get_task_ptr( + future_type<ValueType>&& f) { + return f.m_task; + } + + template <int TaskEnum, 
class DepTaskType, class FunctorType> + KOKKOS_FUNCTION future_type_for_functor< + typename std::decay<FunctorType>::type> + _spawn_impl( + DepTaskType arg_predecessor_task, TaskPriority arg_priority, + typename runnable_task_base_type::function_type apply_function_ptr, + typename runnable_task_base_type::destroy_type /*destroy_function_ptr*/, + FunctorType&& functor) { + KOKKOS_EXPECTS(m_queue != nullptr); + + using functor_future_type = + future_type_for_functor<typename std::decay<FunctorType>::type>; + using task_type = + typename task_queue_type::template runnable_task_type<FunctorType, + scheduler_type>; + + // Reference count starts at two: + // +1 for the matching decrement when task is complete + // +1 for the future + auto& runnable_task = *m_queue->template allocate_and_construct<task_type>( + /* functor = */ std::forward<FunctorType>(functor), + /* apply_function_ptr = */ apply_function_ptr, + /* task_type = */ static_cast<Impl::TaskType>(TaskEnum), + /* priority = */ arg_priority, + /* queue_base = */ m_queue, + /* initial_reference_count = */ 2); + + if (arg_predecessor_task != nullptr) { + m_queue->initialize_scheduling_info_from_predecessor( + runnable_task, *arg_predecessor_task); + runnable_task.set_predecessor(*arg_predecessor_task); + arg_predecessor_task->decrement_and_check_reference_count(); + } else { + m_queue->initialize_scheduling_info_from_team_scheduler_info( + runnable_task, team_scheduler_info()); + } + + auto rv = functor_future_type(&runnable_task); + + Kokkos::memory_fence(); // fence to ensure dependent stores are visible + + m_queue->schedule_runnable(std::move(runnable_task), team_scheduler_info()); + // note that task may be already completed even here, so don't touch it + // again + + return rv; + } + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="Constructors, destructor, and assignment"> {{{2 + + SimpleTaskScheduler() = default; + + explicit 
SimpleTaskScheduler(execution_space const& arg_execution_space, + memory_space const& arg_memory_space, + memory_pool const& arg_memory_pool) + : execution_space_storage(arg_execution_space), + memory_space_storage(arg_memory_space) { + // Ask the task queue how much space it needs (usually will just be + // sizeof(task_queue_type), but some queues may need additional storage + // dependent on runtime conditions or properties of the execution space) + auto const allocation_size = task_queue_type::task_queue_allocation_size( + arg_execution_space, arg_memory_space, arg_memory_pool); + + // TODO @tasking @generalization DSH better encapsulation of the + // SharedAllocationRecord pattern + using record_type = + Impl::SharedAllocationRecord<memory_space, + Impl::DefaultDestroy<task_queue_type> >; + + // Allocate space for the task queue + auto* record = record_type::allocate(memory_space(), "Kokkos::TaskQueue", + allocation_size); + m_queue = new (record->data()) + task_queue_type(arg_execution_space, arg_memory_space, arg_memory_pool); + record->m_destroy.managed_object = m_queue; + m_track.assign_allocated_record_to_uninitialized(record); + } + + explicit SimpleTaskScheduler(execution_space const& arg_execution_space, + memory_pool const& pool) + : SimpleTaskScheduler(arg_execution_space, memory_space{}, + pool) { /* forwarding ctor, must be empty */ + } + + explicit SimpleTaskScheduler(memory_pool const& pool) + : SimpleTaskScheduler(execution_space{}, memory_space{}, + pool) { /* forwarding ctor, must be empty */ + } + + SimpleTaskScheduler(memory_space const& arg_memory_space, + size_t const mempool_capacity, + unsigned const mempool_min_block_size, // = 1u << 6 + unsigned const mempool_max_block_size, // = 1u << 10 + unsigned const mempool_superblock_size // = 1u << 12 + ) + : SimpleTaskScheduler( + execution_space{}, arg_memory_space, + memory_pool( + arg_memory_space, mempool_capacity, mempool_min_block_size, + mempool_max_block_size, + 
mempool_superblock_size)) { /* forwarding ctor, must be empty */ + } + + // </editor-fold> end Constructors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- + + // Note that this is an expression of shallow constness + KOKKOS_INLINE_FUNCTION + task_queue_type& queue() const { + KOKKOS_EXPECTS(m_queue != nullptr); + return *m_queue; + } + + KOKKOS_INLINE_FUNCTION + SimpleTaskScheduler get_team_scheduler(int rank_in_league) const noexcept { + KOKKOS_EXPECTS(m_queue != nullptr); + auto rv = SimpleTaskScheduler{*this}; + rv.team_scheduler_info() = + m_queue->initial_team_scheduler_info(rank_in_league); + return rv; + } + + KOKKOS_INLINE_FUNCTION + execution_space const& get_execution_space() const { + return this->execution_space_instance(); + } + + KOKKOS_INLINE_FUNCTION + team_scheduler_info_type& team_scheduler_info() & { + return this->team_scheduler_info_storage::no_unique_address_data_member(); + } + + KOKKOS_INLINE_FUNCTION + team_scheduler_info_type const& team_scheduler_info() const& { + return this->team_scheduler_info_storage::no_unique_address_data_member(); + } + + //---------------------------------------------------------------------------- + + template <int TaskEnum, typename DepFutureType, typename FunctorType> + KOKKOS_FUNCTION static Kokkos::BasicFuture<typename FunctorType::value_type, + scheduler_type> + spawn(Impl::TaskPolicyWithScheduler<TaskEnum, scheduler_type, DepFutureType>&& + arg_policy, + typename runnable_task_base_type::function_type arg_function, + typename runnable_task_base_type::destroy_type arg_destroy, + FunctorType&& arg_functor) { + return std::move(arg_policy.scheduler()) + .template _spawn_impl<TaskEnum>( + _get_task_ptr(std::move(arg_policy.predecessor())), + arg_policy.priority(), arg_function, arg_destroy, + std::forward<FunctorType>(arg_functor)); + } + + template <int TaskEnum, typename DepFutureType, typename FunctorType> + KOKKOS_FUNCTION 
Kokkos::BasicFuture<typename FunctorType::value_type, + scheduler_type> + spawn(Impl::TaskPolicyWithPredecessor<TaskEnum, DepFutureType>&& arg_policy, + FunctorType&& arg_functor) { + static_assert(std::is_same<typename DepFutureType::scheduler_type, + scheduler_type>::value, + "Can't create a task policy from a scheduler and a future " + "from a different scheduler"); + + using task_type = runnable_task_type<FunctorType>; + typename task_type::function_type const ptr = task_type::apply; + typename task_type::destroy_type const dtor = task_type::destroy; + + return _spawn_impl<TaskEnum>(std::move(arg_policy).predecessor().m_task, + arg_policy.priority(), ptr, dtor, + std::forward<FunctorType>(arg_functor)); + } + + template <class FunctorType, class ValueType, class Scheduler> + KOKKOS_FUNCTION static void respawn( + FunctorType* functor, + BasicFuture<ValueType, Scheduler> const& predecessor, + TaskPriority priority = TaskPriority::Regular) { + using task_type = + typename task_queue_type::template runnable_task_type<FunctorType, + scheduler_type>; + + auto& task = *static_cast<task_type*>(functor); + + KOKKOS_EXPECTS(!task.get_respawn_flag()); + + task.set_priority(priority); + task.set_predecessor(*predecessor.m_task); + task.set_respawn_flag(true); + } + + template <class FunctorType> + KOKKOS_FUNCTION static void respawn( + FunctorType* functor, scheduler_type const&, + TaskPriority priority = TaskPriority::Regular) { + using task_type = + typename task_queue_type::template runnable_task_type<FunctorType, + scheduler_type>; + + auto& task = *static_cast<task_type*>(functor); + + KOKKOS_EXPECTS(!task.get_respawn_flag()); + + task.set_priority(priority); + KOKKOS_ASSERT(!task.has_predecessor()); + task.set_respawn_flag(true); + } + + template <class ValueType> + KOKKOS_FUNCTION future_type<void> when_all( + BasicFuture<ValueType, scheduler_type> const predecessors[], + int n_predecessors) { + // TODO @tasking @generalization DSH propagate scheduling info + + 
using task_type = typename task_queue_type::aggregate_task_type; + + future_type<void> rv; + + if (n_predecessors > 0) { + task_queue_type* queue_ptr = nullptr; + + // Loop over the predecessors to find the queue and increment the + // reference counts + for (int i_pred = 0; i_pred < n_predecessors; ++i_pred) { + auto* predecessor_task_ptr = predecessors[i_pred].m_task; + + if (predecessor_task_ptr != nullptr) { + // TODO @tasking @cleanup DSH figure out when this is allowed to be + // nullptr (if at all anymore) + + // Increment reference count to track subsequent assignment. + // TODO @tasking @optimization DSH figure out if this reference count + // increment is necessary + predecessor_task_ptr->increment_reference_count(); + + // TODO @tasking @cleanup DSH we should just set a boolean here + // instead to make this more readable + queue_ptr = m_queue; + } + + } // end loop over predecessors + + // This only represents a non-ready future if at least one of the + // predecessors has a task (and thus, a queue) + if (queue_ptr != nullptr) { + auto& q = *queue_ptr; + + auto* aggregate_task_ptr = + q.template allocate_and_construct_with_vla_emulation< + task_type, task_base_type*>( + /* n_vla_entries = */ n_predecessors, + /* aggregate_predecessor_count = */ n_predecessors, + /* queue_base = */ &q, + /* initial_reference_count = */ 2); + + rv = future_type<void>(aggregate_task_ptr); + + for (int i_pred = 0; i_pred < n_predecessors; ++i_pred) { + aggregate_task_ptr->vla_value_at(i_pred) = + predecessors[i_pred].m_task; + } + + Kokkos::memory_fence(); // we're touching very questionable memory, so + // be sure to fence + + q.schedule_aggregate(std::move(*aggregate_task_ptr), + team_scheduler_info()); + // the aggregate may be processed at any time, so don't touch it after + // this + } + } + + return rv; + } + + template <class F> + KOKKOS_FUNCTION future_type<void> when_all(int n_calls, F&& func) { + // TODO @tasking @generalization DSH propagate scheduling info? 
+
+    // later this should be std::invoke_result_t
+    using generated_type = decltype(func(0));
+    using task_type = typename task_queue_type::aggregate_task_type;
+
+    static_assert(is_future<generated_type>::value,
+                  "when_all function must return a Kokkos future (an instance "
+                  "of Kokkos::BasicFuture)");
+    static_assert(
+        std::is_base_of<scheduler_type,
+                        typename generated_type::scheduler_type>::value,
+        "when_all function must return a Kokkos::BasicFuture of a compatible "
+        "scheduler type");
+
+    auto* aggregate_task =
+        m_queue->template allocate_and_construct_with_vla_emulation<
+            task_type, task_base_type*>(
+            /* n_vla_entries = */ n_calls,
+            /* aggregate_predecessor_count = */ n_calls,
+            /* queue_base = */ m_queue,
+            /* initial_reference_count = */ 2);
+
+    auto rv = future_type<void>(aggregate_task);
+
+    for (int i_call = 0; i_call < n_calls; ++i_call) {
+      auto generated_future = func(i_call);
+
+      if (generated_future.m_task != nullptr) {
+        generated_future.m_task->increment_reference_count();
+        aggregate_task->vla_value_at(i_call) = generated_future.m_task;
+
+        KOKKOS_ASSERT(m_queue ==
+                          generated_future.m_task->ready_queue_base_ptr() &&
+                      "Queue mismatch in when_all");
+      }
+    }
+
+    Kokkos::memory_fence();
+
+    m_queue->schedule_aggregate(std::move(*aggregate_task),
+                                team_scheduler_info());
+    // This could complete at any moment, so don't touch anything after this
+
+    return rv;
+  }
+};
+
+template <class ExecSpace, class QueueType>
+inline void wait(SimpleTaskScheduler<ExecSpace, QueueType> const& scheduler) {
+  using scheduler_type = SimpleTaskScheduler<ExecSpace, QueueType>;
+  scheduler_type::specialization::execute(scheduler);
+}
+
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef 
KOKKOS_SIMPLETASKSCHEDULER_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a0eccffb627f39f1810978aa0d3ab25c9458e4e8 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp @@ -0,0 +1,188 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_SINGLETASKQUEUE_HPP +#define KOKKOS_IMPL_SINGLETASKQUEUE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_MemoryPool.hpp> + +#include <impl/Kokkos_TaskBase.hpp> +#include <impl/Kokkos_TaskResult.hpp> + +#include <impl/Kokkos_TaskQueueMemoryManager.hpp> +#include <impl/Kokkos_TaskQueueCommon.hpp> +#include <impl/Kokkos_Memory_Fence.hpp> +#include <impl/Kokkos_Atomic_Increment.hpp> +#include <impl/Kokkos_OptionalRef.hpp> +#include <impl/Kokkos_LIFO.hpp> + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class ExecSpace, class MemorySpace, class TaskQueueTraits, + class MemoryPool> +class SingleTaskQueue + : public TaskQueueMemoryManager<ExecSpace, MemorySpace, MemoryPool>, + public TaskQueueCommonMixin<SingleTaskQueue< + ExecSpace, MemorySpace, TaskQueueTraits, 
MemoryPool>> { + private: + using base_t = TaskQueueMemoryManager<ExecSpace, MemorySpace, MemoryPool>; + using common_mixin_t = TaskQueueCommonMixin<SingleTaskQueue>; + + struct EmptyTeamSchedulerInfo {}; + struct EmptyTaskSchedulingInfo {}; + + public: + using task_queue_type = SingleTaskQueue; // mark as task_queue concept + using task_queue_traits = TaskQueueTraits; + using task_base_type = TaskNode<TaskQueueTraits>; + using ready_queue_type = + typename TaskQueueTraits::template ready_queue_type<task_base_type>; + + using team_scheduler_info_type = EmptyTeamSchedulerInfo; + using task_scheduling_info_type = EmptyTaskSchedulingInfo; + + using runnable_task_base_type = RunnableTaskBase<TaskQueueTraits>; + + template <class Functor, class Scheduler> + // requires TaskScheduler<Scheduler> && TaskFunctor<Functor> + using runnable_task_type = + RunnableTask<task_queue_traits, Scheduler, typename Functor::value_type, + Functor>; + + using aggregate_task_type = + AggregateTask<task_queue_traits, task_scheduling_info_type>; + + // Number of allowed priorities + static constexpr int NumQueue = 3; + + private: + ready_queue_type m_ready_queues[NumQueue][2]; + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="Constructors, destructors, and assignment"> {{{2 + + SingleTaskQueue() = delete; + SingleTaskQueue(SingleTaskQueue const&) = delete; + SingleTaskQueue(SingleTaskQueue&&) = delete; + SingleTaskQueue& operator=(SingleTaskQueue const&) = delete; + SingleTaskQueue& operator=(SingleTaskQueue&&) = delete; + + explicit SingleTaskQueue(typename base_t::execution_space const&, + typename base_t::memory_space const&, + typename base_t::memory_pool const& arg_memory_pool) + : base_t(arg_memory_pool) {} + + ~SingleTaskQueue() { + for (int i_priority = 0; i_priority < NumQueue; ++i_priority) { + KOKKOS_EXPECTS(m_ready_queues[i_priority][TaskTeam].empty()); + 
KOKKOS_EXPECTS(m_ready_queues[i_priority][TaskSingle].empty()); + } + } + + // </editor-fold> end Constructors, destructors, and assignment }}}2 + //---------------------------------------------------------------------------- + + KOKKOS_FUNCTION + void schedule_runnable(runnable_task_base_type&& task, + team_scheduler_info_type const& info) { + this->schedule_runnable_to_queue( + std::move(task), + m_ready_queues[int(task.get_priority())][int(task.get_task_type())], + info); + // Task may be enqueued and may be run at any point; don't touch it (hence + // the use of move semantics) + } + + KOKKOS_FUNCTION + OptionalRef<task_base_type> pop_ready_task( + team_scheduler_info_type const& /*info*/) { + OptionalRef<task_base_type> return_value; + // always loop in order of priority first, then prefer team tasks over + // single tasks + for (int i_priority = 0; i_priority < NumQueue; ++i_priority) { + // Check for a team task with this priority + return_value = m_ready_queues[i_priority][TaskTeam].pop(); + if (return_value) return return_value; + + // Check for a single task with this priority + return_value = m_ready_queues[i_priority][TaskSingle].pop(); + if (return_value) return return_value; + } + // if nothing was found, return a default-constructed (empty) OptionalRef + return return_value; + } + + KOKKOS_INLINE_FUNCTION + constexpr team_scheduler_info_type initial_team_scheduler_info(int) const + noexcept { + return {}; + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_SINGLETASKQUEUE_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8ac034e249f1c1d1a4309003ee77c0cbe38682de 
--- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp @@ -0,0 +1,141 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + +#include <Kokkos_Atomic.hpp> +#include <impl/Kokkos_Spinwait.hpp> +#include <impl/Kokkos_BitOps.hpp> + +#include <thread> +#if defined(_WIN32) +#include <process.h> +#include <winsock2.h> +#include <windows.h> +#endif + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +void host_thread_yield(const uint32_t i, const WaitMode mode) { + static constexpr uint32_t sleep_limit = 1 << 13; + static constexpr uint32_t yield_limit = 1 << 12; + + const int c = Kokkos::log2(i); + + if (WaitMode::ROOT != mode) { + if (sleep_limit < i) { + // Attempt to put the thread to sleep for 'c' microseconds + std::this_thread::yield(); + std::this_thread::sleep_for(std::chrono::microseconds(c)); + } + + else if (mode == WaitMode::PASSIVE || yield_limit < i) { + // Attempt to yield thread resources to runtime + std::this_thread::yield(); + } +#if defined(KOKKOS_ENABLE_ASM) + + else if ((1u << 4) < i) { + + // Insert a few no-ops to quiet the thread: + + for (int k = 0; k < c; ++k) { +#if defined(__amd64) || defined(__amd64__) || defined(__x86_64) || \ + defined(__x86_64__) +#if !defined(_WIN32) /* IS NOT Microsoft Windows */ + asm volatile("nop\n"); +#else + __asm__ __volatile__("nop\n"); +#endif +#elif defined(__PPC64__) + asm volatile("nop\n"); +#endif + } + } +#endif /* defined( KOKKOS_ENABLE_ASM ) */ + } +#if defined(KOKKOS_ENABLE_ASM) + else if ((1u << 3) < i) { + // no-ops for root thread + for (int k = 0; k < c; ++k) { +#if defined(__amd64) || defined(__amd64__) || defined(__x86_64) || \ + defined(__x86_64__) +#if !defined(_WIN32) /* IS NOT Microsoft Windows */ + asm volatile("nop\n"); +#else + __asm__ __volatile__("nop\n"); +#endif +#elif defined(__PPC64__) + asm volatile("nop\n"); +#endif + } + } + 
+ { + // Insert memory pause +#if defined(__amd64) || defined(__amd64__) || defined(__x86_64) || \ + defined(__x86_64__) +#if !defined(_WIN32) /* IS NOT Microsoft Windows */ + asm volatile("pause\n" ::: "memory"); +#else + __asm__ __volatile__("pause\n" ::: "memory"); +#endif +#elif defined(__PPC64__) + asm volatile("or 27, 27, 27" ::: "memory"); +#endif + } + +#endif /* defined( KOKKOS_ENABLE_ASM ) */ +} + +} // namespace Impl +} // namespace Kokkos + +#else +void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {} +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp b/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1c65fb91f2942aca57e66b09bd99ccaf5e450783 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp @@ -0,0 +1,137 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SPINWAIT_HPP
+#define KOKKOS_SPINWAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Atomic.hpp>
+
+#include <cstdint>
+
+#include <type_traits>
+
+namespace Kokkos {
+namespace Impl {
+
+enum class WaitMode : int {
+  ACTIVE  // Used for tight loops to keep threads active longest
+  ,
+  PASSIVE  // Used to quickly yield the thread to quiet down the system
+  ,
+  ROOT  // Never sleep or yield the root thread
+};
+
+void host_thread_yield(const uint32_t i, const WaitMode mode);
+
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value, void>::type
+root_spinwait_while_equal(T const volatile& flag, const T value) {
+  Kokkos::store_fence();
+  uint32_t i = 0;
+  while (value == flag) {
+    host_thread_yield(++i, WaitMode::ROOT);
+  }
+  Kokkos::load_fence();
+}
+
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value, void>::type
+root_spinwait_until_equal(T const volatile& flag, const T value) {
+  Kokkos::store_fence();
+  uint32_t i = 0;
+  while (value != flag) {
+    host_thread_yield(++i, WaitMode::ROOT);
+  }
+  
Kokkos::load_fence(); +} + +template <typename T> +typename std::enable_if<std::is_integral<T>::value, void>::type +spinwait_while_equal(T const volatile& flag, const T value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value == flag) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + +template <typename T> +typename std::enable_if<std::is_integral<T>::value, void>::type +yield_while_equal(T const volatile& flag, const T value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value == flag) { + host_thread_yield(++i, WaitMode::PASSIVE); + } + Kokkos::load_fence(); +} + +template <typename T> +typename std::enable_if<std::is_integral<T>::value, void>::type +spinwait_until_equal(T const volatile& flag, const T value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value != flag) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + +template <typename T> +typename std::enable_if<std::is_integral<T>::value, void>::type +yield_until_equal(T const volatile& flag, const T value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value != flag) { + host_thread_yield(++i, WaitMode::PASSIVE); + } + Kokkos::load_fence(); +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c0c1fdf6be7e2024aa92692b21b4d0996e370bca --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.cpp @@ -0,0 +1,247 @@ +#include "Kokkos_Macros.hpp" +#include "Kokkos_Stacktrace.hpp" + +#ifdef KOKKOS_IMPL_ENABLE_STACKTRACE +// backtrace() function for retrieving the stacktrace +#include <execinfo.h> +#endif +#ifdef KOKKOS_IMPL_ENABLE_CXXABI +#include <cxxabi.h> +#endif // KOKKOS_ENABLE_CXXABI + +#include <exception> +#include <iostream> +#include <tuple> +#include <vector> + +namespace 
Kokkos { +namespace Impl { +#ifndef KOKKOS_IMPL_ENABLE_STACKTRACE +int backtrace(void**, int) { return 0; } +char** backtrace_symbols(void* const*, int) { return nullptr; } +#endif + +std::string demangle(const std::string& name) { +#ifndef KOKKOS_IMPL_ENABLE_CXXABI + return name; +#else + size_t found_end = name.find_first_of("+)", 0, 2); + if (found_end == std::string::npos) { + found_end = name.size(); + } + size_t found_parenthesis = name.find_first_of("("); + size_t start = found_parenthesis + 1; + if (found_parenthesis == std::string::npos) start = 0; + + std::string s = name.substr(start, found_end - start); + + if (s.length() != 0) { + int status = 0; + char* output_buffer = nullptr; + size_t length = s.length(); + char* d = abi::__cxa_demangle(s.c_str(), output_buffer, &length, &status); + if (d != nullptr) { + s = d; + free(d); + } + } + + // Special cases for "main" and "start" on Mac + if (s.length() == 0) { + if (name == "main" || name == "start") { + s = name; + } + } + return s; +#endif // KOKKOS_ENABLE_CXXABI +} + +class Stacktrace { + public: + Stacktrace() = delete; + Stacktrace(const Stacktrace&) = delete; + Stacktrace& operator=(const Stacktrace&) = delete; + Stacktrace(Stacktrace&&) = delete; + Stacktrace& operator=(Stacktrace&&) = delete; + ~Stacktrace() = delete; + + // These are public only to avoid wasting an extra stacktrace line. + // See save_stacktrace below. 
+ static constexpr int capacity = 100; + static void* buffer[capacity]; + static int length; + + static std::vector<std::string> lines() { + char** symbols = backtrace_symbols(buffer, length); + if (symbols == nullptr) { + return {}; + } else { + std::vector<std::string> trace(length); + for (int i = 0; i < length; ++i) { + if (symbols[i] != nullptr) { + trace[i] = std::string(symbols[i]); + } + } + free(symbols); + return trace; + } + } +}; + +int Stacktrace::length = 0; +void* Stacktrace::buffer[Stacktrace::capacity]; + +void save_stacktrace() { + Stacktrace::length = backtrace(Stacktrace::buffer, Stacktrace::capacity); +} + +size_t find_first_non_whitespace(const std::string& s, const size_t start_pos) { + constexpr size_t num_ws_chars = 3; + const char ws_chars[] = "\n\t "; + return s.find_first_not_of(ws_chars, start_pos, num_ws_chars); +} + +size_t find_first_whitespace(const std::string& s, const size_t start_pos) { + constexpr size_t num_ws_chars = 3; + const char ws_chars[] = "\n\t "; + return s.find_first_of(ws_chars, start_pos, num_ws_chars); +} + +template <class Callback> +void for_each_token(const std::string& s, Callback c) { + size_t cur = find_first_non_whitespace(s, 0); + while (cur != std::string::npos) { + const size_t end = find_first_whitespace(s, cur); + const bool last = (end == std::string::npos); + const size_t count = last ? end : size_t(end - cur); + c(s.substr(cur, count), last); + cur = find_first_non_whitespace(s, end); + } +} + +// Search the whole backtrace, column by column, for "main". +// This tells us what column has the function names. +// While we're doing that, figure out the longest column, +// so we can compute spacing correctly. 
+ +struct main_column_info { + bool found_main; + size_t main_col; + std::vector<size_t> main_col_lens; +}; + +main_column_info find_main_column(const std::vector<std::string>& traceback) { + bool found_main = false; + size_t main_col = 0; + for (auto&& entry : traceback) { + size_t col_count = 0; + for_each_token(entry, [&](const std::string& s, bool) { + const size_t pos = s.find("main"); + if (pos != std::string::npos) { + found_main = true; + main_col = col_count; + } + ++col_count; + }); + if (found_main) { + break; + } + } + + // Make another pass to get the column lengths. + // Only demangle the column of functions. + std::vector<size_t> max_col_lengths; + for (auto&& entry : traceback) { + size_t col_count = 0; + for_each_token(entry, [&](const std::string& s, bool) { + const size_t cur_col_len = + (found_main && col_count == main_col) ? demangle(s).size() : s.size(); + ++col_count; + if (max_col_lengths.size() < col_count) { + max_col_lengths.push_back(cur_col_len); + } else { + const size_t old_max_len = max_col_lengths[col_count - 1]; + if (old_max_len < cur_col_len) { + max_col_lengths[col_count - 1] = cur_col_len; + } + } + }); + } + return main_column_info{found_main, main_col, max_col_lengths}; +} + +void demangle_and_print_traceback_entry( + std::ostream& out, const std::string& traceback_entry, + const bool found_main, const size_t main_col, + const std::vector<size_t>& max_col_lens) { + std::vector<std::string> tokens; + size_t cur_col = 0; + for_each_token(traceback_entry, [&](const std::string& s, bool last) { + const size_t old_width(out.width()); + out.width(max_col_lens[cur_col]); + try { + if (found_main && cur_col == main_col) { + out << demangle(s); + } else { + out << s; + } + if (!last) { + out << " "; + } + ++cur_col; + } catch (...) 
{ + out.width(old_width); + throw; + } + out.width(old_width); + }); +} + +void demangle_and_print_traceback(std::ostream& out, + const std::vector<std::string>& traceback) { + const auto result = find_main_column(traceback); + for (auto&& entry : traceback) { + demangle_and_print_traceback_entry(out, entry, result.found_main, + result.main_col, result.main_col_lens); + out << std::endl; + } +} + +void print_saved_stacktrace(std::ostream& out) { + auto lines = Stacktrace::lines(); + for (auto&& entry : lines) { + out << entry << std::endl; + } +} + +void print_demangled_saved_stacktrace(std::ostream& out) { + demangle_and_print_traceback(out, Stacktrace::lines()); +} + +std::function<void()> user_terminate_handler_post_ = nullptr; + +void kokkos_terminate_handler() { + using std::cerr; + using std::endl; + + cerr << "Kokkos observes that std::terminate has been called. " + "Here is the last saved stack trace. Note that this does not " + "necessarily show what called std::terminate." + << endl + << endl; + print_demangled_saved_stacktrace(std::cerr); + + if (user_terminate_handler_post_ != nullptr) { + user_terminate_handler_post_(); + } else { + std::abort(); + } +} + +void set_kokkos_terminate_handler(std::function<void()> user_post) { + user_terminate_handler_post_ = user_post; + std::set_terminate(kokkos_terminate_handler); +} + +} // namespace Impl +} // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_Stacktrace.hpp b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b5cf4ee1b647f1cdf86bccfe3e73b68ec7127d7b --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Stacktrace.hpp @@ -0,0 +1,45 @@ +#ifndef KOKKOS_STACKTRACE_HPP +#define KOKKOS_STACKTRACE_HPP + +#include <functional> +#include <ostream> +#include <string> + +namespace Kokkos { +namespace Impl { + +/// \brief Return the demangled version of the input symbol, or the +/// original input if demangling is not 
possible. +std::string demangle(const std::string& name); + +/// \brief Save the current stacktrace. +/// +/// You may only save one stacktrace at a time. If you call this +/// twice, the second call will overwrite the result of the first +/// call. +void save_stacktrace(); + +/// \brief Print the raw form of the currently saved stacktrace, if +/// any, to the given output stream. +void print_saved_stacktrace(std::ostream& out); + +/// \brief Print the currently saved, demangled stacktrace, if any, to +/// the given output stream. +/// +/// Demangling is best effort only. +void print_demangled_saved_stacktrace(std::ostream& out); + +/// \brief Set the std::terminate handler so that it prints the +/// currently saved stack trace, then calls user_post. +/// +/// This is useful if you want to call, say, MPI_Abort instead of +/// std::abort. The MPI Standard frowns upon calling MPI functions +/// without including their header file, and Kokkos does not depend on +/// MPI, so there's no way for Kokkos to depend on MPI_Abort in a +/// portable way. +void set_kokkos_terminate_handler(std::function<void()> user_post = nullptr); + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_STACKTRACE_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_Tags.hpp b/packages/kokkos/core/src/impl/Kokkos_Tags.hpp new file mode 100644 index 0000000000000000000000000000000000000000..eea4c938661afa00f4dad929312bf6cfa2b83776 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Tags.hpp @@ -0,0 +1,95 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TAGS_HPP
+#define KOKKOS_TAGS_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <Kokkos_Core_fwd.hpp>
+#include <type_traits>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** KOKKOS_IMPL_HAS_TYPE( Type )
+ *
+ * defines a meta-function that check if a type expose an internal alias which
+ * matches Type
+ *
+ * e.g.
+ *   KOKKOS_IMPL_HAS_TYPE( array_layout );
+ *   struct Foo { using array_layout = void; };
+ *   have_array_layout<Foo>::value == 1;
+ */
+#define KOKKOS_IMPL_HAS_TYPE(TYPE)                                            \
+  template <typename T>                                                       \
+  struct have_##TYPE {                                                        \
+   private:                                                                   \
+    template <typename U, typename = void>                                    \
+    struct X : std::false_type {};                                            \
+    template <typename U>                                                     \
+    struct X<U, typename std::conditional<true, void, typename U::TYPE>::type> \
+        : std::true_type {};                                                  \
+                                                                              \
+   public:                                                                    \
+    using type = typename X<T>::type;                                         \
+    enum : bool { value = type::value };                                      \
+  };
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template <typename T>
+using is_void = std::is_same<void, T>;
+
+}
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2d0f62a563712a1182849fd8dc43349f6996a42e
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
@@ -0,0 +1,347 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_TASKBASE_HPP +#define KOKKOS_IMPL_TASKBASE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <impl/Kokkos_LIFO.hpp> + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Base class for task management, access, and execution. + * + * Inheritance structure to allow static_cast from the task root type + * and a task's FunctorType. + * + * // Enable a functor to access the base class + * // and provide memory for result value. + * TaskBase< Space , ResultType , FunctorType > + * : TaskBase< void , void , void > + * , FunctorType + * { ... }; + * Followed by memory allocated for result value. 
+ * + * + * States of a task: + * + * Constructing State, NOT IN a linked list + * m_wait == 0 + * m_next == 0 + * + * Scheduling transition : Constructing -> Waiting + * before: + * m_wait == 0 + * m_next == this task's initial dependence, 0 if none + * after: + * m_wait == EndTag + * m_next == EndTag + * + * Waiting State, IN a linked list + * m_apply != 0 + * m_queue != 0 + * m_ref_count > 0 + * m_wait == head of linked list of tasks waiting on this task + * m_next == next of linked list of tasks + * + * transition : Waiting -> Executing + * before: + * m_next == EndTag + * after:: + * m_next == LockTag + * + * Executing State, NOT IN a linked list + * m_apply != 0 + * m_queue != 0 + * m_ref_count > 0 + * m_wait == head of linked list of tasks waiting on this task + * m_next == LockTag + * + * Respawn transition : Executing -> Executing-Respawn + * before: + * m_next == LockTag + * after: + * m_next == this task's updated dependence, 0 if none + * + * Executing-Respawn State, NOT IN a linked list + * m_apply != 0 + * m_queue != 0 + * m_ref_count > 0 + * m_wait == head of linked list of tasks waiting on this task + * m_next == this task's updated dependence, 0 if none + * + * transition : Executing -> Complete + * before: + * m_wait == head of linked list + * after: + * m_wait == LockTag + * + * Complete State, NOT IN a linked list + * m_wait == LockTag: cannot add dependence (<=> complete) + * m_next == LockTag: not a member of a wait queue + * + */ +class TaskBase { + public: + enum : int16_t { TaskTeam = 0, TaskSingle = 1, Aggregate = 2 }; + enum : uintptr_t { LockTag = ~uintptr_t(0), EndTag = ~uintptr_t(1) }; + + template <typename, typename> + friend class Kokkos::BasicTaskScheduler; + + using queue_type = TaskQueueBase; + + using function_type = void (*)(TaskBase*, void*); + using destroy_type = void (*)(TaskBase*); + + // sizeof(TaskBase) == 48 + + function_type m_apply = nullptr; ///< Apply function pointer + queue_type* m_queue = nullptr; ///< Pointer to 
the scheduler + TaskBase* m_next = nullptr; ///< next in linked list of ready tasks + TaskBase* m_wait = nullptr; ///< Queue of tasks waiting on this + int32_t m_ref_count = 0; + int32_t m_alloc_size = 0; + int32_t m_dep_count; ///< Aggregate's number of dependences + int16_t m_task_type; ///< Type of task + int16_t m_priority; ///< Priority of runnable task + + TaskBase(TaskBase&&) = delete; + TaskBase(const TaskBase&) = delete; + TaskBase& operator=(TaskBase&&) = delete; + TaskBase& operator=(const TaskBase&) = delete; + + KOKKOS_DEFAULTED_FUNCTION ~TaskBase() = default; + + KOKKOS_INLINE_FUNCTION constexpr TaskBase() + : m_apply(nullptr), + m_queue(nullptr), + m_next(nullptr), + m_wait(nullptr), + m_ref_count(0), + m_alloc_size(0), + m_dep_count(0), + m_task_type(0), + m_priority(0) {} + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + TaskBase* volatile* aggregate_dependences() volatile { + return reinterpret_cast<TaskBase* volatile*>(this + 1); + } + + KOKKOS_INLINE_FUNCTION + bool requested_respawn() { + // This should only be called when a task has finished executing and is + // in the transition to either the complete or executing-respawn state. + TaskBase* const lock = reinterpret_cast<TaskBase*>(LockTag); + return lock != m_next; + } + + KOKKOS_INLINE_FUNCTION + void add_dependence(TaskBase* dep) { + // Precondition: lock == m_next + + TaskBase* const lock = (TaskBase*)LockTag; + + // Assign dependence to m_next. It will be processed in the subsequent + // call to schedule. Error if the dependence is reset. + if (lock != Kokkos::atomic_exchange(&m_next, dep)) { + Kokkos::abort("TaskScheduler ERROR: resetting task dependence"); + } + + if (nullptr != dep) { + // The future may be destroyed upon returning from this call + // so increment reference count to track this assignment. 
+ Kokkos::atomic_increment(&(dep->m_ref_count)); + } + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + int32_t reference_count() const { + return *((int32_t volatile*)(&m_ref_count)); + } +}; + +//------------------------------------------------------------------------------ +// <editor-fold desc="Verify the size of TaskBase is as expected"> {{{2 + +// Workaround: some compilers implement int16_t as 4 bytes, so the size might +// not actually be 48 bytes. +// There's not a lot of reason to keep checking this here; the program will +// work fine if this isn't true. I think this check was originally here to +// emphasize the fact that adding to the size of TaskBase could have a +// significant performance penalty, since doing so could substantially decrease +// the number of full task types that fit into a cache line. We'll leave it +// here for now, though, since we're probably going to be ripping all of the +// old TaskBase stuff out eventually anyway. +constexpr size_t unpadded_task_base_size = 44 + 2 * sizeof(int16_t); +// don't forget padding: +constexpr size_t task_base_misalignment = + unpadded_task_base_size % alignof(void*); +constexpr size_t task_base_padding_size = + (alignof(void*) - task_base_misalignment) % alignof(void*); +constexpr size_t expected_task_base_size = + unpadded_task_base_size + task_base_padding_size; + +// Produce a more readable compiler error message than the plain static assert +template <size_t Size> +struct verify_task_base_size_is_48_note_actual_size_is_ {}; +template <> +struct verify_task_base_size_is_48_note_actual_size_is_< + expected_task_base_size> { + using type = int; +}; +static constexpr + typename verify_task_base_size_is_48_note_actual_size_is_<sizeof( + TaskBase)>::type verify = {}; + +static_assert(sizeof(TaskBase) == expected_task_base_size, + "Verifying expected sizeof(TaskBase)"); + +// </editor-fold> end Verify the size of TaskBase is as expected }}}2 
+//------------------------------------------------------------------------------ + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class Scheduler, typename ResultType, class FunctorType> +class Task : public TaskBase, public FunctorType { + public: + Task() = delete; + Task(Task&&) = delete; + Task(const Task&) = delete; + Task& operator=(Task&&) = delete; + Task& operator=(const Task&) = delete; + + using root_type = TaskBase; + using functor_type = FunctorType; + using result_type = ResultType; + + using specialization = TaskQueueSpecialization<Scheduler>; + using member_type = typename specialization::member_type; + + KOKKOS_INLINE_FUNCTION + void apply_functor(member_type* const member, void*) { + this->functor_type::operator()(*member); + } + + template <typename T> + KOKKOS_INLINE_FUNCTION void apply_functor(member_type* const member, + T* const result) { + this->functor_type::operator()(*member, *result); + } + + KOKKOS_FUNCTION static void destroy(root_type* root) { + TaskResult<result_type>::destroy(root); + } + + KOKKOS_FUNCTION static void apply(root_type* root, void* exec) { + Task* const task = static_cast<Task*>(root); + member_type* const member = reinterpret_cast<member_type*>(exec); + result_type* const result = TaskResult<result_type>::ptr(task); + + // Task may be serial or team. + // If team then must synchronize before querying if respawn was requested. + // If team then only one thread calls destructor. 
+ + const bool only_one_thread = +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) + 0 == threadIdx.x && 0 == threadIdx.y; +#else + 0 == member->team_rank(); +#endif + + task->apply_functor(member, result); + + member->team_barrier(); + + if (only_one_thread && !(task->requested_respawn())) { + // Did not respawn, destroy the functor to free memory. + task->functor_type::~functor_type(); + // Cannot destroy and deallocate the task until its dependences + // have been processed. + } + } + + // Constructor for runnable task + KOKKOS_INLINE_FUNCTION constexpr Task(FunctorType&& arg_functor) + : root_type(), functor_type(std::move(arg_functor)) {} + + KOKKOS_INLINE_FUNCTION + ~Task() = delete; +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKBASE_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp new file mode 100644 index 0000000000000000000000000000000000000000..42afa93cdcc4db4f4c0223d7b85f5edb8256ee31 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp @@ -0,0 +1,698 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_TASKNODE_HPP +#define KOKKOS_IMPL_TASKNODE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_PointerOwnership.hpp> + +#include <impl/Kokkos_VLAEmulation.hpp> +#include <impl/Kokkos_LIFO.hpp> +#include <impl/Kokkos_ChaseLev.hpp> +#include <impl/Kokkos_EBO.hpp> +#include <Kokkos_Concepts.hpp> + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +#ifdef KOKKOS_COMPILER_PGI +// Bizzarely, an extra jump instruction forces the PGI compiler to not have a +// bug related to (probably?) empty base optimization and/or aggregate +// construction. This must be defined out-of-line to generate a jump +// jump instruction +void _kokkos_pgi_compiler_bug_workaround(); +#endif + +enum TaskType : int16_t { + TaskTeam = 0, + TaskSingle = 1, + Aggregate = 2, + TaskSpecial = -1 +}; + +//============================================================================== + +/** Intrusive base class for things allocated with a Kokkos::MemoryPool + * + * @warning Memory pools assume that the address of this class is the same + * as the address of the most derived type that was allocated to + * have the given size. As a consequence, when interacting with + * multiple inheritance, this must always be the first base class + * of any derived class that uses it! + * @todo Consider inverting inheritance structure to avoid this problem? 
+ * + * @tparam CountType type of integer used to store the allocation size + */ +template <class CountType = int32_t> +class alignas(void*) PoolAllocatedObjectBase { + public: + using pool_allocation_size_type = CountType; + + private: + pool_allocation_size_type m_alloc_size; + + public: + KOKKOS_INLINE_FUNCTION + constexpr explicit PoolAllocatedObjectBase( + pool_allocation_size_type allocation_size) + : m_alloc_size(allocation_size) {} + + KOKKOS_INLINE_FUNCTION + CountType get_allocation_size() const noexcept { return m_alloc_size; } +}; + +//============================================================================== + +// TODO @tasking @cleanup DSH move this? +template <class CountType = int32_t> +class ReferenceCountedBase { + public: + using reference_count_size_type = CountType; + + private: + reference_count_size_type m_ref_count = 0; + + public: + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_COMPILER_PGI + constexpr +#endif + explicit ReferenceCountedBase( + reference_count_size_type initial_reference_count) + : m_ref_count(initial_reference_count) { + // This can't be here because it breaks constexpr + // KOKKOS_EXPECTS(initial_reference_count > 0); +#ifdef KOKKOS_COMPILER_PGI + Impl::_kokkos_pgi_compiler_bug_workaround(); +#endif + } + + /** Decrement the reference count, + * and return true iff this decrement caused + * the reference count to become zero + */ + KOKKOS_INLINE_FUNCTION + bool decrement_and_check_reference_count() { + // TODO @tasking @memory_order DSH memory order + auto old_count = Kokkos::atomic_fetch_add(&m_ref_count, -1); + + KOKKOS_ASSERT(old_count > 0 && "reference count greater less than zero!"); + + return (old_count == 1); + } + + KOKKOS_INLINE_FUNCTION + void increment_reference_count() { Kokkos::atomic_increment(&m_ref_count); } +}; + +template <class TaskQueueTraits, class SchedulingInfo> +class AggregateTask; + +template <class TaskQueueTraits> +class RunnableTaskBase; + 
+//============================================================================== + +template <class TaskQueueTraits> +class TaskNode + : public PoolAllocatedObjectBase<int32_t>, // size 4, must be first! + public ReferenceCountedBase<int32_t>, // size 4 + public TaskQueueTraits::template intrusive_task_base_type< + TaskNode<TaskQueueTraits>> // size 8+ +{ + public: + using priority_type = int16_t; + + private: + using task_base_type = TaskNode<TaskQueueTraits>; + using pool_allocated_base_type = PoolAllocatedObjectBase<int32_t>; + using reference_counted_base_type = ReferenceCountedBase<int32_t>; + using task_queue_traits = TaskQueueTraits; + using waiting_queue_type = + typename task_queue_traits::template waiting_queue_type<TaskNode>; + + waiting_queue_type m_wait_queue; // size 8+ + + // TODO @tasking @cleanup DSH eliminate this, or make its purpose a bit more + // clear. It's only used in BasicFuture, and only for deallocation purposes + TaskQueueBase* m_ready_queue_base; + + TaskType m_task_type; // size 2 + priority_type m_priority; // size 2 + bool m_is_respawning = false; + + public: + KOKKOS_INLINE_FUNCTION + constexpr TaskNode(TaskType task_type, TaskPriority priority, + TaskQueueBase* queue_base, + reference_count_size_type initial_reference_count, + pool_allocation_size_type allocation_size) + : pool_allocated_base_type( + /* allocation_size = */ allocation_size), + reference_counted_base_type( + /* initial_reference_count = */ initial_reference_count), + m_wait_queue(), + m_ready_queue_base(queue_base), + m_task_type(task_type), + m_priority(static_cast<priority_type>(priority)), + m_is_respawning(false) {} + + TaskNode() = delete; + TaskNode(TaskNode const&) = delete; + TaskNode(TaskNode&&) = delete; + TaskNode& operator=(TaskNode const&) = delete; + TaskNode& operator=(TaskNode&&) = delete; + + KOKKOS_INLINE_FUNCTION + bool is_aggregate() const noexcept { + return m_task_type == TaskType::Aggregate; + } + + KOKKOS_INLINE_FUNCTION + bool 
is_runnable() const noexcept { + return m_task_type != TaskType::Aggregate; + } + + KOKKOS_INLINE_FUNCTION + bool is_runnable() const volatile noexcept { + return m_task_type != TaskType::Aggregate; + } + + KOKKOS_INLINE_FUNCTION + bool is_single_runnable() const noexcept { + return m_task_type == TaskType::TaskSingle; + } + + KOKKOS_INLINE_FUNCTION + bool is_team_runnable() const noexcept { + return m_task_type == TaskType::TaskTeam; + } + + KOKKOS_INLINE_FUNCTION + TaskType get_task_type() const noexcept { return m_task_type; } + + KOKKOS_INLINE_FUNCTION + RunnableTaskBase<TaskQueueTraits>& as_runnable_task() & { + KOKKOS_EXPECTS(this->is_runnable()); + return static_cast<RunnableTaskBase<TaskQueueTraits>&>(*this); + } + + KOKKOS_INLINE_FUNCTION + RunnableTaskBase<TaskQueueTraits> const& as_runnable_task() const& { + KOKKOS_EXPECTS(this->is_runnable()); + return static_cast<RunnableTaskBase<TaskQueueTraits> const&>(*this); + } + + KOKKOS_INLINE_FUNCTION + RunnableTaskBase<TaskQueueTraits> volatile& as_runnable_task() volatile& { + KOKKOS_EXPECTS(this->is_runnable()); + return static_cast<RunnableTaskBase<TaskQueueTraits> volatile&>(*this); + } + + KOKKOS_INLINE_FUNCTION + RunnableTaskBase<TaskQueueTraits> const volatile& as_runnable_task() const + volatile& { + KOKKOS_EXPECTS(this->is_runnable()); + return static_cast<RunnableTaskBase<TaskQueueTraits> const volatile&>( + *this); + } + + KOKKOS_INLINE_FUNCTION + RunnableTaskBase<TaskQueueTraits>&& as_runnable_task() && { + KOKKOS_EXPECTS(this->is_runnable()); + return static_cast<RunnableTaskBase<TaskQueueTraits>&&>(*this); + } + + template <class SchedulingInfo> + KOKKOS_INLINE_FUNCTION AggregateTask<TaskQueueTraits, SchedulingInfo>& + as_aggregate() & { + KOKKOS_EXPECTS(this->is_aggregate()); + return static_cast<AggregateTask<TaskQueueTraits, SchedulingInfo>&>(*this); + } + + template <class SchedulingInfo> + KOKKOS_INLINE_FUNCTION AggregateTask<TaskQueueTraits, SchedulingInfo> const& + as_aggregate() const& { 
+ KOKKOS_EXPECTS(this->is_aggregate()); + return static_cast<AggregateTask<TaskQueueTraits, SchedulingInfo> const&>( + *this); + } + + template <class SchedulingInfo> + KOKKOS_INLINE_FUNCTION AggregateTask<TaskQueueTraits, SchedulingInfo>&& + as_aggregate() && { + KOKKOS_EXPECTS(this->is_aggregate()); + return static_cast<AggregateTask<TaskQueueTraits, SchedulingInfo>&&>(*this); + } + + KOKKOS_INLINE_FUNCTION + bool try_add_waiting(task_base_type& depends_on_this) { + return m_wait_queue.try_push(depends_on_this); + } + + template <class Function> + KOKKOS_INLINE_FUNCTION void consume_wait_queue(Function&& f) { + KOKKOS_EXPECTS(!m_wait_queue.is_consumed()); + m_wait_queue.consume(std::forward<Function>(f)); + } + + KOKKOS_INLINE_FUNCTION + bool wait_queue_is_consumed() const noexcept { + // TODO @tasking @memory_order DSH memory order + return m_wait_queue.is_consumed(); + } + + KOKKOS_INLINE_FUNCTION + TaskQueueBase* ready_queue_base_ptr() const noexcept { + return m_ready_queue_base; + } + + KOKKOS_INLINE_FUNCTION + void set_priority(TaskPriority priority) noexcept { + KOKKOS_EXPECTS(!this->is_enqueued()); + m_priority = (priority_type)priority; + } + + KOKKOS_INLINE_FUNCTION + void set_priority(TaskPriority priority) volatile noexcept { + KOKKOS_EXPECTS(!this->is_enqueued()); + m_priority = (priority_type)priority; + } + + KOKKOS_INLINE_FUNCTION + TaskPriority get_priority() const noexcept { + return (TaskPriority)m_priority; + } + + KOKKOS_INLINE_FUNCTION + bool get_respawn_flag() const { return m_is_respawning; } + + KOKKOS_INLINE_FUNCTION + void set_respawn_flag(bool value = true) { m_is_respawning = value; } + + KOKKOS_INLINE_FUNCTION + void set_respawn_flag(bool value = true) volatile { m_is_respawning = value; } +}; + +//============================================================================== + +template <class BaseClass, class SchedulingInfo> +class SchedulingInfoStorage; + 
+//============================================================================== + +template <class BaseType, class SchedulingInfo> +class SchedulingInfoStorage + : public BaseType, // must be first base class for allocation reasons!!! + private NoUniqueAddressMemberEmulation<SchedulingInfo> { + private: + using base_t = BaseType; + using task_scheduling_info_type = SchedulingInfo; + + public: + // Can't just do using base_t::base_t because of stupid stuff with clang cuda + template <class... Args> + // requires std::is_constructible_v<base_t, Args&&...> + KOKKOS_INLINE_FUNCTION constexpr explicit SchedulingInfoStorage( + Args&&... args) + : base_t(std::forward<Args>(args)...) {} + + KOKKOS_INLINE_FUNCTION + task_scheduling_info_type& scheduling_info() & { + return this->no_unique_address_data_member(); + } + + KOKKOS_INLINE_FUNCTION + task_scheduling_info_type const& scheduling_info() const& { + return this->no_unique_address_data_member(); + } + + KOKKOS_INLINE_FUNCTION + task_scheduling_info_type&& scheduling_info() && { + return std::move(*this).no_unique_address_data_member(); + } +}; + +//============================================================================== + +template <class TaskQueueTraits, class SchedulingInfo> +class alignas(16) AggregateTask final + : public SchedulingInfoStorage<TaskNode<TaskQueueTraits>, + SchedulingInfo>, // must be first base class + // for allocation + // reasons!!! + public ObjectWithVLAEmulation< + AggregateTask<TaskQueueTraits, SchedulingInfo>, + OwningRawPtr<TaskNode<TaskQueueTraits>>> { + private: + using base_t = + SchedulingInfoStorage<TaskNode<TaskQueueTraits>, SchedulingInfo>; + using vla_base_t = + ObjectWithVLAEmulation<AggregateTask<TaskQueueTraits, SchedulingInfo>, + OwningRawPtr<TaskNode<TaskQueueTraits>>>; + + using task_base_type = TaskNode<TaskQueueTraits>; + + public: + using aggregate_task_type = AggregateTask; // concept marker + + template <class... 
Args> + // requires std::is_constructible_v<base_t, Args&&...> + KOKKOS_INLINE_FUNCTION constexpr explicit AggregateTask( + int32_t aggregate_predecessor_count, Args&&... args) + : base_t(TaskType::Aggregate, + TaskPriority::Regular, // all aggregates are regular priority + std::forward<Args>(args)...), + vla_base_t(aggregate_predecessor_count) {} + + KOKKOS_INLINE_FUNCTION + int32_t dependence_count() const { return this->n_vla_entries(); } +}; + +// KOKKOS_IMPL_IS_CONCEPT(aggregate_task); + +//============================================================================== + +template <class TaskQueueTraits> +class RunnableTaskBase + : public TaskNode<TaskQueueTraits> // must be first base class for + // allocation reasons!!! +{ + private: + using base_t = TaskNode<TaskQueueTraits>; + + public: + using task_base_type = TaskNode<TaskQueueTraits>; + using function_type = void (*)(task_base_type*, void*); + using destroy_type = void (*)(task_base_type*); + using runnable_task_type = RunnableTaskBase; + + private: + function_type m_apply; + task_base_type* m_predecessor = nullptr; + + public: + template <class... Args> + // requires std::is_constructible_v<base_t, Args&&...> + KOKKOS_INLINE_FUNCTION constexpr explicit RunnableTaskBase( + function_type apply_function_ptr, Args&&... 
args) + : base_t(std::forward<Args>(args)...), m_apply(apply_function_ptr) {} + + KOKKOS_INLINE_FUNCTION + bool has_predecessor() const { return m_predecessor != nullptr; } + + KOKKOS_INLINE_FUNCTION + void clear_predecessor() { m_predecessor = nullptr; } + + KOKKOS_INLINE_FUNCTION + void clear_predecessor() volatile { m_predecessor = nullptr; } + + template <class SchedulingInfo> + KOKKOS_INLINE_FUNCTION SchedulingInfo& scheduling_info_as() { + using info_storage_type = + SchedulingInfoStorage<RunnableTaskBase, SchedulingInfo>; + + return static_cast<info_storage_type*>(this)->scheduling_info(); + } + + template <class SchedulingInfo> + KOKKOS_INLINE_FUNCTION SchedulingInfo const& scheduling_info_as() const { + using info_storage_type = + SchedulingInfoStorage<RunnableTaskBase, SchedulingInfo>; + + return static_cast<info_storage_type const*>(this)->scheduling_info(); + } + + KOKKOS_INLINE_FUNCTION + task_base_type& get_predecessor() const { + KOKKOS_EXPECTS(m_predecessor != nullptr); + return *m_predecessor; + } + + KOKKOS_INLINE_FUNCTION + void set_predecessor(task_base_type& predecessor) { + KOKKOS_EXPECTS(m_predecessor == nullptr); + // Increment the reference count so that predecessor doesn't go away + // before this task is enqueued. 
+ // (should be memory order acquire) + predecessor.increment_reference_count(); + m_predecessor = &predecessor; + } + + KOKKOS_INLINE_FUNCTION + void acquire_predecessor_from(runnable_task_type& other) { + KOKKOS_EXPECTS(m_predecessor == nullptr || + other.m_predecessor == m_predecessor); + // since we're transferring, no need to modify the reference count + m_predecessor = other.m_predecessor; + other.m_predecessor = nullptr; + } + + KOKKOS_INLINE_FUNCTION + void acquire_predecessor_from(runnable_task_type& other) volatile { + KOKKOS_EXPECTS(m_predecessor == nullptr || + other.m_predecessor == m_predecessor); + // since we're transferring, no need to modify the reference count + m_predecessor = other.m_predecessor; + other.m_predecessor = nullptr; + } + + template <class TeamMember> + KOKKOS_INLINE_FUNCTION void run(TeamMember& member) { + (*m_apply)(this, &member); + } +}; + +// KOKKOS_IMPL_IS_CONCEPT(runnable_task); + +//============================================================================== + +template <class ResultType, class Base> +class TaskResultStorage : public Base { + private: + using base_t = Base; + + alignas(Base) ResultType m_value = ResultType{}; + + public: + // using base_t::base_t; + // Can't just do using base_t::base_t because of stupid stuff with clang cuda + template <class... Args> + // requires std::is_constructible_v<base_t, Args&&...> + KOKKOS_INLINE_FUNCTION constexpr explicit TaskResultStorage(Args&&... args) + : base_t(std::forward<Args>(args)...) 
{} + + KOKKOS_INLINE_FUNCTION + ResultType* value_pointer() { + // Over-alignment makes this a non-standard-layout class, + // so alignas() doesn't work + // static_assert( + // offsetof(TaskResultStorage, m_value) == sizeof(Base), + // "TaskResultStorage must be POD for layout purposes" + //); + return &m_value; + } + + KOKKOS_INLINE_FUNCTION + ResultType& value_reference() { return m_value; } +}; + +// TODO @tasking @optimization DSH optimization for empty types (in addition to +// void) +template <class Base> +class TaskResultStorage<void, Base> : public Base { + private: + using base_t = Base; + + public: + // using base_t::base_t; + // Can't just do using base_t::base_t because of stupid stuff with clang cuda + template <class... Args> + // requires std::is_constructible_v<base_t, Args&&...> + KOKKOS_INLINE_FUNCTION constexpr explicit TaskResultStorage(Args&&... args) + : base_t(std::forward<Args>(args)...) {} + + KOKKOS_INLINE_FUNCTION + void* value_pointer() noexcept { return nullptr; } + + KOKKOS_INLINE_FUNCTION + void value_reference() noexcept {} +}; + +//============================================================================== + +template <class TaskQueueTraits, class Scheduler, class ResultType, + class FunctorType> +class alignas(16) RunnableTask + : // using nesting of base classes to control layout; multiple empty base + // classes may not be ABI compatible with CUDA on Windows + public TaskResultStorage< + ResultType, + SchedulingInfoStorage<RunnableTaskBase<TaskQueueTraits>, + typename Scheduler::task_queue_type:: + task_scheduling_info_type>>, // must be + // first base + // class + public FunctorType { + private: + using base_t = TaskResultStorage< + ResultType, + SchedulingInfoStorage< + RunnableTaskBase<TaskQueueTraits>, + typename Scheduler::task_queue_type::task_scheduling_info_type>>; + + using runnable_task_base_type = RunnableTaskBase<TaskQueueTraits>; + using scheduler_type = Scheduler; + using scheduling_info_type = + typename 
scheduler_type::task_scheduling_info_type; + using scheduling_info_storage_base = base_t; + + using task_base_type = TaskNode<TaskQueueTraits>; + using specialization = TaskQueueSpecialization<scheduler_type>; + using member_type = typename specialization::member_type; + using result_type = ResultType; + using functor_type = FunctorType; + + public: + template <class... Args> + // requires std::is_constructible_v<base_t, Args&&...> + KOKKOS_INLINE_FUNCTION constexpr explicit RunnableTask(FunctorType&& functor, + Args&&... args) + : base_t(std::forward<Args>(args)...), functor_type(std::move(functor)) {} + + KOKKOS_INLINE_FUNCTION + ~RunnableTask() = delete; + + KOKKOS_INLINE_FUNCTION + void update_scheduling_info(member_type& /*member*/) { + // TODO @tasking @generalization DSH call a queue-specific hook here; for + // now, this info is already updated elsewhere this->scheduling_info() = + // member.scheduler().scheduling_info(); + } + + KOKKOS_INLINE_FUNCTION + void apply_functor(member_type* member, void*) { + update_scheduling_info(*member); + this->functor_type::operator()(*member); + } + + template <typename T> + KOKKOS_INLINE_FUNCTION void apply_functor(member_type* member, T* val) { + update_scheduling_info(*member); + // this->functor_type::operator()(*member, *val); + this->functor_type::operator()(*member, *val); + } + + KOKKOS_FUNCTION static void destroy(task_base_type* /*root*/) { + // TaskResult<result_type>::destroy(root); + } + + KOKKOS_FUNCTION static void apply(task_base_type* self, + void* member_as_void) { + using task_type = Impl::RunnableTask<TaskQueueTraits, Scheduler, ResultType, + FunctorType>*; + auto* const task = static_cast<task_type>(self); + auto* const member = reinterpret_cast<member_type*>(member_as_void); + + // Now that we're over-aligning the result storage, this isn't a problem any + // more + // static_assert(std::is_standard_layout<task_type>::value, + // "Tasks must be standard layout" + //); + // 
static_assert(std::is_pod<task_type>::value, + // "Tasks must be PODs" + //); + + // Task may be serial or team. + // If team then must synchronize before querying if respawn was requested. + // If team then only one thread calls destructor. + + const bool only_one_thread = +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) + 0 == threadIdx.x && 0 == threadIdx.y; +#else + 0 == member->team_rank(); +#endif + + // Ensure that the respawn flag is set to zero + self->set_respawn_flag(false); + + // task->apply_functor(member, TaskResult<result_type>::ptr(task)); + task->apply_functor(member, task->value_pointer()); + + member->team_barrier(); + + if (only_one_thread && !(task->get_respawn_flag())) { + // Did not respawn, destroy the functor to free memory. + task->functor_type::~functor_type(); + // Cannot destroy and deallocate the task until its dependences + // have been processed. + } + } +}; + +} /* namespace Impl */ + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKNODE_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskPolicyData.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskPolicyData.hpp new file mode 100644 index 0000000000000000000000000000000000000000..09113628a76f8c9282ae8d30e0f29e7c407ed962 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskPolicyData.hpp @@ -0,0 +1,172 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_TASKPOLICYDATA_HPP +#define KOKKOS_IMPL_TASKPOLICYDATA_HPP + +//---------------------------------------------------------------------------- + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_TaskScheduler_fwd.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template <int TaskEnum, typename DepFutureType> +struct TaskPolicyWithPredecessor { + private: + DepFutureType m_predecessor; + Kokkos::TaskPriority m_priority; + + public: + KOKKOS_INLINE_FUNCTION + TaskPolicyWithPredecessor(DepFutureType arg_predecessor, + Kokkos::TaskPriority arg_priority) + : m_predecessor(std::move(arg_predecessor)), m_priority(arg_priority) {} + + TaskPolicyWithPredecessor() = delete; + + KOKKOS_DEFAULTED_FUNCTION + TaskPolicyWithPredecessor(TaskPolicyWithPredecessor const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + TaskPolicyWithPredecessor(TaskPolicyWithPredecessor&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + TaskPolicyWithPredecessor& operator=(TaskPolicyWithPredecessor const&) = + default; + + KOKKOS_DEFAULTED_FUNCTION + TaskPolicyWithPredecessor& operator=(TaskPolicyWithPredecessor&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + ~TaskPolicyWithPredecessor() = default; + + KOKKOS_INLINE_FUNCTION + DepFutureType&& predecessor() && { return std::move(m_predecessor); } + + KOKKOS_INLINE_FUNCTION + constexpr TaskPriority priority() const { return m_priority; } + + KOKKOS_INLINE_FUNCTION + static constexpr int task_type() noexcept { return TaskEnum; } +}; + +// TODO @tasking @cleanup DSH clean this up. 
Using nullptr_t here is too clever +template <int TaskEnum, typename Scheduler, + typename PredecessorFuture = std::nullptr_t> +struct TaskPolicyWithScheduler { + public: + using predecessor_future_type = PredecessorFuture; + + private: + Scheduler m_scheduler; + Kokkos::TaskPriority m_priority; + predecessor_future_type m_predecessor; + + public: + KOKKOS_INLINE_FUNCTION + TaskPolicyWithScheduler(Scheduler arg_scheduler, + Kokkos::TaskPriority arg_priority) + : m_scheduler(std::move(arg_scheduler)), m_priority(arg_priority) {} + + KOKKOS_INLINE_FUNCTION + TaskPolicyWithScheduler(Scheduler arg_scheduler, + predecessor_future_type arg_predecessor, + Kokkos::TaskPriority arg_priority) + : m_scheduler(std::move(arg_scheduler)), + m_priority(arg_priority), + m_predecessor(std::move(arg_predecessor)) {} + + TaskPolicyWithScheduler() = delete; + + KOKKOS_DEFAULTED_FUNCTION + TaskPolicyWithScheduler(TaskPolicyWithScheduler const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + TaskPolicyWithScheduler(TaskPolicyWithScheduler&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + TaskPolicyWithScheduler& operator=(TaskPolicyWithScheduler const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + TaskPolicyWithScheduler& operator=(TaskPolicyWithScheduler&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + ~TaskPolicyWithScheduler() = default; + + KOKKOS_INLINE_FUNCTION + Scheduler& scheduler() & { return m_scheduler; } + + KOKKOS_INLINE_FUNCTION + constexpr TaskPriority priority() const { return m_priority; } + + KOKKOS_INLINE_FUNCTION + predecessor_future_type& predecessor() & { return m_predecessor; } + + KOKKOS_INLINE_FUNCTION + static constexpr bool has_predecessor() noexcept { + return !std::is_same<PredecessorFuture, std::nullptr_t>::value; + } + + KOKKOS_INLINE_FUNCTION + static constexpr int task_type() noexcept { return TaskEnum; } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKPOLICYDATA_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c0d2eca9c106305e1bfdeb8efd634f657df90c8b --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp @@ -0,0 +1,264 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_TASKQUEUE_HPP +#define KOKKOS_IMPL_TASKQUEUE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_MemoryPool.hpp> + +#include <impl/Kokkos_TaskBase.hpp> +#include <impl/Kokkos_TaskResult.hpp> + +#include <impl/Kokkos_Memory_Fence.hpp> +#include <impl/Kokkos_Atomic_Increment.hpp> +#include <impl/Kokkos_OptionalRef.hpp> +#include <impl/Kokkos_LIFO.hpp> + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Manage task allocation, deallocation, and scheduling. + * + * Task execution is deferred to the TaskQueueSpecialization. + * All other aspects of task management have shared implementation. 
+ */ +template <typename ExecSpace, typename MemorySpace> +class TaskQueue : public TaskQueueBase { + protected: + template <class> + friend struct TaskQueueSpecialization; + template <class, class> + friend class TaskQueueSpecializationConstrained; + template <class, class> + friend class Kokkos::BasicTaskScheduler; + + using execution_space = ExecSpace; + using memory_space = MemorySpace; + using device_type = Kokkos::Device<execution_space, memory_space>; + using memory_pool = Kokkos::MemoryPool<device_type>; + using task_root_type = Kokkos::Impl::TaskBase; + using team_queue_type = TaskQueue; + + struct Destroy { + TaskQueue* m_queue; + void destroy_shared_allocation(); + }; + + //---------------------------------------- + + enum : int { NumQueue = 3 }; + + // Queue is organized as [ priority ][ type ] + + memory_pool m_memory; + task_root_type* volatile m_ready[NumQueue][2]; + // long m_accum_alloc ; // Accumulated number of + // allocations + int m_count_alloc = 0; // Current number of allocations + int m_max_alloc; // Maximum number of allocations + int m_ready_count; // Number of ready or executing + + //---------------------------------------- + + ~TaskQueue(); + TaskQueue() = delete; + TaskQueue(TaskQueue&&) = delete; + TaskQueue(TaskQueue const&) = delete; + TaskQueue& operator=(TaskQueue&&) = delete; + TaskQueue& operator=(TaskQueue const&) = delete; + + TaskQueue(const memory_pool& arg_memory_pool); + + // Schedule a task + // Precondition: + // task is not executing + // task->m_next is the dependence or zero + // Postcondition: + // task->m_next is linked list membership + KOKKOS_FUNCTION void schedule_runnable(task_root_type*); + KOKKOS_FUNCTION void schedule_aggregate(task_root_type*); + + // Reschedule a task + // Precondition: + // task is in Executing state + // task->m_next == LockTag + // Postcondition: + // task is in Executing-Respawn state + // task->m_next == 0 (no dependence) + KOKKOS_FUNCTION + void reschedule(task_root_type*); + + // 
Complete a task + // Precondition: + // task is not executing + // task->m_next == LockTag => task is complete + // task->m_next != LockTag => task is respawn + // Postcondition: + // task->m_wait == LockTag => task is complete + // task->m_wait != LockTag => task is waiting + KOKKOS_FUNCTION + void complete(task_root_type*); + + KOKKOS_FUNCTION + static bool push_task(task_root_type* volatile* const, task_root_type* const); + + KOKKOS_FUNCTION + static task_root_type* pop_ready_task(task_root_type* volatile* const); + + KOKKOS_FUNCTION static void decrement(task_root_type* task); + + public: + KOKKOS_INLINE_FUNCTION + int allocation_count() const noexcept { return m_count_alloc; } + + KOKKOS_INLINE_FUNCTION + void initialize_team_queues(int /*pool_size*/) const noexcept {} + + KOKKOS_INLINE_FUNCTION + task_root_type* attempt_to_steal_task() const noexcept { return nullptr; } + + KOKKOS_INLINE_FUNCTION + team_queue_type& get_team_queue(int /*team_rank*/) { return *this; } + + // void execute() { specialization::execute( this ); } + + template <typename FunctorType> + void proc_set_apply(typename task_root_type::function_type* ptr) { + using specialization = + TaskQueueSpecialization<BasicTaskScheduler<ExecSpace, TaskQueue>>; + specialization::template proc_set_apply<FunctorType>(ptr); + } + + // Assign task pointer with reference counting of assigned tasks + KOKKOS_FUNCTION static void assign(task_root_type** const lhs, + task_root_type* const rhs) { +#if 0 + { + printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n" + , uintptr_t( lhs ? *lhs : 0 ) + , uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 ) + , int( lhs && *lhs ? (*lhs)->m_task_type : 0 ) + , int( lhs && *lhs ? (*lhs)->m_ref_count : 0 ) + , uintptr_t(rhs) + , uintptr_t( rhs ? rhs->m_next : 0 ) + , int( rhs ? rhs->m_task_type : 0 ) + , int( rhs ? 
rhs->m_ref_count : 0 ) + ); + fflush( stdout ); + } +#endif + + if (*lhs) decrement(*lhs); + if (rhs) { + Kokkos::atomic_increment(&(rhs->m_ref_count)); + } + + // Force write of *lhs + + *static_cast<task_root_type* volatile*>(lhs) = rhs; + + Kokkos::memory_fence(); + } + + KOKKOS_FUNCTION + size_t allocate_block_size(size_t n); ///< Actual block size allocated + + KOKKOS_FUNCTION + void* allocate(size_t n); ///< Allocate from the memory pool + + KOKKOS_FUNCTION + void deallocate(void* p, size_t n); ///< Deallocate to the memory pool + + //---------------------------------------- + /**\brief Allocation size for a spawned task */ + + template <typename FunctorType> + KOKKOS_FUNCTION size_t spawn_allocation_size() const { + using value_type = typename FunctorType::value_type; + + using task_type = Impl::Task<execution_space, value_type, FunctorType>; + + enum : size_t { align = (1 << 4), align_mask = align - 1 }; + enum : size_t { task_size = sizeof(task_type) }; + enum : size_t { result_size = Impl::TaskResult<value_type>::size }; + enum : size_t { + alloc_size = ((task_size + align_mask) & ~align_mask) + + ((result_size + align_mask) & ~align_mask) + }; + + return m_memory.allocate_block_size(task_size); + } + + /**\brief Allocation size for a when_all aggregate */ + + KOKKOS_FUNCTION + size_t when_all_allocation_size(int narg) const { + return m_memory.allocate_block_size(sizeof(task_root_type) + + narg * sizeof(task_root_type*)); + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKQUEUE_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..cae06d4ea5ca17b5924a7bbf8c415d6f0a3ab070 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp @@ -0,0 +1,506 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_TASKQUEUECOMMON_HPP +#define KOKKOS_IMPL_TASKQUEUECOMMON_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_MemoryPool.hpp> + +#include <impl/Kokkos_TaskNode.hpp> +#include <impl/Kokkos_TaskResult.hpp> + +#include <impl/Kokkos_TaskQueueMemoryManager.hpp> +#include <impl/Kokkos_Memory_Fence.hpp> +#include <impl/Kokkos_Atomic_Increment.hpp> +#include <impl/Kokkos_OptionalRef.hpp> +#include <impl/Kokkos_LIFO.hpp> + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/// @brief CRTP Base class implementing the ready count parts common to most +/// task queues +template <class Derived> +class TaskQueueCommonMixin { + private: + int32_t m_ready_count = 0; + + // CRTP boilerplate + KOKKOS_INLINE_FUNCTION + Derived& _self() { return *static_cast<Derived*>(this); } + + public: + //---------------------------------------------------------------------------- + // <editor-fold desc="Constructors, destructor, and assignment"> {{{2 + + TaskQueueCommonMixin() : m_ready_count(0) { + // TODO @tasking @memory_order DSH figure out if I need this store to be + // atomic + } + + ~TaskQueueCommonMixin() { + KOKKOS_EXPECTS((Kokkos::memory_fence(), m_ready_count < 1)); + KOKKOS_EXPECTS(m_ready_count == 0); + } + + // </editor-fold> end Constructors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="Task and queue 
completion"> {{{2 + + private: + // This would be more readable with a lambda, but that comes with + // all the baggage associated with a lambda (compilation times, bugs with + // nvcc, etc.), so we'll use a simple little helper functor here. + template <class TaskQueueTraits, class TeamSchedulerInfo> + struct _schedule_waiting_tasks_operation { + TaskNode<TaskQueueTraits> const& m_predecessor; + Derived& m_queue; + TeamSchedulerInfo const& m_info; + KOKKOS_INLINE_FUNCTION + void operator()(TaskNode<TaskQueueTraits>&& task) const noexcept + // requires Same<TaskType, Derived::task_base_type> + { + using task_scheduling_info_type = + typename Derived::task_scheduling_info_type; + if (task.is_runnable()) // KOKKOS_LIKELY + { + // TODO @tasking @optimiazation DSH check this outside of the loop ? + if (m_predecessor.is_runnable()) { + m_queue.update_scheduling_info_from_completed_predecessor( + /* ready_task = */ task.as_runnable_task(), + /* predecessor = */ m_predecessor.as_runnable_task()); + } else { + KOKKOS_ASSERT(m_predecessor.is_aggregate()); + m_queue.update_scheduling_info_from_completed_predecessor( + /* ready_task = */ task.as_runnable_task(), + /* predecessor = */ m_predecessor + .template as_aggregate<task_scheduling_info_type>()); + } + m_queue.schedule_runnable(std::move(task).as_runnable_task(), m_info); + } else { + // The scheduling info update happens inside of schedule_aggregate + m_queue.schedule_aggregate( + std::move(task).template as_aggregate<task_scheduling_info_type>(), + m_info); + } + } + }; + + protected: + template <class TaskQueueTraits, class TeamSchedulerInfo> + KOKKOS_FUNCTION void _complete_finished_task(TaskNode<TaskQueueTraits>&& task, + TeamSchedulerInfo const& info) { + task.consume_wait_queue( + _schedule_waiting_tasks_operation<TaskQueueTraits, TeamSchedulerInfo>{ + task, _self(), info}); + bool should_delete = task.decrement_and_check_reference_count(); + if (should_delete) { + _self().deallocate(std::move(task)); + } + } + + 
KOKKOS_INLINE_FUNCTION + void _increment_ready_count() { + // TODO @tasking @memory_order DSH memory order + Kokkos::atomic_increment(&this->m_ready_count); + } + + KOKKOS_INLINE_FUNCTION + void _decrement_ready_count() { + // TODO @tasking @memory_order DSH memory order + Kokkos::atomic_decrement(&this->m_ready_count); + Kokkos::memory_fence(); + } + + public: + KOKKOS_INLINE_FUNCTION + bool is_done() const noexcept { + // TODO @tasking @memory_order DSH Memory order, instead of volatile + return (*(volatile int*)(&m_ready_count)) == 0; + } + + KOKKOS_INLINE_FUNCTION + int32_t ready_count() const noexcept { + // TODO @tasking @memory_order DSH Memory order, instead of volatile + return (*(volatile int*)(&m_ready_count)); + } + + template <class TaskQueueTraits, class TeamSchedulerInfo> + KOKKOS_FUNCTION void complete(RunnableTaskBase<TaskQueueTraits>&& task, + TeamSchedulerInfo const& info) { + if (task.get_respawn_flag()) { + _self().schedule_runnable(std::move(task), info); + } else { + _complete_finished_task(std::move(task), info); + } + // A runnable task was popped from a ready queue finished executing. + // If respawned into a ready queue then the ready count was incremented + // so decrement whether respawned or not. If finished, all of the + // tasks waiting on this have been enqueued (either in the ready queue + // or the next waiting queue, in the case of an aggregate), and the + // ready count has been incremented for each of those, preventing + // quiescence. Thus, it's safe to decrement the ready count here. + // TODO @tasking @memory_order DSH memory order? 
(probably release) + _decrement_ready_count(); + } + + template <class TaskQueueTraits, class SchedulingInfo, + class TeamSchedulerInfo> + KOKKOS_FUNCTION void complete( + AggregateTask<TaskQueueTraits, SchedulingInfo>&& task, + TeamSchedulerInfo const& info) { + // TODO @tasking DSH old code has a ifndef __HIP_DEVICE_COMPILE__ here; + // figure out why + _complete_finished_task(std::move(task), info); + } + + // </editor-fold> end Task and queue completion }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="Scheduling"> {{{2 + + public: + // This isn't actually generic; the template parameters are just to keep + // Derived from having to be complete + template <class TaskQueueTraits, class ReadyQueueType, + class TeamSchedulerInfo> + KOKKOS_INLINE_FUNCTION void schedule_runnable_to_queue( + RunnableTaskBase<TaskQueueTraits>&& task, ReadyQueueType& ready_queue, + TeamSchedulerInfo const& info) { + bool task_is_ready = true; + bool scheduling_info_updated = false; + + // do this before enqueueing and potentially losing exclusive access to task + bool task_is_respawning = task.get_respawn_flag(); + + // clear the respawn flag, since we're handling the respawn (if any) here. + // We must make sure this is written through the cache, since the next + // thread to access it might be a Cuda thread from a different thread block. 
+ ((RunnableTaskBase<TaskQueueTraits> volatile&)task).set_respawn_flag(false); + + if (task.has_predecessor()) { + // save the predecessor into a local variable, then clear it from the + // task before adding it to the wait queue of the predecessor + // (We have exclusive access to the task's predecessor, so we don't need + // to do this atomically) + // TODO @tasking @internal_documentation DSH document that we expect + // exclusive access to `task` in this function + auto& predecessor = task.get_predecessor(); + // This needs a load/store fence here, technically + // making this a release store would also do this + ((RunnableTaskBase<TaskQueueTraits> volatile&)task).clear_predecessor(); + + // TODO @tasking @memory_order DSH remove this fence in favor of memory + // orders + Kokkos::memory_fence(); // for now + + // Try to add the task to the predecessor's waiting queue. If it fails, + // the predecessor is already done + bool predecessor_not_ready = predecessor.try_add_waiting(task); + + // NOTE: if the predecessor was not ready and the task was enqueued, + // we've lost exclusive access and should nt touch task again + + // If the predecessor is not done, then task is not ready + task_is_ready = !predecessor_not_ready; + + if (task_is_ready && predecessor.is_runnable()) { + // this is our last chance to update the scheduling info before + // predecessor is potentially deleted + _self().update_scheduling_info_from_completed_predecessor( + /* ready_task = */ task, + /* predecessor = */ predecessor.as_runnable_task()); + scheduling_info_updated = true; + } + + if (task_is_respawning) { + // Reference count for predecessor was incremented when + // respawn called set_dependency() + // so that if predecessor completed prior to the + // above try_add_waiting(), predecessor would not be destroyed. + // predecessor reference count can now be decremented, + // which may deallocate it. 
+ bool should_delete = predecessor.decrement_and_check_reference_count(); + if (should_delete) { + // TODO @tasking @cleanup DSH better encapsulation of this! + _self().deallocate(std::move(predecessor)); + } + } + // Note! predecessor may be destroyed at this point, so don't add anything + // here + } + + if (scheduling_info_updated) { + // We need to go back to the queue itself and see if it wants to schedule + // somewhere else + _self().schedule_runnable(std::move(task), info); + } + // Put it in the appropriate ready queue if it's ready + else if (task_is_ready) { + // Increment the ready count + _self()._increment_ready_count(); + // and enqueue the task + // (can't move because the task isn't expired unless the push succeeds + bool push_success = ready_queue.push(task); + if (!push_success) { + _self().handle_failed_ready_queue_insertion(std::move(task), + ready_queue, info); + } + } + + // Task may be enqueued and may be run at any point; don't touch it (hence + // the use of move semantics) + } + + template <class TaskQueueTraits, class ReadyQueueType, + class TeamSchedulerInfo> + KOKKOS_INLINE_FUNCTION void handle_failed_ready_queue_insertion( + RunnableTaskBase<TaskQueueTraits>&& /*task*/, + ReadyQueueType& /*ready_queue*/, TeamSchedulerInfo const& /*info*/) { + Kokkos::abort("Unhandled failure of ready task queue insertion!\n"); + } + + // This isn't actually generic; the template parameters are just to keep + // Derived from having to be complete + template <class TaskQueueTraits, class SchedulingInfo, + class TeamSchedulerInfo> + KOKKOS_FUNCTION void schedule_aggregate( + AggregateTask<TaskQueueTraits, SchedulingInfo>&& aggregate, + TeamSchedulerInfo const& info) { + // Because the aggregate is being scheduled, should not be in any queue + KOKKOS_EXPECTS(!aggregate.is_enqueued()); + + using task_scheduling_info_type = + typename Derived::task_scheduling_info_type; + using team_scheduler_info_type = typename Derived::team_scheduler_info_type; + 
static_assert( + std::is_same<TeamSchedulerInfo, team_scheduler_info_type>::value, + "SchedulingInfo type mismatch!"); + + bool incomplete_dependence_found = false; + + for (auto*& predecessor_ptr_ref : aggregate) { + // if a previous scheduling operation hasn't already set the predecessor + // to nullptr, try to enqueue the aggregate into the predecessorendence's + // waiting queue + if (predecessor_ptr_ref != nullptr) { + // Swap the pointer onto the stack and set the one in the aggregate VLA + // to nullptr before we try to add it to the waiting queue so that some + // other thread doesn't also get to here and find the pointer to be + // not null (since as soon as we try and schedule the aggregate, we + // potentially lose exclusive access to it if that enqueueing operation + // succeeds. The swap doesn't need to happen atomically since we have + // exclusive access to aggregate until an insertion succeeds + auto* predecessor_ptr = std::move(predecessor_ptr_ref); + + // TODO @tasking @memory_order DSH I think this needs to be a store + // release so that it doesn't get reordered after the queue insertion + predecessor_ptr_ref = nullptr; + + // TODO @tasking @memory_order DSH remove this fence in favor of memory + // orders + Kokkos::memory_fence(); + + // If adding the aggregate to the waiting queue succeeds, the + // predecessor is not complete + bool pred_not_ready = predecessor_ptr->try_add_waiting(aggregate); + + // NOTE! 
At this point it is unsafe to access aggregate (unless the + // enqueueing failed, so we can't use move semantics to expire it) + + // we found an incomplete dependence, so we can't make task's successors + // ready yet + incomplete_dependence_found = pred_not_ready; + + if (!pred_not_ready) { + // A predecessor was done, and we didn't enqueue the aggregate + // Update the aggregate's scheduling info (we still have exclusive + // access to it here) + if (predecessor_ptr->is_runnable()) { + _self().update_scheduling_info_from_completed_predecessor( + aggregate, predecessor_ptr->as_runnable_task()); + } else { + KOKKOS_ASSERT(predecessor_ptr->is_aggregate()); + _self().update_scheduling_info_from_completed_predecessor( + aggregate, + (*predecessor_ptr) + .template as_aggregate<task_scheduling_info_type>()); + } + } + + // the reference count for the predecessor was incremented when we put + // it into the predecessor list, so decrement it here + bool should_delete = + predecessor_ptr->decrement_and_check_reference_count(); + if (should_delete) { + // TODO @tasking @cleanup DSH better encapsulation of this! + _self().deallocate(std::move(*predecessor_ptr)); + } + + // Stop the loop if we found an incomplete dependence + if (incomplete_dependence_found) break; + } + } + + // NOTE: it's not safe to access aggregate any more if an incomplete + // dependence was found, because some other thread could have already popped + // it off of another waiting queue + + if (!incomplete_dependence_found) { + // all of the predecessors were completed, so we can complete `task` + _self().complete(std::move(aggregate), info); + } + // Note!! task may have been deleted at this point, so don't add anything + // here! 
  }

  // Customization point: propagate scheduling info from a just-completed
  // runnable predecessor into a runnable task that is becoming ready.
  // The default simply copies the most recent predecessor's info.
  // Derived schedulers may override with a smarter merge policy.
  template <class TaskQueueTraits>
  KOKKOS_INLINE_FUNCTION void update_scheduling_info_from_completed_predecessor(
      RunnableTaskBase<TaskQueueTraits>& ready_task,
      RunnableTaskBase<TaskQueueTraits> const& predecessor) const {
    // by default, tell a ready task to use the scheduling info of its most
    // recent predecessor
    using task_scheduling_info_type =
        typename Derived::task_scheduling_info_type;
    ready_task.template scheduling_info_as<task_scheduling_info_type>() =
        predecessor.template scheduling_info_as<task_scheduling_info_type>();
  }

  // Customization point: runnable predecessor -> aggregate (when-all) task.
  // Default copies the predecessor's scheduling info into the aggregate.
  template <class SchedulingInfo, class TaskQueueTraits>
  KOKKOS_INLINE_FUNCTION void update_scheduling_info_from_completed_predecessor(
      AggregateTask<TaskQueueTraits, SchedulingInfo>& aggregate,
      RunnableTaskBase<TaskQueueTraits> const& predecessor) const {
    // by default, tell a ready task to use the scheduling info of its most
    // recent predecessor
    using task_scheduling_info_type =
        typename Derived::task_scheduling_info_type;
    aggregate.scheduling_info() =
        predecessor.template scheduling_info_as<task_scheduling_info_type>();
  }

  // Customization point: aggregate predecessor -> aggregate task.
  // Default copies the predecessor's scheduling info verbatim.
  template <class SchedulingInfo, class TaskQueueTraits>
  KOKKOS_INLINE_FUNCTION void update_scheduling_info_from_completed_predecessor(
      AggregateTask<TaskQueueTraits, SchedulingInfo>& aggregate,
      AggregateTask<TaskQueueTraits, SchedulingInfo> const& predecessor) const {
    // by default, tell a ready task to use the scheduling info of its most
    // recent predecessor
    aggregate.scheduling_info() = predecessor.scheduling_info();
  }

  // Customization point: aggregate predecessor -> runnable task.
  // Default copies the aggregate's scheduling info into the ready task.
  template <class SchedulingInfo, class TaskQueueTraits>
  KOKKOS_INLINE_FUNCTION void update_scheduling_info_from_completed_predecessor(
      RunnableTaskBase<TaskQueueTraits>& ready_task,
      AggregateTask<TaskQueueTraits, SchedulingInfo> const& predecessor) const {
    // by default, tell a ready task to use the scheduling info of its most
    // recent predecessor
    using task_scheduling_info_type =
        typename Derived::task_scheduling_info_type;
    ready_task.template scheduling_info_as<task_scheduling_info_type>() =
        predecessor.scheduling_info();
  }

  // Customization point invoked when a task is spawned with a predecessor;
  // the default scheduler carries no per-task info, so it does nothing.
  template <class TaskQueueTraits>
  KOKKOS_INLINE_FUNCTION void initialize_scheduling_info_from_predecessor(
      TaskNode<TaskQueueTraits>& /*task*/,
      TaskNode<TaskQueueTraits>& /*predecessor*/) const {
    /* do nothing by default */
  }

  // Customization point invoked when a task is created by a team scheduler;
  // does nothing by default.
  template <class TeamSchedulerInfo, class TaskQueueTraits>
  KOKKOS_INLINE_FUNCTION void
  initialize_scheduling_info_from_team_scheduler_info(
      TaskNode<TaskQueueTraits>& /*task*/,
      TeamSchedulerInfo const& /*info*/) const {
    /* do nothing by default */
  }

  // Size, in bytes, of the storage the scheduler needs for its queue object.
  // The static_assert pins the three space/pool parameters to the ones the
  // Derived scheduler was declared with.
  template <class ExecutionSpace, class MemorySpace, class MemoryPool>
  static /* KOKKOS_CONSTEXPR_14 */ size_t task_queue_allocation_size(
      ExecutionSpace const&, MemorySpace const&, MemoryPool const&)
  // requires Same<ExecutionSpace, typename Derived::execution_space>
  //            && Same<MemorySpace, typename Derived::memory_space>
  //            && Same<MemoryPool, typename Derived::memory_pool>
  {
    static_assert(
        std::is_same<ExecutionSpace,
                     typename Derived::execution_space>::value &&
            std::is_same<MemorySpace, typename Derived::memory_space>::value &&
            std::is_same<MemoryPool, typename Derived::memory_pool>::value,
        "Type mismatch in task_queue_allocation_size customization point");

    return sizeof(Derived);
  }

  // </editor-fold> end Scheduling }}}2
  //----------------------------------------------------------------------------
};

} /* namespace Impl */
} /* namespace Kokkos */

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif
/* #ifndef KOKKOS_IMPL_TASKQUEUECOMMON_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6e2481f93567a671a5ad66f3536b45c009286eca --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp @@ -0,0 +1,226 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_TASKQUEUEMEMORYMANAGER_HPP +#define KOKKOS_IMPL_TASKQUEUEMEMORYMANAGER_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_MemoryPool.hpp> + +#include <impl/Kokkos_TaskBase.hpp> +#include <impl/Kokkos_TaskResult.hpp> + +#include <impl/Kokkos_Memory_Fence.hpp> +#include <impl/Kokkos_Atomic_Increment.hpp> +#include <impl/Kokkos_OptionalRef.hpp> +#include <impl/Kokkos_LIFO.hpp> + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class ExecSpace, class MemorySpace, + class MemoryPool = + Kokkos::MemoryPool<Kokkos::Device<ExecSpace, MemorySpace>>> +class TaskQueueMemoryManager : public TaskQueueBase { + public: + using execution_space = ExecSpace; + using memory_space = MemorySpace; + using device_type = Kokkos::Device<execution_space, memory_space>; + using memory_pool = MemoryPool; + using allocation_size_type = size_t; + + private: + memory_pool m_pool; + // TODO @tasking @generalization DSH re-enable this with a flag in the 
type + // long m_accum_alloc = 0; + int m_count_alloc = 0; + int m_max_alloc = 0; + + struct _allocation_result { + bool success; + void* pointer; + }; + + KOKKOS_INLINE_FUNCTION + _allocation_result _do_pool_allocate(allocation_size_type requested_size) { + // KOKKOS_EXPECTS(requested_size >= 0); generates a warning when + // allocation_size_type is unsigned + if (requested_size == 0) { + return {true, nullptr}; + } else { + void* data = m_pool.allocate(static_cast<size_t>(requested_size)); + + // Kokkos::atomic_increment(&m_accum_alloc); // memory_order_relaxed + Kokkos::atomic_increment(&m_count_alloc); // memory_order_relaxed + // TODO @tasking @minor DSH make this thread safe? (otherwise, it's just + // an approximation, which is probably fine...) + if (m_max_alloc < m_count_alloc) m_max_alloc = m_count_alloc; + + return {data != nullptr, data}; + } + } + + template <class T, class... Args> + KOKKOS_INLINE_FUNCTION T* _do_contruct(void* allocated, + allocation_size_type allocated_size, + Args&&... args) { + static_assert(std::is_base_of<PoolAllocatedObjectBase<int32_t>, T>::value, + "TaskQueueMemoryManager can only allocate objects with " + "PoolAllocatedObjectBase base class"); + + // TODO @tasking DSH figure out why this isn't working + // static_assert( + // std::is_constructible<T, Args..., int32_t>::value, + // "TaskQueueMemoryManager can't construct object of the requested type + // from the " " allocation size and the given arguments" + //); + + auto rv = new (allocated) T(std::forward<Args>(args)..., allocated_size); + + // It feels like there should be a way to check this at compile-time + KOKKOS_ASSERT( + (intptr_t)(rv) == + (intptr_t)(static_cast<PoolAllocatedObjectBase<int32_t>*>(rv)) && + "PoolAllocatedObjectBase must be the first base class of the allocated " + "type"); + + return rv; + } + + public: + explicit TaskQueueMemoryManager(memory_pool const& pool) : m_pool(pool) {} + + template <class T, class... 
Args> + KOKKOS_FUNCTION T* allocate_and_construct(Args&&... args) + // requires + // std::is_base_of_v<PoolAllocatedObjectBase<typename + // memory_pool::size_type>, T> + // && std::is_constructible_v<T, Args&&..., allocation_size_type> + { + constexpr auto allocation_size = sizeof(T); + + auto result = _do_pool_allocate(allocation_size); + + KOKKOS_ASSERT(result.success && "Memory allocation failure"); + + auto rv = _do_contruct<T>(result.pointer, allocation_size, + std::forward<Args>(args)...); + + KOKKOS_ENSURES(intptr_t(rv) % alignof(T) == 0 && + "alignment not preserved!"); + + return rv; + } + + template <class T, class VLAValueType, class... Args> + KOKKOS_INLINE_FUNCTION T* allocate_and_construct_with_vla_emulation( + allocation_size_type n_vla_entries, Args&&... args) + // requires + // std::is_base_of_v<PoolAllocatedObjectBase<typename + // memory_pool::size_type>, T> + // && std::is_base_of<ObjectWithVLAEmulation<T, VLAValueType>, T>::value + // && std::is_constructible_v<T, allocation_size_type, Args&&...> + { + static_assert( + std::is_base_of<ObjectWithVLAEmulation<T, VLAValueType>, T>::value, + "Can't append emulated variable length array of type with greater " + "alignment than" + " the type to which the VLA is being appended"); + + using vla_emulation_base = ObjectWithVLAEmulation<T, VLAValueType>; + + auto const allocation_size = + vla_emulation_base::required_allocation_size(n_vla_entries); + auto result = _do_pool_allocate(allocation_size); + + KOKKOS_ASSERT(result.success && "Memory allocation failure"); + + auto rv = _do_contruct<T>(result.pointer, allocation_size, + std::forward<Args>(args)...); + + KOKKOS_ENSURES(intptr_t(rv) % alignof(T) == 0); + + return rv; + } + + template <class CountType> + KOKKOS_INLINE_FUNCTION void deallocate( + PoolAllocatedObjectBase<CountType>&& obj) { + m_pool.deallocate((void*)&obj, 1); + Kokkos::atomic_decrement(&m_count_alloc); // memory_order_relaxed + } + + KOKKOS_INLINE_FUNCTION + memory_pool& 
get_memory_pool() { return m_pool; } + KOKKOS_INLINE_FUNCTION + memory_pool const& get_memory_pool() const { return m_pool; } + + KOKKOS_INLINE_FUNCTION + int allocation_count() const noexcept { return m_count_alloc; } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//////////////////////////////////////////////////////////////////////////////// +// END OLD CODE +//////////////////////////////////////////////////////////////////////////////// + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKQUEUEMEMORYMANAGER_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp new file mode 100644 index 0000000000000000000000000000000000000000..efee3d051dc8fb4e112219527bb322404ff4dfe6 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp @@ -0,0 +1,269 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_TASKQUEUEMULTIPLE_HPP +#define KOKKOS_IMPL_TASKQUEUEMULTIPLE_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <Kokkos_MemoryPool.hpp> + +#include <impl/Kokkos_TaskBase.hpp> +#include <impl/Kokkos_TaskResult.hpp> +#include <impl/Kokkos_TaskQueue.hpp> + +#include <impl/Kokkos_Memory_Fence.hpp> +#include <impl/Kokkos_Atomic_Increment.hpp> +#include <impl/Kokkos_Atomic_Decrement.hpp> + +#include <string> +#include <typeinfo> +#include <stdexcept> +#include <cassert> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <typename 
ExecSpace,
          typename MemorySpace = typename ExecSpace::memory_space>
class LeagueQueueCollection;

// A per-team task queue that can also steal ready tasks from the queues of
// other teams in the league. The team with league rank 0 owns the collection
// of all other teams' queues.
template <class ExecSpace, class MemorySpace>
class TaskQueueMultiple : public TaskQueue<ExecSpace, MemorySpace> {
 private:
  using base_t = TaskQueue<ExecSpace, MemorySpace>;
  using queue_collection_t = LeagueQueueCollection<ExecSpace, MemorySpace>;

  int m_league_rank = static_cast<int>(KOKKOS_INVALID_INDEX);

  // This pointer is owning only if m_league_rank == 0
  queue_collection_t* m_other_queues = nullptr;

 public:
  // Deleter used by the shared-allocation record that holds the queue.
  struct Destroy {
    TaskQueueMultiple* m_queue;
    void destroy_shared_allocation();
  };

  using team_queue_type = TaskQueueMultiple;

  // Non-owning constructor used for ranks > 0: the collection pointer is
  // owned by the rank-zero queue.
  TaskQueueMultiple(int arg_league_rank, queue_collection_t* arg_other_queues,
                    typename base_t::memory_pool const& arg_memory_pool)
      : base_t(arg_memory_pool),
        m_league_rank(arg_league_rank),
        m_other_queues(arg_other_queues) {}

  // Rank-zero constructor: allocates and placement-constructs the league
  // queue collection in the queue's memory space.
  explicit TaskQueueMultiple(
      typename base_t::memory_pool const& arg_memory_pool)
      : base_t(arg_memory_pool), m_league_rank(0) {
    void* other_queues_buffer =
        typename base_t::memory_space{}.allocate(sizeof(queue_collection_t));
    m_other_queues = new (other_queues_buffer) queue_collection_t(this);
  }

  ~TaskQueueMultiple() {
    // Only rank zero owns the collection; explicit destructor call matches
    // the placement new in the rank-zero constructor.
    if (m_league_rank == 0 && m_other_queues != nullptr) {
      m_other_queues->~queue_collection_t();
      typename base_t::memory_space{}.deallocate(m_other_queues,
                                                 sizeof(queue_collection_t));
    }
    // rest of destruction is handled in the base class
  }

  //----------------------------------------

  void initialize_team_queues(int arg_league_size) const noexcept {
    m_other_queues->initialize_team_queues(arg_league_size, this->m_memory);
  }

  KOKKOS_INLINE_FUNCTION
  team_queue_type& get_team_queue(int arg_league_rank) noexcept {
    if (arg_league_rank == m_league_rank)
      return *this;
    else
      return m_other_queues->get_team_queue(arg_league_rank);
  }

  // Try to pop a ready task from some other team's queue. Returns:
  //   - a valid task pointer if a steal succeeded,
  //   - end_tag if some queue still had work but the pop lost a race,
  //   - nullptr only if every other queue's ready count was zero
  //     (quiescence).
  KOKKOS_INLINE_FUNCTION
  typename base_t::task_root_type* attempt_to_steal_task() noexcept {
    TaskBase* rv = nullptr;
    auto* const end_tag = reinterpret_cast<TaskBase*>(TaskBase::EndTag);

    if (m_other_queues == nullptr) {
      Kokkos::abort("attempted to steal task before queues were initialized!");
    }

    // Loop by priority and then type, and then team
    for (int i = 0; i < base_t::NumQueue; ++i) {
      for (int j = 0; j < 2; ++j) {
        // for now, always start by trying to steal from team zero
        for (int iteam = 0; iteam < m_other_queues->size(); ++iteam) {
          if (iteam == m_league_rank) continue;
          auto& steal_from = get_team_queue(iteam);
          // volatile read forces a fresh load of the other queue's counter
          if (*((volatile int*)&steal_from.m_ready_count) > 0) {
            // we've found at least one queue that's not done, so even if we
            // can't pop something off of it we shouldn't return a nullptr
            // indicating completion. rv will be end_tag when the pop fails
            rv = base_t::pop_ready_task(&steal_from.m_ready[i][j]);
            if (rv != end_tag) {
              // task stolen.
              // first increment our ready count, then decrement the ready
              // count on the other queue:
              Kokkos::atomic_increment(&this->m_ready_count);
              Kokkos::atomic_decrement(&steal_from.m_ready_count);
              return rv;
            }
          }
        }
      }
    }

    // at this point, rv will only be nullptr if *all* of the queues had an
    // m_ready_count of 0. This indicates quiescence. If at least some of them
    // had non-zero, there would have been at least one pop_ready_task that
    // was called and returned end_tag if it couldn't pop a task
    return rv;
  }
};

// Storage for the per-team queues of a league. Queue 0 lives outside the
// collection (it owns the collection); queues 1..N-1 are lazily constructed
// in-place in the union array below.
template <typename ExecSpace, typename MemorySpace>
class LeagueQueueCollection {
 private:
  using execution_space = ExecSpace;
  using memory_space = MemorySpace;
  using device_type = Kokkos::Device<execution_space, memory_space>;
  using memory_pool = Kokkos::MemoryPool<device_type>;
  using team_queue_type = TaskQueueMultiple<execution_space, memory_space>;
  using team_scheduler_type = BasicTaskScheduler<ExecSpace, team_queue_type>;
  using specialization = TaskQueueSpecialization<team_scheduler_type>;

  enum : long { max_num_queues = 6 };  // specialization::max_league_size };

  // this is a non-owning pointer
  team_queue_type* m_rank_zero_queue = nullptr;
  // This really needs to be an optional<TaskQueue<ExecSpace>>
  // (the union delays construction until initialize_team_queues)
  union optional_queue {
    KOKKOS_INLINE_FUNCTION
    optional_queue() : uninitialized(0) {}
    KOKKOS_INLINE_FUNCTION
    ~optional_queue() { uninitialized = 0; }
    char uninitialized;
    team_queue_type initialized;
  } m_queues[max_num_queues];
  int m_size = static_cast<int>(KOKKOS_INVALID_INDEX);

 public:
  LeagueQueueCollection() = delete;
  LeagueQueueCollection(LeagueQueueCollection const&) = delete;
  LeagueQueueCollection(LeagueQueueCollection&&) = delete;
  LeagueQueueCollection& operator=(LeagueQueueCollection const&) = delete;
  LeagueQueueCollection& operator=(LeagueQueueCollection&&) = delete;

  ~LeagueQueueCollection() {
    // destroy only the initialized queues that we own
    // (m_size counts queue 0, which is not stored here; slot i holds the
    // queue for rank i+1)
    for (int iteam = 0; iteam < m_size - 1; ++iteam) {
      m_queues[iteam].initialized.~team_queue_type();
      m_queues[iteam].uninitialized = 0;
    }
  }

  KOKKOS_INLINE_FUNCTION
  explicit LeagueQueueCollection(team_queue_type* arg_rank_zero_queue)
      : m_rank_zero_queue(arg_rank_zero_queue), m_size(1) {}

  void initialize_team_queues(int arg_count,
                              memory_pool const&
arg_memory_pool) noexcept { + arg_count = std::min((int)max_num_queues, arg_count); + // assert(arg_count <= max_num_queues); + if (arg_count > m_size) { + for (int i = m_size; i < arg_count; ++i) { + new (&m_queues[i - 1].initialized) + team_queue_type(i, this, arg_memory_pool); + } + m_size = arg_count; + } + } + + KOKKOS_INLINE_FUNCTION + constexpr int size() const noexcept { return m_size; } + + KOKKOS_INLINE_FUNCTION + constexpr bool initialized() const noexcept { + return m_size != int(KOKKOS_INVALID_INDEX); + } + + KOKKOS_INLINE_FUNCTION + team_queue_type& get_team_queue(int iteam) { + iteam %= max_num_queues; +#if !defined(__HIP_DEVICE_COMPILE__) && !defined(__CUDA_ARCH__) + assert(initialized()); + assert(iteam < m_size); + assert(iteam >= 0); +#endif + if (iteam == 0) + return *m_rank_zero_queue; + else + return m_queues[iteam - 1].initialized; + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include <impl/Kokkos_TaskQueueMultiple_impl.hpp> + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKQUEUEMULTIPLE_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple_impl.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple_impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1af97918b8a057444b9653202838a7dff06bb143 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple_impl.hpp @@ -0,0 +1,74 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_TASKQUEUEMULTIPLE_IMPL_HPP +#define KOKKOS_IMPL_TASKQUEUEMULTIPLE_IMPL_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <impl/Kokkos_TaskQueueMultiple.hpp> + +#define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING_MULTIPLE 0 + +namespace Kokkos { +namespace Impl { + +template <class ExecSpace, class MemorySpace> +void TaskQueueMultiple<ExecSpace, + MemorySpace>::Destroy::destroy_shared_allocation() { +// KOKKOS WORKAROUND for CUDA 10.1 with GCC 7.3.0 +#if (KOKKOS_COMPILER_CUDA_VERSION == 101) && defined(KOKKOS_COMPILER_NVCC) && \ + (KOKKOS_COMPILER_GNU >= 730) + (*m_queue).get_team_queue(0).~TaskQueueMultiple(); +#else + m_queue->get_team_queue(0).~TaskQueueMultiple(); +#endif +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKQUEUEMULTIPLE_IMPL_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a87e5f72721f95f06d1c9f90c17e92d0e1fec2fb --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp @@ -0,0 +1,658 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_IMPL_TASKQUEUE_IMPL_HPP
#define KOKKOS_IMPL_TASKQUEUE_IMPL_HPP
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_TASKDAG)

#define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING 0

namespace Kokkos {
namespace Impl {

//----------------------------------------------------------------------------

// Shared-allocation deleter: runs the queue's destructor in place.
template <typename ExecSpace, typename MemorySpace>
void TaskQueue<ExecSpace, MemorySpace>::Destroy::destroy_shared_allocation() {
  m_queue->~TaskQueue();
}

//----------------------------------------------------------------------------

// Construct the queue over an existing memory pool; every ready-queue slot
// starts at EndTag (the empty-queue sentinel).
template <typename ExecSpace, typename MemorySpace>
TaskQueue<ExecSpace, MemorySpace>::TaskQueue(
    typename TaskQueue<ExecSpace, MemorySpace>::memory_pool const
        &arg_memory_pool)
    : m_memory(arg_memory_pool),
      m_ready()
      //, m_accum_alloc(0)
      //, m_count_alloc(0)
      //, m_max_alloc(0)
      ,
      m_ready_count(0) {
  for (int i = 0; i < NumQueue; ++i) {
    m_ready[i][0] = (task_root_type *)task_root_type::EndTag;
    m_ready[i][1] = (task_root_type *)task_root_type::EndTag;
  }
}

//----------------------------------------------------------------------------

template <typename ExecSpace, typename MemorySpace>
TaskQueue<ExecSpace, MemorySpace>::~TaskQueue() {
  // Verify that queues are empty and ready count is zero

  for (int i = 0; i < NumQueue; ++i) {
    for (int j = 0; j < 2; ++j) {
      if (m_ready[i][j] != (task_root_type *)task_root_type::EndTag) {
        Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready tasks");
      }
    }
  }

  if (0 != m_ready_count) {
    Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
  }
}

//----------------------------------------------------------------------------

// Drop one reference to `task`; when the last reference is dropped and the
// task is complete (m_next == LockTag), return its storage to the pool.
// Note: `count` is the value *before* the decrement (atomic_fetch_add
// returns the prior value).
template <typename ExecSpace, typename MemorySpace>
KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::decrement(
    TaskQueue<ExecSpace, MemorySpace>::task_root_type *task) {
  task_root_type volatile &t = *task;

  const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count), -1);

#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
  if (1 == count) {
    printf("decrement-destroy( 0x%lx { 0x%lx %d %d } )\n", uintptr_t(task),
           uintptr_t(task->m_next), int(task->m_task_type),
           int(task->m_ref_count));
  }
#endif

  if ((1 == count) && (t.m_next == (task_root_type *)task_root_type::LockTag)) {
    // Reference count is zero and task is complete, deallocate.

    // TaskQueue< ExecSpace, MemorySpace> * const queue =
    //   static_cast<scheduler_type const *>( t.m_scheduler )->m_queue;
    auto *const volatile queue = static_cast<TaskQueue *>(t.m_queue);

    // TODO @tasking @minor DSH this should call the destructor for a
    // non-trivially destructible type (possibly just ignore this in the old
    // version, though?) (Can't just do this; it needs to be queued since it's
    // device code if(task->m_destroy) task->m_destroy(task);

    queue->deallocate(task, t.m_alloc_size);
  } else if (count <= 1) {
    // count was already <= 0 (negative refcount) or the task's last
    // reference vanished while it was still incomplete — both are bugs.
    Kokkos::abort(
        "TaskScheduler task has negative reference count or is incomplete");
  }
}

//----------------------------------------------------------------------------

template <typename ExecSpace, typename MemorySpace>
KOKKOS_FUNCTION size_t
TaskQueue<ExecSpace, MemorySpace>::allocate_block_size(size_t n) {
  return m_memory.allocate_block_size(n);
}

// Allocate `n` bytes from the queue's pool, tracking the live-allocation
// count for the leak check in the destructor path.
template <typename ExecSpace, typename MemorySpace>
KOKKOS_FUNCTION void *TaskQueue<ExecSpace, MemorySpace>::allocate(size_t n) {
  void *const p = m_memory.allocate(n);

  if (p) {
    // Kokkos::atomic_increment( & m_accum_alloc );
    Kokkos::atomic_increment(&m_count_alloc);

    // if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
  }

  return p;
}

template <typename ExecSpace, typename MemorySpace>
KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::deallocate(void *p,
                                                                   size_t n) {
  m_memory.deallocate(p, n);
  Kokkos::atomic_decrement(&m_count_alloc);
}

//----------------------------------------------------------------------------

template <typename ExecSpace, typename MemorySpace>
KOKKOS_FUNCTION bool TaskQueue<ExecSpace, MemorySpace>::push_task(
    TaskQueue<ExecSpace, MemorySpace>::task_root_type *volatile *const queue,
    TaskQueue<ExecSpace, MemorySpace>::task_root_type *const task) {
  // Push task into a concurrently pushed and popped queue.
  // The queue can be either a ready task queue or a waiting task queue.
  // The queue is a linked list where 'task->m_next' form the links.
  // Fail the push attempt if the queue is locked;
  // otherwise retry until the push succeeds.

#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
  printf("push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n",
         uintptr_t(queue), uintptr_t(*queue), uintptr_t(task),
         uintptr_t(task->m_wait), uintptr_t(task->m_next), task->m_task_type,
         task->m_priority, task->m_ref_count);
#endif

  task_root_type *const zero = nullptr;
  task_root_type *const lock = (task_root_type *)task_root_type::LockTag;

  task_root_type *volatile &next = task->m_next;

  if (zero != next) {
    Kokkos::abort(
        "TaskQueue::push_task ERROR: already a member of another queue");
  }

  // store the head of the queue
  task_root_type *old_head = *queue;

  while (old_head != lock) {
    // set task->next to the head of the queue
    next = old_head;

    // Do not proceed until 'next' has been stored.
    Kokkos::memory_fence();

    // store the old head
    task_root_type *const old_head_tmp = old_head;

    // attempt to swap task with the old head of the queue
    // as if this were done atomically:
    //   if(*queue == old_head) {
    //     *queue = task;
    //   }
    //   old_head = *queue;
    old_head = Kokkos::atomic_compare_exchange(queue, old_head, task);

    if (old_head_tmp == old_head) return true;
  }

  // Failed (queue is locked), replace 'task->m_next' value since 'task'
  // remains not a member of a queue.

  next = zero;

  // Do not proceed until 'next' has been stored.
  Kokkos::memory_fence();

  return false;
}

//----------------------------------------------------------------------------

template <typename ExecSpace, typename MemorySpace>
KOKKOS_FUNCTION typename TaskQueue<ExecSpace, MemorySpace>::task_root_type *
TaskQueue<ExecSpace, MemorySpace>::pop_ready_task(
    TaskQueue<ExecSpace, MemorySpace>::task_root_type *volatile *const queue) {
  // Pop task from a concurrently pushed and popped ready task queue.
  // The queue is a linked list where 'task->m_next' form the links.

  task_root_type *const lock = (task_root_type *)task_root_type::LockTag;
  task_root_type *const end = (task_root_type *)task_root_type::EndTag;

  // *queue is
  //   end   => an empty queue
  //   lock  => a locked queue
  //   valid

  // Retry until the lock is acquired or the queue is empty.

  task_root_type *task = *queue;

  while (end != task) {
    // The only possible values for the queue are
    // (1) lock, (2) end, or (3) a valid task.
    // Thus zero will never appear in the queue.
    //
    // If queue is locked then just read by guaranteeing the CAS will fail.

    if (lock == task) task = nullptr;

    task_root_type *const x = task;

    task = Kokkos::atomic_compare_exchange(queue, x, lock);

    if (x == task) {
      // CAS succeeded and queue is locked
      //
      // This thread has locked the queue and removed 'task' from the queue.
      // Extract the next entry of the queue from 'task->m_next'
      // and mark 'task' as popped from a queue by setting
      // 'task->m_next = lock'.
      //
      // Place the next entry in the head of the queue,
      // which also unlocks the queue.
      //
      // This algorithm is not lock-free: a thread preempted here holds the
      // queue locked, stalling other poppers (because an adversarial
      // scheduler could park this thread indefinitely at this point).
      //
      // This thread has exclusive access to
      // the queue and the popped task's m_next.
+ + task_root_type *volatile &next = task->m_next; + + // This algorithm is not lockfree because a adversarial scheduler could + // context switch this thread at this point and the rest of the threads + // calling this method would never make forward progress + + *queue = next; + next = lock; + + Kokkos::memory_fence(); + +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING + printf("pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n", + uintptr_t(queue), uintptr_t(task), uintptr_t(task->m_wait), + uintptr_t(task->m_next), int(task->m_task_type), + int(task->m_priority), int(task->m_ref_count)); +#endif + + return task; + } + } + + return end; +} + +//---------------------------------------------------------------------------- + +template <typename ExecSpace, typename MemorySpace> +KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::schedule_runnable( + TaskQueue<ExecSpace, MemorySpace>::task_root_type *const task) { + // Schedule a runnable task upon construction / spawn + // and upon completion of other tasks that 'task' is waiting on. 
+ // + // Precondition: + // - called by a single thread for the input task + // - calling thread has exclusive access to the task + // - task is not a member of a queue + // - if runnable then task is either constructing or respawning + // + // Constructing state: + // task->m_wait == 0 + // task->m_next == dependence or 0 + // Respawn state: + // task->m_wait == head of linked list: 'end' or valid task + // task->m_next == dependence or 0 + // + // Task state transition: + // Constructing -> Waiting + // Respawn -> Waiting + // + // Postcondition on task state: + // task->m_wait == head of linked list (queue) + // task->m_next == member of linked list (queue) + +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING + printf("schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n", uintptr_t(task), + uintptr_t(task->m_wait), uintptr_t(task->m_next), task->m_task_type, + task->m_priority, task->m_ref_count); +#endif + + task_root_type *const zero = nullptr; + task_root_type *const lock = (task_root_type *)task_root_type::LockTag; + task_root_type *const end = (task_root_type *)task_root_type::EndTag; + + task_root_type volatile &t = *task; + + bool respawn = false; + + //---------------------------------------- + + if (zero == t.m_wait) { + // Task in Constructing state + // - Transition to Waiting state + // Preconditions: + // - call occurs exclusively within a single thread + + t.m_wait = end; + // Task in Waiting state + } else if (lock != t.m_wait) { + // Task in Executing state with Respawn request + // - Update dependence + // - Transition to Waiting state + respawn = true; + } else { + // Task in Complete state + Kokkos::abort("TaskQueue::schedule_runnable ERROR: task is complete"); + } + + //---------------------------------------- + // Scheduling a runnable task which may have a depencency 'dep'. + // Extract dependence, if any, from task->m_next. + // If 'dep' is not null then attempt to push 'task' + // into the wait queue of 'dep'. 
+ // If the push succeeds then 'task' may be + // processed or executed by another thread at any time. + // If the push fails then 'dep' is complete and 'task' + // is ready to execute. + + // Exclusive access so don't need an atomic exchange + // task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero ); + task_root_type *dep = t.m_next; + t.m_next = zero; + + Kokkos::memory_fence(); + + // If we don't have a dependency, or if pushing onto the wait queue of that + // dependency failed (since the only time that queue should be locked is when + // the task is transitioning to complete?) + const bool is_ready = (nullptr == dep) || (!push_task(&dep->m_wait, task)); + + if ((nullptr != dep) && respawn) { + // Reference count for dep was incremented when + // respawn assigned dependency to task->m_next + // so that if dep completed prior to the + // above push_task dep would not be destroyed. + // dep reference count can now be decremented, + // which may deallocate the task. + TaskQueue::assign(&dep, nullptr); + } + + if (is_ready) { + // No dependence or 'dep' is complete so push task into ready queue. + // Increment the ready count before pushing into ready queue + // to track number of ready + executing tasks. + // The ready count will be decremented when the task is complete. + + Kokkos::atomic_increment(&m_ready_count); + + task_root_type *volatile *const ready_queue = + &m_ready[t.m_priority][t.m_task_type]; + + // A push_task fails if the ready queue is locked. + // A ready queue is only locked during a push or pop; + // i.e., it is never permanently locked. + // Retry push to ready queue until it succeeds. + // When the push succeeds then 'task' may be + // processed or executed by another thread at any time. + + while (!push_task(ready_queue, task)) + ; + } + + //---------------------------------------- + // Postcondition: + // - A runnable 'task' was pushed into a wait or ready queue. 
+ // - Concurrent execution may have already popped 'task' + // from a queue and processed it as appropriate. +} + +template <typename ExecSpace, typename MemorySpace> +KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::schedule_aggregate( + TaskQueue<ExecSpace, MemorySpace>::task_root_type *const task) { + // Schedule an aggregate task upon construction + // and upon completion of other tasks that 'task' is waiting on. + // + // Precondition: + // - called by a single thread for the input task + // - calling thread has exclusive access to the task + // - task is not a member of a queue + // + // Constructing state: + // task->m_wait == 0 + // task->m_next == dependence or 0 + // + // Task state transition: + // Constructing -> Waiting + // + // Postcondition on task state: + // task->m_wait == head of linked list (queue) + // task->m_next == member of linked list (queue) + +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING + printf("schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d %d }\n", + uintptr_t(task), uintptr_t(task->m_wait), uintptr_t(task->m_next), + task->m_dep_count, task->m_task_type, task->m_priority, + task->m_ref_count); +#endif + + task_root_type *const zero = nullptr; + task_root_type *const lock = (task_root_type *)task_root_type::LockTag; + task_root_type *const end = (task_root_type *)task_root_type::EndTag; + + task_root_type volatile &t = *task; + + //---------------------------------------- + + if (zero == t.m_wait) { + // Task in Constructing state + // - Transition to Waiting state + // Preconditions: + // - call occurs exclusively within a single thread + + t.m_wait = end; + // Task in Waiting state + } else if (lock == t.m_wait) { + // Task in Complete state + Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete"); + } + + //---------------------------------------- + // Scheduling a 'when_all' task with multiple dependences. 
+ // This scheduling may be called when the 'when_all' is + // (1) created or + // (2) being removed from a completed task's wait list. + + task_root_type *volatile *const aggr = t.aggregate_dependences(); + + // Assume the 'when_all' is complete until a dependence is + // found that is not complete. + + bool is_complete = true; + + for (int i = t.m_dep_count; 0 < i && is_complete;) { + --i; + + // Loop dependences looking for an incomplete task. + // Add this task to the incomplete task's wait queue. + + // Remove a task 'x' from the dependence list. + // The reference count of 'x' was incremented when + // it was assigned into the dependence list. + + // Exclusive access so don't need an atomic exchange + // task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero ); + task_root_type *x = aggr[i]; + aggr[i] = zero; + + if (x) { + // If x->m_wait is not locked then push succeeds + // and the aggregate is not complete. + // If the push succeeds then this when_all 'task' may be + // processed by another thread at any time. + // For example, 'x' may be completeed by another + // thread and then re-schedule this when_all 'task'. + + is_complete = !push_task(&x->m_wait, task); + + // Decrement reference count which had been incremented + // when 'x' was added to the dependence list. + + TaskQueue::assign(&x, zero); + } + } + + if (is_complete) { + // The when_all 'task' was not added to a wait queue because + // all dependences were complete so this aggregate is complete. + // Complete the when_all 'task' to schedule other tasks + // that are waiting for the when_all 'task' to complete. + + t.m_next = lock; + + complete(task); + + // '*task' may have been deleted upon completion + } + + //---------------------------------------- + // Postcondition: + // - An aggregate 'task' was either pushed to a wait queue or completed. + // - Concurrent execution may have already popped 'task' + // from a queue and processed it as appropriate. 
+} + +//---------------------------------------------------------------------------- + +template <typename ExecSpace, typename MemorySpace> +KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::reschedule( + task_root_type *task) { + // Precondition: + // task is in Executing state + // task->m_next == LockTag + // + // Postcondition: + // task is in Executing-Respawn state + // task->m_next == 0 (no dependence) + + task_root_type *const zero = nullptr; + task_root_type *const lock = (task_root_type *)task_root_type::LockTag; + + if (lock != Kokkos::atomic_exchange(&task->m_next, zero)) { + Kokkos::abort("TaskScheduler::respawn ERROR: already respawned"); + } +} + +//---------------------------------------------------------------------------- + +template <typename ExecSpace, typename MemorySpace> +KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete( + TaskQueue<ExecSpace, MemorySpace>::task_root_type *task) { + // Complete a runnable task that has finished executing + // or a when_all task when all of its dependeneces are complete. + + task_root_type *const zero = nullptr; + task_root_type *const lock = (task_root_type *)task_root_type::LockTag; + task_root_type *const end = (task_root_type *)task_root_type::EndTag; + +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING + printf("complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n", uintptr_t(task), + uintptr_t(task->m_wait), uintptr_t(task->m_next), task->m_task_type, + task->m_priority, task->m_ref_count); +#endif + + task_root_type volatile &t = *task; + + const bool runnable = task_root_type::Aggregate != t.m_task_type; + + //---------------------------------------- + + if (runnable && lock != t.m_next) { + // Is a runnable task has finished executing and requested respawn. + // Schedule the task for subsequent execution. + + schedule_runnable(task); + } + //---------------------------------------- + else { + // Is either an aggregate or a runnable task that executed + // and did not respawn. 
Transition this task to complete. + + // If 'task' is an aggregate then any of the runnable tasks that + // it depends upon may be attempting to complete this 'task'. + // Must only transition a task once to complete status. + // This is controlled by atomically locking the wait queue. + + // Stop other tasks from adding themselves to this task's wait queue + // by locking the head of this task's wait queue. + + task_root_type *x = Kokkos::atomic_exchange(&t.m_wait, lock); + + if (x != (task_root_type *)lock) { + // This thread has transitioned this 'task' to complete. + // 'task' is no longer in a queue and is not executing + // so decrement the reference count from 'task's creation. + // If no other references to this 'task' then it will be deleted. + + TaskQueue::assign(&task, zero); + + // This thread has exclusive access to the wait list so + // the concurrency-safe pop_ready_task function is not needed. + // Schedule the tasks that have been waiting on the input 'task', + // which may have been deleted. + + while (x != end) { + // Have exclusive access to 'x' until it is scheduled + // Set x->m_next = zero <= no dependence, not a respawn + + task_root_type volatile &vx = *x; + + task_root_type *const next = vx.m_next; + vx.m_next = nullptr; + + Kokkos::memory_fence(); + + if (task_root_type::Aggregate != vx.m_task_type) { + schedule_runnable(x); + } else { +#if !defined(__HIP_DEVICE_COMPILE__) + schedule_aggregate(x); +#endif + } + + x = next; + } + } + } + + if (runnable) { + // A runnable task was popped from a ready queue and executed. + // If respawned into a ready queue then the ready count was incremented + // so decrement whether respawned or not. 
+ Kokkos::atomic_decrement(&m_ready_count); + } +} + +//---------------------------------------------------------------------------- + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKQUEUE_IMPL_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp new file mode 100644 index 0000000000000000000000000000000000000000..40a9c3bf57cfbe36e5a2646b963b71338c410cf7 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskResult.hpp @@ -0,0 +1,144 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// Experimental unified task-data parallel manycore LDRD + +#ifndef KOKKOS_IMPL_TASKRESULT_HPP +#define KOKKOS_IMPL_TASKRESULT_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_TaskScheduler_fwd.hpp> +#include <Kokkos_Core_fwd.hpp> + +#include <impl/Kokkos_TaskBase.hpp> +#include <impl/Kokkos_TaskNode.hpp> + +#include <string> +#include <typeinfo> +#include <stdexcept> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <typename ResultType> +struct TaskResult { + enum : int32_t { size = sizeof(ResultType) }; + + using reference_type = ResultType&; + + template <class CountType> + KOKKOS_INLINE_FUNCTION static ResultType* ptr( + PoolAllocatedObjectBase<CountType>* task) { + return reinterpret_cast<ResultType*>(reinterpret_cast<char*>(task) + + task->get_allocation_size() - + sizeof(ResultType)); + } + + KOKKOS_INLINE_FUNCTION static ResultType* ptr(TaskBase* task) { + return reinterpret_cast<ResultType*>(reinterpret_cast<char*>(task) + + task->m_alloc_size - + sizeof(ResultType)); + } + + KOKKOS_INLINE_FUNCTION static reference_type get(TaskBase* task) { + return *ptr(task); + } + 
+ template <class TaskQueueTraits> + KOKKOS_INLINE_FUNCTION static reference_type get( + TaskNode<TaskQueueTraits>* task) { + return *ptr(task); + } + + KOKKOS_INLINE_FUNCTION static void destroy(TaskBase* task) { + get(task).~ResultType(); + } + + // template <class TaskQueueTraits> + // KOKKOS_INLINE_FUNCTION static + // void destroy( TaskNode<TaskQueueTraits>* task ) + //{ get(task).~ResultType(); } +}; + +template <> +struct TaskResult<void> { + enum : int32_t { size = 0 }; + + using reference_type = void; + + template <class TaskQueueTraits> + KOKKOS_INLINE_FUNCTION static void* ptr(TaskNode<TaskQueueTraits>* /*task*/) { + return nullptr; + } + + KOKKOS_INLINE_FUNCTION static void* ptr(TaskBase*) { return nullptr; } + + template <class TaskQueueTraits> + KOKKOS_INLINE_FUNCTION static reference_type get( + TaskNode<TaskQueueTraits>* /*task*/) { /* Should never be called */ + } + + KOKKOS_INLINE_FUNCTION static reference_type get(TaskBase*) {} + + KOKKOS_INLINE_FUNCTION static void destroy(TaskBase* /*task*/) {} + + // template <class TaskQueueTraits> + // KOKKOS_INLINE_FUNCTION static + // void destroy( TaskNode<TaskQueueTraits>* task ) + //{ } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_TASKRESULT_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2faab5794907ecd43ccead5f74e09e414a8f3541 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp @@ -0,0 +1,128 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TASKTEAMMEMBER_HPP +#define KOKKOS_TASKTEAMMEMBER_HPP + +//---------------------------------------------------------------------------- + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_TaskScheduler_fwd.hpp> +//---------------------------------------------------------------------------- + +#include <Kokkos_MemoryPool.hpp> +#include <impl/Kokkos_Tags.hpp> + +#include <Kokkos_Future.hpp> +#include <impl/Kokkos_TaskQueue.hpp> +#include <impl/Kokkos_SingleTaskQueue.hpp> +#include <impl/Kokkos_TaskQueueMultiple.hpp> +#include <impl/Kokkos_TaskPolicyData.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class TeamMember, class Scheduler> +class TaskTeamMemberAdapter : public TeamMember { + private: + Scheduler m_scheduler; + + public: + //---------------------------------------- + + // Forward everything but the Scheduler to the constructor of the TeamMember + // type that we're adapting + template <typename... Args> + KOKKOS_INLINE_FUNCTION explicit TaskTeamMemberAdapter( + typename std::enable_if<std::is_constructible<TeamMember, Args...>::value, + Scheduler>::type arg_scheduler, + Args&&... 
args) // TODO @tasking @minor DSH noexcept specification + : TeamMember(std::forward<Args>(args)...), + m_scheduler( + std::move(arg_scheduler).get_team_scheduler(this->league_rank())) {} + + // (rule of 6 constructors) + + KOKKOS_DEFAULTED_FUNCTION + TaskTeamMemberAdapter() = default; + + KOKKOS_DEFAULTED_FUNCTION + TaskTeamMemberAdapter(TaskTeamMemberAdapter const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + TaskTeamMemberAdapter(TaskTeamMemberAdapter&&) = default; + + KOKKOS_DEFAULTED_FUNCTION + TaskTeamMemberAdapter& operator=(TaskTeamMemberAdapter const&) = default; + + KOKKOS_DEFAULTED_FUNCTION + TaskTeamMemberAdapter& operator=(TaskTeamMemberAdapter&&) = default; + + KOKKOS_DEFAULTED_FUNCTION ~TaskTeamMemberAdapter() = default; + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + Scheduler const& scheduler() const noexcept { return m_scheduler; } + + KOKKOS_INLINE_FUNCTION + Scheduler& scheduler() noexcept { return m_scheduler; } + + //---------------------------------------- +}; + +} // end namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_TASKTEAMMEMBER_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Timer.hpp b/packages/kokkos/core/src/impl/Kokkos_Timer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e8004ff85258975d3f36ee2a9345414e27ea09ad --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Timer.hpp @@ -0,0 +1,63 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPLWALLTIME_HPP +#define KOKKOS_IMPLWALLTIME_HPP + +#include <Kokkos_Timer.hpp> + +namespace Kokkos { +namespace Impl { + +/** \brief Time since construction + * Timer promoted from Impl to Kokkos ns + * This file included for backwards compatibility + */ + +using Kokkos::Timer; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Tools.hpp b/packages/kokkos/core/src/impl/Kokkos_Tools.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8d6ec64685ee51ed2320e31bf7b8ee535d5f2c28 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Tools.hpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/** + * Header file to include all of Kokkos Tooling support + */ + +#ifndef KOKKOS_IMPL_KOKKOS_TOOLS_HPP +#define KOKKOS_IMPL_KOKKOS_TOOLS_HPP + +#include <impl/Kokkos_Profiling.hpp> + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Traits.hpp b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d88230f5b247829dbf6e8ee79b111cb2d1309118 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp @@ -0,0 +1,344 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSTRAITS_HPP +#define KOKKOSTRAITS_HPP + +#include <cstddef> +#include <cstdint> +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_BitOps.hpp> +#include <string> +#include <type_traits> + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +// Help with C++11 variadic argument packs + +template <unsigned I, typename... Pack> +struct get_type { + using type = void; +}; + +template <typename T, typename... 
Pack> +struct get_type<0, T, Pack...> { + using type = T; +}; + +template <unsigned I, typename T, typename... Pack> +struct get_type<I, T, Pack...> { + using type = typename get_type<I - 1, Pack...>::type; +}; + +template <typename T, typename... Pack> +struct has_type { + enum : bool { value = false }; +}; + +template <typename T, typename S, typename... Pack> +struct has_type<T, S, Pack...> { + private: + enum { self_value = std::is_same<T, S>::value }; + + using next = has_type<T, Pack...>; + + static_assert( + !(self_value && next::value), + "Error: more than one member of the argument pack matches the type"); + + public: + enum : bool { value = self_value || next::value }; +}; + +template <typename DefaultType, template <typename> class Condition, + typename... Pack> +struct has_condition { + enum : bool { value = false }; + using type = DefaultType; +}; + +template <typename DefaultType, template <typename> class Condition, typename S, + typename... Pack> +struct has_condition<DefaultType, Condition, S, Pack...> { + private: + enum { self_value = Condition<S>::value }; + + using next = has_condition<DefaultType, Condition, Pack...>; + + static_assert( + !(self_value && next::value), + "Error: more than one member of the argument pack satisfies condition"); + + public: + enum : bool { value = self_value || next::value }; + + using type = + typename std::conditional<self_value, S, typename next::type>::type; +}; + +template <class... Args> +struct are_integral { + enum : bool { value = true }; +}; + +template <typename T, class... Args> +struct are_integral<T, Args...> { + enum { + value = + // Accept std::is_integral OR std::is_enum as an integral value + // since a simple enum value is automically convertible to an + // integral value. 
+ (std::is_integral<T>::value || std::is_enum<T>::value) && + are_integral<Args...>::value + }; +}; + +//---------------------------------------------------------------------------- +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Other traits + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +// if_ + +template <bool Cond, typename TrueType, typename FalseType> +struct if_c { + enum : bool { value = Cond }; + + using type = FalseType; + + using value_type = typename std::remove_const< + typename std::remove_reference<type>::type>::type; + + using const_value_type = typename std::add_const<value_type>::type; + + static KOKKOS_INLINE_FUNCTION const_value_type& select(const_value_type& v) { + return v; + } + + static KOKKOS_INLINE_FUNCTION value_type& select(value_type& v) { return v; } + + template <class T> + static KOKKOS_INLINE_FUNCTION value_type& select(const T&) { + value_type* ptr(0); + return *ptr; + } + + template <class T> + static KOKKOS_INLINE_FUNCTION const_value_type& select(const T&, + const_value_type& v) { + return v; + } + + template <class T> + static KOKKOS_INLINE_FUNCTION value_type& select(const T&, value_type& v) { + return v; + } +}; + +template <typename TrueType, typename FalseType> +struct if_c<true, TrueType, FalseType> { + enum : bool { value = true }; + + using type = TrueType; + + using value_type = typename std::remove_const< + typename std::remove_reference<type>::type>::type; + + using const_value_type = typename std::add_const<value_type>::type; + + static KOKKOS_INLINE_FUNCTION const_value_type& select(const_value_type& v) { + return v; + } + + static KOKKOS_INLINE_FUNCTION value_type& select(value_type& v) { return v; } + + template <class T> + static KOKKOS_INLINE_FUNCTION value_type& select(const 
T&) { + value_type* ptr(0); + return *ptr; + } + + template <class F> + static KOKKOS_INLINE_FUNCTION const_value_type& select(const_value_type& v, + const F&) { + return v; + } + + template <class F> + static KOKKOS_INLINE_FUNCTION value_type& select(value_type& v, const F&) { + return v; + } +}; + +template <typename TrueType> +struct if_c<false, TrueType, void> { + enum : bool { value = false }; + + using type = void; + using value_type = void; +}; + +template <typename FalseType> +struct if_c<true, void, FalseType> { + enum : bool { value = true }; + + using type = void; + using value_type = void; +}; + +//---------------------------------------------------------------------------- +// These 'constexpr'functions can be used as +// both regular functions and meta-function. + +/**\brief There exists integral 'k' such that N = 2^k */ +KOKKOS_INLINE_FUNCTION +constexpr bool is_integral_power_of_two(const size_t N) { + return (0 < N) && (0 == (N & (N - 1))); +} + +/**\brief Return integral 'k' such that N = 2^k, assuming valid. */ +KOKKOS_INLINE_FUNCTION +constexpr unsigned integral_power_of_two_assume_valid(const size_t N) { + return N == 1 ? 0 : 1 + integral_power_of_two_assume_valid(N >> 1); +} + +/**\brief Return integral 'k' such that N = 2^k, if exists. + * If does not exist return ~0u. + */ +KOKKOS_INLINE_FUNCTION +constexpr unsigned integral_power_of_two(const size_t N) { + return is_integral_power_of_two(N) ? 
integral_power_of_two_assume_valid(N) + : ~0u; +} + +//---------------------------------------------------------------------------- + +template <size_t N> +struct is_power_of_two { + enum type { value = (N > 0) && !(N & (N - 1)) }; +}; + +template <size_t N, bool OK = is_power_of_two<N>::value> +struct power_of_two; + +template <size_t N> +struct power_of_two<N, true> { + enum type { value = 1 + power_of_two<(N >> 1), true>::value }; +}; + +template <> +struct power_of_two<2, true> { + enum type { value = 1 }; +}; + +template <> +struct power_of_two<1, true> { + enum type { value = 0 }; +}; + +/** \brief If power of two then return power, + * otherwise return ~0u. + */ +KOKKOS_FORCEINLINE_FUNCTION +unsigned power_of_two_if_valid(const unsigned N) { + unsigned p = ~0u; + if (is_integral_power_of_two(N)) { + p = bit_scan_forward(N); + } + return p; +} + +//---------------------------------------------------------------------------- + +template <typename T, T v, bool NonZero = (v != T(0))> +struct integral_nonzero_constant { + // Declaration of 'static const' causes an unresolved linker symbol in debug + // static const T value = v ; + enum { value = T(v) }; + using value_type = T; + using type = integral_nonzero_constant<T, v>; + KOKKOS_INLINE_FUNCTION integral_nonzero_constant(const T&) {} +}; + +template <typename T, T zero> +struct integral_nonzero_constant<T, zero, false> { + const T value; + using value_type = T; + using type = integral_nonzero_constant<T, 0>; + KOKKOS_INLINE_FUNCTION integral_nonzero_constant(const T& v) : value(v) {} +}; + +//---------------------------------------------------------------------------- + +template <class T> +struct make_all_extents_into_pointers { + using type = T; +}; + +template <class T, unsigned N> +struct make_all_extents_into_pointers<T[N]> { + using type = typename make_all_extents_into_pointers<T>::type*; +}; + +template <class T> +struct make_all_extents_into_pointers<T*> { + using type = typename 
make_all_extents_into_pointers<T>::type*; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOSTRAITS_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cb8cf281ae06fe0a71862b47428a2ffa12f4bd67 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp @@ -0,0 +1,167 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_IMPL_UTILITIES_HPP +#define KOKKOS_CORE_IMPL_UTILITIES_HPP + +#include <Kokkos_Macros.hpp> +#include <cstdint> +#include <type_traits> +#include <initializer_list> // in-order comma operator fold emulation +#include <utility> // integer_sequence and friends + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <typename T> +struct identity { + using type = T; +}; + +template <typename T> +using identity_t = typename identity<T>::type; + +struct not_a_type { + not_a_type() = delete; + ~not_a_type() = delete; + not_a_type(not_a_type const&) = delete; + void operator=(not_a_type const&) = delete; +}; + +#if defined(__cpp_lib_void_t) +// since C++17 +using std::void_t; +#else +template <class...> +using void_t = void; +#endif + +//============================================================================== +// <editor-fold desc="remove_cvref_t"> {{{1 + +#if defined(__cpp_lib_remove_cvref) +// since C++20 +using std::remove_cvref; +using std::remove_cvref_t; +#else +template <class T> +struct remove_cvref { + using type = std::remove_cv_t<std::remove_reference_t<T>>; +}; + +template <class T> +using 
remove_cvref_t = typename remove_cvref<T>::type; +#endif + +// </editor-fold> end remove_cvref_t }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="is_specialization_of"> {{{1 + +template <class Type, template <class...> class Template, class Enable = void> +struct is_specialization_of : std::false_type {}; + +template <template <class...> class Template, class... Args> +struct is_specialization_of<Template<Args...>, Template> : std::true_type {}; + +// </editor-fold> end is_specialization_of }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="Folding emulation"> {{{1 + +// acts like void for comma fold emulation +struct _fold_comma_emulation_return {}; + +template <class... Ts> +constexpr KOKKOS_INLINE_FUNCTION _fold_comma_emulation_return +emulate_fold_comma_operator(Ts&&...) noexcept { + return _fold_comma_emulation_return{}; +} + +#define KOKKOS_IMPL_FOLD_COMMA_OPERATOR(expr) \ + ::Kokkos::Impl::emulate_fold_comma_operator( \ + ::std::initializer_list<::Kokkos::Impl::_fold_comma_emulation_return>{ \ + ((expr), ::Kokkos::Impl::_fold_comma_emulation_return{})...}) + +// </editor-fold> end Folding emulation }}}1 +//============================================================================== + +//============================================================================== +// destruct_delete is a unique_ptr deleter for objects +// created by placement new into already allocated memory +// by only calling the destructor on the object. +// +// Because unique_ptr never calls its deleter with a nullptr value, +// no need to check if p == nullptr. 
+// +// Note: This differs in interface from std::default_delete in that the +// function call operator is templated instead of the class, to make +// it easier to use and disallow specialization. +struct destruct_delete { + template <typename T> + KOKKOS_INLINE_FUNCTION constexpr void operator()(T* p) const noexcept { + p->~T(); + } +}; +//============================================================================== + +//============================================================================== +// <editor-fold desc="type_list"> {{{1 + +// An intentionally uninstantiateable type_list for metaprogramming purposes +template <class...> +struct type_list; + +// </editor-fold> end type_list }}}1 +//============================================================================== + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_CORE_IMPL_UTILITIES_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp b/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp new file mode 100644 index 0000000000000000000000000000000000000000..41607a2a8e7fedc56fd92fdfa9a472bda10bc547 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp @@ -0,0 +1,284 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_VLAEMULATION_HPP +#define KOKKOS_IMPL_VLAEMULATION_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) + +#include <Kokkos_Core_fwd.hpp> + +#include <impl/Kokkos_Error.hpp> // KOKKOS_EXPECTS + +#include <type_traits> // std::is_abstract<>, ... 
+ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class Derived, class VLAValueType, class EntryCountType = int32_t> +struct ObjectWithVLAEmulation; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +/** @brief Attorney to enable private CRTP inheritance from + * ObjectWithVLAEmulation + */ +struct VLAEmulationAccess { + private: + template <class, class, class> + friend struct ObjectWithVLAEmulation; + + template <class Derived, class VLAValueType, class EntryCountType> + KOKKOS_FORCEINLINE_FUNCTION static constexpr Derived* _cast_to_derived( + ObjectWithVLAEmulation<Derived, VLAValueType, EntryCountType>* + base) noexcept { + return static_cast<Derived*>(base); + } + + template <class Derived, class VLAValueType, class EntryCountType> + KOKKOS_FORCEINLINE_FUNCTION static constexpr Derived const* _cast_to_derived( + ObjectWithVLAEmulation<Derived, VLAValueType, EntryCountType> const* + base) noexcept { + return static_cast<Derived const*>(base); + } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +/** \brief A CRTP base class for a type that includes a variable-length array by + * allocation + * + * The storage for the derived type must be allocated manually and the objects + * (both derived type and VLA objects) must be constructed with placement new. + * Obviously, this can't be done for objects on the stack. + * + * Note: Though most uses of this currently delete the copy and move + * constructor in the `Derived` type, this type is intended to have value + * semantics. 
+ * + * \todo @documentation elaborate on implications of value semantics for this + * class template + * + */ +template <class Derived, class VLAValueType, + class EntryCountType /* = int32_t */ + > +struct ObjectWithVLAEmulation { + public: + using object_type = Derived; + using vla_value_type = VLAValueType; + using vla_entry_count_type = EntryCountType; + + using iterator = VLAValueType*; + using const_iterator = typename std::add_const<VLAValueType>::type*; + + // TODO @tasking @minor DSH require that Derived be marked final? (note that + // std::is_final is C++14) + // TODO @tasking @minor DSH delete non-placement operator new for Derived + // type? + + private: + vla_entry_count_type m_num_entries; + + // CRTP boilerplate + + KOKKOS_FORCEINLINE_FUNCTION + /* KOKKOS_CONSTEXPR_14 */ + Derived* _this() noexcept { + return VLAEmulationAccess::_cast_to_derived(this); + } + + KOKKOS_FORCEINLINE_FUNCTION + /* KOKKOS_CONSTEXPR_14 */ + Derived const* _this() const noexcept { + return VLAEmulationAccess::_cast_to_derived(this); + } + + // Note: can't be constexpr because of reinterpret_cast + KOKKOS_FORCEINLINE_FUNCTION + /* KOKKOS_CONSTEXPR_14 */ + vla_value_type* _vla_pointer() noexcept { + // The data starts right after the aligned storage of Derived + return reinterpret_cast<vla_value_type*>(_this() + 1); + } + + // Note: can't be constexpr because of reinterpret_cast + KOKKOS_FORCEINLINE_FUNCTION + /* KOKKOS_CONSTEXPR_14 */ + vla_value_type const* _vla_pointer() const noexcept { + // The data starts right after the aligned storage of Derived + return reinterpret_cast<vla_value_type const*>(_this() + 1); + } + + public: + KOKKOS_INLINE_FUNCTION + static /* KOKKOS_CONSTEXPR_14 */ size_t required_allocation_size( + vla_entry_count_type num_vla_entries) { + KOKKOS_EXPECTS(num_vla_entries >= 0); + return sizeof(Derived) + num_vla_entries * sizeof(VLAValueType); + } + + //---------------------------------------------------------------------------- + // <editor-fold 
desc="Constructors, destructor, and assignment"> {{{2 + + // TODO @tasking @optimization DSH specialization for trivially constructible + // VLAValueType? + // TODO @tasking @minor DSH SFINAE-out this constructor for non-default + // contructible vla_value_types + KOKKOS_INLINE_FUNCTION + explicit ObjectWithVLAEmulation(vla_entry_count_type num_entries) noexcept( + noexcept(vla_value_type())) + : m_num_entries(num_entries) { + // Note: We can't do this at class scope because it unnecessarily requires + // object_type to be a complete type + static_assert(alignof(object_type) >= alignof(vla_value_type), + "Can't append emulated variable length array of type with " + "greater alignment than" + " the type to which the VLA is being appended"); + + // Note: We can't do this at class scope because it unnecessarily requires + // vla_value_type to be a complete type + static_assert(!std::is_abstract<vla_value_type>::value, + "Can't use abstract type with VLA emulation"); + + KOKKOS_EXPECTS(num_entries >= 0); + for (vla_entry_count_type i = 0; i < m_num_entries; ++i) { + new (_vla_pointer() + i) vla_value_type(); + } + } + + KOKKOS_INLINE_FUNCTION + ~ObjectWithVLAEmulation() { + for (auto&& value : *this) { + value.~vla_value_type(); + } + } + + // TODO @tasking @new_feature DSH constrained analogs for move and copy ctors + // and assignment ops + // TODO @tasking @new_feature DSH forwarding in_place constructor + // TODO @tasking @new_feature DSH initializer_list constructor? 
+ + // </editor-fold> end Constructors, destructor, and assignment }}}2 + //---------------------------------------------------------------------------- + + KOKKOS_INLINE_FUNCTION + constexpr EntryCountType n_vla_entries() const noexcept { + return m_num_entries; + } + + //---------------------------------------------------------------------------- + // <editor-fold desc="Accessing the object and the VLA values"> {{{2 + + KOKKOS_INLINE_FUNCTION + object_type& object() & { return static_cast<Derived&>(*this); } + + KOKKOS_INLINE_FUNCTION + object_type const& object() const& { + return static_cast<Derived const&>(*this); + } + + KOKKOS_INLINE_FUNCTION + object_type&& object() && { return static_cast<Derived&&>(*this); } + + KOKKOS_INLINE_FUNCTION + vla_value_type& vla_value_at(vla_entry_count_type n) & { + KOKKOS_EXPECTS(n < n_vla_entries()); + return _vla_pointer()[n]; + } + + KOKKOS_INLINE_FUNCTION + vla_value_type const& vla_value_at(vla_entry_count_type n) const& { + KOKKOS_EXPECTS(n < n_vla_entries()); + return _vla_pointer()[n]; + } + + KOKKOS_INLINE_FUNCTION + vla_value_type& vla_value_at(vla_entry_count_type n) && { + KOKKOS_EXPECTS(n < n_vla_entries()); + return _vla_pointer()[n]; + } + + // </editor-fold> end Accessing the object and the VLA values }}}2 + //---------------------------------------------------------------------------- + + //---------------------------------------------------------------------------- + // <editor-fold desc="Iterators"> {{{2 + + KOKKOS_INLINE_FUNCTION + iterator begin() noexcept { return _vla_pointer(); } + + KOKKOS_INLINE_FUNCTION + const_iterator begin() const noexcept { return _vla_pointer(); } + + KOKKOS_INLINE_FUNCTION + const_iterator cbegin() noexcept { return _vla_pointer(); } + + KOKKOS_INLINE_FUNCTION + iterator end() noexcept { return _vla_pointer() + m_num_entries; } + + KOKKOS_INLINE_FUNCTION + const_iterator end() const noexcept { return _vla_pointer() + m_num_entries; } + + KOKKOS_INLINE_FUNCTION + const_iterator 
cend() noexcept { return _vla_pointer() + m_num_entries; } + + // </editor-fold> end Iterators }}}2 + //---------------------------------------------------------------------------- +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ +#endif /* #ifndef KOKKOS_IMPL_VLAEMULATION_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7adbe4690d7a914a00e37ced88a4c0312a9557a1 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp @@ -0,0 +1,627 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP +#define KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP + +#include <Kokkos_Array.hpp> + +namespace Kokkos { +namespace Impl { + +template <class DataType, class ArrayLayout, class V, size_t N, class P> +struct ViewDataAnalysis<DataType, ArrayLayout, Kokkos::Array<V, N, P>> { + private: + using array_analysis = ViewArrayAnalysis<DataType>; + + static_assert(std::is_same<P, void>::value, ""); + static_assert(std::is_same<typename array_analysis::non_const_value_type, + Kokkos::Array<V, N, P>>::value, + ""); + static_assert(std::is_scalar<V>::value, + "View of Array type must be of a scalar type"); + + public: + using specialize = Kokkos::Array<>; + + using dimension = typename array_analysis::dimension; + + private: + enum { + is_const = std::is_same<typename array_analysis::value_type, + typename array_analysis::const_value_type>::value + }; + + using array_scalar_dimension = typename dimension::template append<N>::type; + + using scalar_type = typename std::conditional<is_const, const V, V>::type; + using 
non_const_scalar_type = V; + using const_scalar_type = const V; + + public: + using value_type = typename array_analysis::value_type; + using const_value_type = typename array_analysis::const_value_type; + using non_const_value_type = typename array_analysis::non_const_value_type; + + using type = typename ViewDataType<value_type, dimension>::type; + using const_type = typename ViewDataType<const_value_type, dimension>::type; + using non_const_type = + typename ViewDataType<non_const_value_type, dimension>::type; + + using scalar_array_type = + typename ViewDataType<scalar_type, array_scalar_dimension>::type; + using const_scalar_array_type = + typename ViewDataType<const_scalar_type, array_scalar_dimension>::type; + using non_const_scalar_array_type = + typename ViewDataType<non_const_scalar_type, + array_scalar_dimension>::type; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief View mapping for non-specialized data type and standard layout */ +template <class Traits> +class ViewMapping<Traits, Kokkos::Array<>> { + private: + template <class, class...> + friend class ViewMapping; + template <class, class...> + friend class Kokkos::View; + + using offset_type = ViewOffset<typename Traits::dimension, + typename Traits::array_layout, void>; + + using handle_type = typename Traits::value_type::pointer; + + handle_type m_impl_handle; + offset_type m_impl_offset; + size_t m_stride = 0; + + using scalar_type = typename Traits::value_type::value_type; + + using contiguous_reference = Kokkos::Array<scalar_type, (~std::size_t(0)), + Kokkos::Array<>::contiguous>; + using strided_reference = + Kokkos::Array<scalar_type, (~std::size_t(0)), Kokkos::Array<>::strided>; + + enum { + is_contiguous_reference = + (Traits::rank == 0) || (std::is_same<typename 
Traits::array_layout, + Kokkos::LayoutRight>::value) + }; + + enum { Array_N = Traits::value_type::size() }; + enum { Array_S = is_contiguous_reference ? Array_N : 1 }; + + KOKKOS_INLINE_FUNCTION + ViewMapping(const handle_type &arg_handle, const offset_type &arg_offset) + : m_impl_handle(arg_handle), + m_impl_offset(arg_offset), + m_stride(is_contiguous_reference ? 0 : arg_offset.span()) {} + + public: + //---------------------------------------- + // Domain dimensions + + enum { Rank = Traits::dimension::rank }; + + template <typename iType> + KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType &r) const { + return m_impl_offset.m_dim.extent(r); + } + + KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() + const { + return m_impl_offset.layout(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { + return m_impl_offset.dimension_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { + return m_impl_offset.dimension_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { + return m_impl_offset.dimension_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { + return m_impl_offset.dimension_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { + return m_impl_offset.dimension_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { + return m_impl_offset.dimension_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { + return m_impl_offset.dimension_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { + return m_impl_offset.dimension_7(); + } + + // Is a regular layout with uniform striding for each index. 
+ using is_regular = typename offset_type::is_regular; + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_impl_offset.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_impl_offset.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_impl_offset.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_impl_offset.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_impl_offset.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_impl_offset.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_impl_offset.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_impl_offset.stride_7(); + } + + //---------------------------------------- + // Range span + + /** \brief Span of the mapped range */ + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { + return m_impl_offset.span() * Array_N; + } + + /** \brief Is the mapped range span contiguous */ + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { + return m_impl_offset.span_is_contiguous(); + } + + using reference_type = + typename std::conditional<is_contiguous_reference, contiguous_reference, + strided_reference>::type; + + using pointer_type = handle_type; + + /** \brief If data references are lvalue_reference than can query pointer to + * memory */ + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { + return m_impl_handle; + } + + //---------------------------------------- + // The View class performs all rank and bounds checking before + // calling these element reference methods. 
+
+  // Each overload maps an index tuple through the offset map, scales by the
+  // embedded array extent Array_S, and wraps the result in the proxy
+  // reference_type (handle, array length Array_N, stride between array
+  // elements m_stride).
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference() const {
+    // Rank-0 view: the whole allocation is one Array; stride 0 is used
+    // because the proxy walks a single packed Array.
+    return reference_type(m_impl_handle + 0, Array_N, 0);
+  }
+
+  template <typename I0>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0) const {
+    return reference_type(m_impl_handle + m_impl_offset(i0) * Array_S, Array_N,
+                          m_stride);
+  }
+
+  template <typename I0, typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0,
+                                                       const I1 &i1) const {
+    return reference_type(m_impl_handle + m_impl_offset(i0, i1) * Array_S,
+                          Array_N, m_stride);
+  }
+
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0,
+                                                       const I1 &i1,
+                                                       const I2 &i2) const {
+    return reference_type(m_impl_handle + m_impl_offset(i0, i1, i2) * Array_S,
+                          Array_N, m_stride);
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type
+  reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3) const {
+    return reference_type(
+        m_impl_handle + m_impl_offset(i0, i1, i2, i3) * Array_S, Array_N,
+        m_stride);
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3, typename I4>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0 &i0,
+                                                       const I1 &i1,
+                                                       const I2 &i2,
+                                                       const I3 &i3,
+                                                       const I4 &i4) const {
+    return reference_type(
+        m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4) * Array_S, Array_N,
+        m_stride);
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3, typename I4,
+            typename I5>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type
+  reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3,
+            const I4 &i4, const I5 &i5) const {
+    return reference_type(
+        m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5) * Array_S,
+        Array_N, m_stride);
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3, typename I4,
+            typename I5, typename I6>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type
+  reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3,
+            const I4 &i4, const I5 &i5, const I6 &i6) const {
+    return reference_type(
+        m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6) * Array_S,
+        Array_N, m_stride);
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3, typename I4,
+            typename I5, typename I6, typename I7>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type
+  reference(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3,
+            const I4 &i4, const I5 &i5, const I6 &i6, const I7 &i7) const {
+    return reference_type(
+        m_impl_handle + m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7) * Array_S,
+        Array_N, m_stride);
+  }
+
+  //----------------------------------------
+
+ private:
+  enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ };
+  enum { MemorySpanSize = sizeof(scalar_type) };
+
+ public:
+  /** \brief Span, in bytes, of the referenced memory, rounded up to an
+   *         8-byte boundary via MemorySpanMask. */
+  KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const {
+    return (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) &
+           ~size_t(MemorySpanMask);
+  }
+
+  //----------------------------------------
+
+  KOKKOS_DEFAULTED_FUNCTION ViewMapping() = default;
+
+  //----------------------------------------
+
+  // Wrap an existing allocation: the offset map is built unpadded
+  // (integral_constant<unsigned, 0>) from the given extents, and m_stride
+  // caches the resulting span (distance between successive array slots).
+  template <class... Args>
+  KOKKOS_INLINE_FUNCTION ViewMapping(pointer_type ptr, Args... args)
+      : m_impl_handle(ptr),
+        m_impl_offset(std::integral_constant<unsigned, 0>(), args...),
+        m_stride(m_impl_offset.span()) {}
+
+  //----------------------------------------
+
+  template <class...
P>
+  // Allocate and (optionally) initialize memory for this mapping using the
+  // properties in arg_prop; returns the tracking record owning the memory.
+  Kokkos::Impl::SharedAllocationRecord<> *allocate_shared(
+      Kokkos::Impl::ViewCtorProp<P...> const &arg_prop,
+      typename Traits::array_layout const &arg_layout) {
+    using alloc_prop = Kokkos::Impl::ViewCtorProp<P...>;
+
+    using execution_space = typename alloc_prop::execution_space;
+    using memory_space    = typename Traits::memory_space;
+    using functor_type    = ViewValueFunctor<execution_space, scalar_type>;
+    using record_type =
+        Kokkos::Impl::SharedAllocationRecord<memory_space, functor_type>;
+
+    // Query the mapping for byte-size of allocation.
+    // Padding is one scalar element when the ctor properties allow it.
+    using padding = std::integral_constant<
+        unsigned int, alloc_prop::allow_padding ? sizeof(scalar_type) : 0>;
+
+    m_impl_offset = offset_type(padding(), arg_layout);
+
+    // Byte size rounded up to the 8-byte MemorySpanMask boundary,
+    // scaled by the embedded array extent Array_N.
+    const size_t alloc_size =
+        (m_impl_offset.span() * Array_N * MemorySpanSize + MemorySpanMask) &
+        ~size_t(MemorySpanMask);
+    const auto &alloc_name =
+        static_cast<Kokkos::Impl::ViewCtorProp<void, std::string> const &>(
+            arg_prop)
+            .value;
+    // Allocate memory from the memory space and create tracking record.
+    record_type *const record = record_type::allocate(
+        static_cast<Kokkos::Impl::ViewCtorProp<void, memory_space> const &>(
+            arg_prop)
+            .value,
+        alloc_name, alloc_size);
+
+    if (alloc_size) {
+      m_impl_handle =
+          handle_type(reinterpret_cast<pointer_type>(record->data()));
+
+      if (alloc_prop::initialize) {
+        // The functor constructs and destroys
+        record->m_destroy = functor_type(
+            static_cast<Kokkos::Impl::ViewCtorProp<void, execution_space> const
+                            &>(arg_prop)
+                .value,
+            (pointer_type)m_impl_handle, m_impl_offset.span() * Array_N,
+            alloc_name);
+
+        record->m_destroy.construct_shared_allocation();
+      }
+    }
+
+    return record;
+  }
+};
+
+/** \brief Assign Array to non-Array
+ *
+ *  Maps a View specialized on Kokkos::Array<> to its plain scalar
+ *  "array_type" View of one higher rank (the Array extent becomes the
+ *  last dimension).  Enabled only when memory spaces match and both
+ *  layouts are one of Left/Right/Stride. */
+
+template <class DstTraits, class SrcTraits>
+class ViewMapping<
+    DstTraits, SrcTraits,
+    typename std::enable_if<(
+        std::is_same<typename DstTraits::memory_space,
+                     typename SrcTraits::memory_space>::value &&
+        std::is_same<typename DstTraits::specialize, void>::value &&
+        (std::is_same<typename DstTraits::array_layout,
+                      Kokkos::LayoutLeft>::value ||
+         std::is_same<typename DstTraits::array_layout,
+                      Kokkos::LayoutRight>::value ||
+         std::is_same<typename DstTraits::array_layout,
+                      Kokkos::LayoutStride>::value) &&
+        std::is_same<typename SrcTraits::specialize, Kokkos::Array<>>::value &&
+        (std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutLeft>::value ||
+         std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutRight>::value ||
+         std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutStride>::value))>::type> {
+ public:
+  // Can only convert to View::array_type
+
+  enum {
+    is_assignable_data_type =
+        std::is_same<typename DstTraits::data_type,
+                     typename SrcTraits::scalar_array_type>::value &&
+        (DstTraits::rank == SrcTraits::rank + 1)
+  };
+  enum {
+    is_assignable =
+        std::is_same<typename DstTraits::data_type,
+                     typename SrcTraits::scalar_array_type>::value &&
+        std::is_same<typename DstTraits::array_layout,
+                     typename SrcTraits::array_layout>::value
+  };
+
+  using TrackType = Kokkos::Impl::SharedAllocationTracker;
+  using DstType   = ViewMapping<DstTraits, void>;
+  using SrcType   = ViewMapping<SrcTraits, Kokkos::Array<>>;
+
+  // Rebuild the destination (scalar) offset map from the source (Array)
+  // mapping and share the same data handle; no data is copied.
+  KOKKOS_INLINE_FUNCTION
+  static void assign(DstType &dst, const SrcType &src,
+                     const TrackType & /*src_track*/) {
+    static_assert(is_assignable, "Can only convert to array_type");
+
+    using dst_offset_type = typename DstType::offset_type;
+
+    // Array dimension becomes the last dimension.
+    // Arguments beyond the destination rank are ignored.
+    if (src.span_is_contiguous()) {  // not padded
+      dst.m_impl_offset = dst_offset_type(
+          std::integral_constant<unsigned, 0>(),
+          typename DstTraits::array_layout(
+              (0 < SrcType::Rank ? src.dimension_0()
+                                 : SrcTraits::value_type::size()),
+              (1 < SrcType::Rank ? src.dimension_1()
+                                 : SrcTraits::value_type::size()),
+              (2 < SrcType::Rank ? src.dimension_2()
+                                 : SrcTraits::value_type::size()),
+              (3 < SrcType::Rank ? src.dimension_3()
+                                 : SrcTraits::value_type::size()),
+              (4 < SrcType::Rank ? src.dimension_4()
+                                 : SrcTraits::value_type::size()),
+              (5 < SrcType::Rank ? src.dimension_5()
+                                 : SrcTraits::value_type::size()),
+              (6 < SrcType::Rank ? src.dimension_6()
+                                 : SrcTraits::value_type::size()),
+              (7 < SrcType::Rank ? src.dimension_7()
+                                 : SrcTraits::value_type::size())));
+    } else {  // is padded
+      // Padding granularity is the scalar inside the embedded Array.
+      using padded = std::integral_constant<
+          unsigned int, sizeof(typename SrcTraits::value_type::value_type)>;
+
+      dst.m_impl_offset = dst_offset_type(
+          padded(), typename DstTraits::array_layout(
+                        (0 < SrcType::Rank ? src.dimension_0()
+                                           : SrcTraits::value_type::size()),
+                        (1 < SrcType::Rank ? src.dimension_1()
+                                           : SrcTraits::value_type::size()),
+                        (2 < SrcType::Rank ? src.dimension_2()
+                                           : SrcTraits::value_type::size()),
+                        (3 < SrcType::Rank ? src.dimension_3()
+                                           : SrcTraits::value_type::size()),
+                        (4 < SrcType::Rank ? src.dimension_4()
+                                           : SrcTraits::value_type::size()),
+                        (5 < SrcType::Rank ? src.dimension_5()
+                                           : SrcTraits::value_type::size()),
+                        (6 < SrcType::Rank ? src.dimension_6()
+                                           : SrcTraits::value_type::size()),
+                        (7 < SrcType::Rank ? src.dimension_7()
+                                           : SrcTraits::value_type::size())));
+    }
+
+    dst.m_impl_handle = src.m_impl_handle;
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Subview mapping for Array-specialized views: given subview argument pack
+// Args..., computes the resulting rank, layout, and data type.
+template <class SrcTraits, class... Args>
+class ViewMapping<
+    typename std::enable_if<(
+        std::is_same<typename SrcTraits::specialize, Kokkos::Array<>>::value &&
+        (std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutLeft>::value ||
+         std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutRight>::value ||
+         std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutStride>::value))>::type,
+    SrcTraits, Args...> {
+ private:
+  static_assert(SrcTraits::rank == sizeof...(Args), "");
+
+  // R<k>: whether subview argument k is an index range (keeps the rank)
+  // rather than a single integral index (drops the rank).
+  enum : bool {
+    R0 = is_integral_extent<0, Args...>::value,
+    R1 = is_integral_extent<1, Args...>::value,
+    R2 = is_integral_extent<2, Args...>::value,
+    R3 = is_integral_extent<3, Args...>::value,
+    R4 = is_integral_extent<4, Args...>::value,
+    R5 = is_integral_extent<5, Args...>::value,
+    R6 = is_integral_extent<6, Args...>::value,
+    R7 = is_integral_extent<7, Args...>::value
+  };
+
+  enum {
+    rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) +
+           unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7)
+  };
+
+  // Whether right-most rank is a range.
+  enum {
+    R0_rev =
+        0 == SrcTraits::rank
+            ? false
+            : (1 == SrcTraits::rank
+                   ? R0
+                   : (2 == SrcTraits::rank
+                          ? R1
+                          : (3 == SrcTraits::rank
+                                 ? R2
+                                 : (4 == SrcTraits::rank
+                                        ? R3
+                                        : (5 == SrcTraits::rank
+                                               ? R4
+                                               : (6 == SrcTraits::rank
+                                                      ? R5
+                                                      : (7 == SrcTraits::rank
+                                                             ?
R6
+                                                             : R7)))))))
+  };
+
+  // Subview's layout: the source layout survives only when the subview is
+  // rank 0, or rank <= 2 with the left-most (LayoutLeft) or right-most
+  // (LayoutRight) rank kept as a range; otherwise it is LayoutStride.
+  using array_layout =
+      typename std::conditional<((rank == 0) ||
+                                 (rank <= 2 && R0 &&
+                                  std::is_same<typename SrcTraits::array_layout,
+                                               Kokkos::LayoutLeft>::value) ||
+                                 (rank <= 2 && R0_rev &&
+                                  std::is_same<typename SrcTraits::array_layout,
+                                               Kokkos::LayoutRight>::value)),
+                                typename SrcTraits::array_layout,
+                                Kokkos::LayoutStride>::type;
+
+  using value_type = typename SrcTraits::value_type;
+
+  // value_type with one '*' per surviving rank.
+  using data_type = typename std::conditional<
+      rank == 0, value_type,
+      typename std::conditional<
+          rank == 1, value_type *,
+          typename std::conditional<
+              rank == 2, value_type **,
+              typename std::conditional<
+                  rank == 3, value_type ***,
+                  typename std::conditional<
+                      rank == 4, value_type ****,
+                      typename std::conditional<
+                          rank == 5, value_type *****,
+                          typename std::conditional<
+                              rank == 6, value_type ******,
+                              typename std::conditional<
+                                  rank == 7, value_type *******,
+                                  value_type ********>::type>::type>::type>::
+                      type>::type>::type>::type>::type;
+
+ public:
+  using traits_type = Kokkos::ViewTraits<data_type, array_layout,
+                                         typename SrcTraits::device_type,
+                                         typename SrcTraits::memory_traits>;
+
+  using type =
+      Kokkos::View<data_type, array_layout, typename SrcTraits::device_type,
+                   typename SrcTraits::memory_traits>;
+
+  // Build the subview mapping: derive the sub-extents from the argument
+  // pack, then offset the handle to the subview's origin element.
+  KOKKOS_INLINE_FUNCTION
+  static void assign(ViewMapping<traits_type, void> &dst,
+                     ViewMapping<SrcTraits, void> const &src, Args... args) {
+    using DstType = ViewMapping<traits_type, void>;
+
+    using dst_offset_type = typename DstType::offset_type;
+    using dst_handle_type = typename DstType::handle_type;
+
+    const SubviewExtents<SrcTraits::rank, rank> extents(src.m_impl_offset.m_dim,
+                                                        args...);
+
+    dst.m_impl_offset = dst_offset_type(src.m_impl_offset, extents);
+    dst.m_impl_handle = dst_handle_type(
+        src.m_impl_handle +
+        src.m_impl_offset(extents.domain_offset(0), extents.domain_offset(1),
+                          extents.domain_offset(2), extents.domain_offset(3),
+                          extents.domain_offset(4), extents.domain_offset(5),
+                          extents.domain_offset(6), extents.domain_offset(7)));
+  }
+};
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP */
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b9e32a04e09afcf1a5fcbeba0bd81257631f7714
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@@ -0,0 +1,304 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2.
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_IMPL_VIEW_CTOR_PROP_HPP +#define KOKKOS_EXPERIMENTAL_IMPL_VIEW_CTOR_PROP_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +struct WithoutInitializing_t {}; +struct AllowPadding_t {}; +struct NullSpace_t {}; + +//---------------------------------------------------------------------------- +/**\brief Whether a type can be used for a view label */ + +template <typename> +struct is_view_label : public std::false_type {}; + +template <> +struct is_view_label<std::string> : public std::true_type {}; + +template <unsigned N> +struct is_view_label<char[N]> : public std::true_type {}; + +template <unsigned N> +struct is_view_label<const char[N]> : public std::true_type {}; + +//---------------------------------------------------------------------------- + +template <typename... 
P> +struct ViewCtorProp; + +// Forward declare +template <typename Specialize, typename T> +struct CommonViewAllocProp; + +/* Common value_type stored as ViewCtorProp + */ +template <typename Specialize, typename T> +struct ViewCtorProp<void, CommonViewAllocProp<Specialize, T> > { + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp &operator=(const ViewCtorProp &) = default; + + using type = CommonViewAllocProp<Specialize, T>; + + KOKKOS_INLINE_FUNCTION + ViewCtorProp(const type &arg) : value(arg) {} + KOKKOS_INLINE_FUNCTION + ViewCtorProp(type &&arg) : value(arg) {} + + type value; +}; + +/* std::integral_constant<unsigned,I> are dummy arguments + * that avoid duplicate base class errors + */ +template <unsigned I> +struct ViewCtorProp<void, std::integral_constant<unsigned, I> > { + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp &operator=(const ViewCtorProp &) = default; + + template <typename P> + KOKKOS_INLINE_FUNCTION ViewCtorProp(const P &) {} +}; + +/* Property flags have constexpr value */ +template <typename P> +struct ViewCtorProp<typename std::enable_if< + std::is_same<P, AllowPadding_t>::value || + std::is_same<P, WithoutInitializing_t>::value>::type, + P> { + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp &operator=(const ViewCtorProp &) = default; + + using type = P; + + ViewCtorProp(const type &) {} + + static constexpr type value = type(); +}; + +/* Map input label type to std::string */ +template <typename Label> +struct ViewCtorProp<typename std::enable_if<is_view_label<Label>::value>::type, + Label> { + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp &operator=(const ViewCtorProp &) = default; + + using type = std::string; + + ViewCtorProp(const type &arg) : value(arg) {} + ViewCtorProp(type &&arg) : value(arg) {} + + type value; +}; + +template <typename Space> +struct 
 ViewCtorProp<typename std::enable_if<
                        Kokkos::Impl::is_memory_space<Space>::value ||
                        Kokkos::Impl::is_execution_space<Space>::value>::type,
                    Space> {
+  // Stores an execution- or memory-space instance passed to view_alloc.
+  ViewCtorProp()                    = default;
+  ViewCtorProp(const ViewCtorProp &) = default;
+  ViewCtorProp &operator=(const ViewCtorProp &) = default;
+
+  using type = Space;
+
+  ViewCtorProp(const type &arg) : value(arg) {}
+
+  type value;
+};
+
+// Stores a user-provided raw pointer (wrapping an existing allocation).
+template <typename T>
+struct ViewCtorProp<void, T *> {
+  ViewCtorProp()                    = default;
+  ViewCtorProp(const ViewCtorProp &) = default;
+  ViewCtorProp &operator=(const ViewCtorProp &) = default;
+
+  using type = T *;
+
+  KOKKOS_INLINE_FUNCTION
+  ViewCtorProp(const type arg) : value(arg) {}
+
+  type value;
+};
+
+// For some reason I don't understand I needed this specialization explicitly
+// for NVCC/MSVC
+template <typename T>
+struct ViewCtorProp<T *> {
+  ViewCtorProp()                    = default;
+  ViewCtorProp(const ViewCtorProp &) = default;
+  ViewCtorProp &operator=(const ViewCtorProp &) = default;
+
+  using type = T *;
+
+  KOKKOS_INLINE_FUNCTION
+  ViewCtorProp(const type arg) : value(arg) {}
+
+  enum : bool { has_pointer = true };
+  using pointer_type = type;
+  type value;
+};
+
+// If we use `ViewCtorProp<Args...>` and `ViewCtorProp<void, Args>...` directly
+// in the parameter lists and base class initializers, respectively, as far as
+// we can tell MSVC 16.5.5+CUDA 10.2 thinks that `ViewCtorProp` refers to the
+// current instantiation, not the template itself, and gets all kinds of
+// confused. To work around this, we just use a couple of alias templates that
+// amount to the same thing.
+template <typename... Args>
+using view_ctor_prop_args = ViewCtorProp<Args...>;
+
+template <typename Arg>
+using view_ctor_prop_base = ViewCtorProp<void, Arg>;
+
+// Primary property pack: inherits one ViewCtorProp<void, P> base per
+// property, so each property's `value` is reachable via static_cast to the
+// matching base.
+template <typename... P>
+struct ViewCtorProp : public ViewCtorProp<void, P>... {
+ private:
+  using var_memory_space =
+      Kokkos::Impl::has_condition<void, Kokkos::Impl::is_memory_space, P...>;
+
+  using var_execution_space =
+      Kokkos::Impl::has_condition<void, Kokkos::Impl::is_execution_space, P...>;
+
+  struct VOIDDUMMY {};
+
+  using var_pointer =
+      Kokkos::Impl::has_condition<VOIDDUMMY, std::is_pointer, P...>;
+
+ public:
+  /* Flags for the common properties */
+  enum { has_memory_space = var_memory_space::value };
+  enum { has_execution_space = var_execution_space::value };
+  enum { has_pointer = var_pointer::value };
+  enum { has_label = Kokkos::Impl::has_type<std::string, P...>::value };
+  enum { allow_padding = Kokkos::Impl::has_type<AllowPadding_t, P...>::value };
+  enum {
+    initialize = !Kokkos::Impl::has_type<WithoutInitializing_t, P...>::value
+  };
+
+  using memory_space    = typename var_memory_space::type;
+  using execution_space = typename var_execution_space::type;
+  using pointer_type    = typename var_pointer::type;
+
+  /* Copy from a matching argument list.
+   * Requires std::is_same< P , ViewCtorProp< void , Args >::value ...
+   */
+  template <typename... Args>
+  inline ViewCtorProp(Args const &... args) : ViewCtorProp<void, P>(args)... {}
+
+  // Pointer-first overload: routes arg0 to the pointer base and the
+  // remaining args to the bases matching their mapped property types.
+  template <typename... Args>
+  KOKKOS_INLINE_FUNCTION ViewCtorProp(pointer_type arg0, Args const &... args)
+      : ViewCtorProp<void, pointer_type>(arg0),
+        ViewCtorProp<void, typename ViewCtorProp<void, Args>::type>(args)... {}
+
+  /* Copy from a matching property subset */
+  KOKKOS_INLINE_FUNCTION ViewCtorProp(pointer_type arg0)
+      : ViewCtorProp<void, pointer_type>(arg0) {}
+
+  // If we use `ViewCtorProp<Args...>` and `ViewCtorProp<void, Args>...` here
+  // directly, MSVC 16.5.5+CUDA 10.2 appears to think that `ViewCtorProp`
+  // refers to the current instantiation, not the template itself, and gets
+  // all kinds of confused. To work around this, we just use a couple of alias
+  // templates that amount to the same thing.
+  template <typename...
Args>
+  // Copy/convert from another ViewCtorProp whose property set is a superset;
+  // each base is initialized from the corresponding base of `arg`.
+  ViewCtorProp(view_ctor_prop_args<Args...> const &arg)
+      : view_ctor_prop_base<Args>(
+            static_cast<view_ctor_prop_base<Args> const &>(arg))... {
+    // Suppress an unused argument warning that (at least at one point) would
+    // show up if sizeof...(Args) == 0
+    (void)arg;
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/* For backward compatibility */
+namespace Impl {
+struct ViewAllocateWithoutInitializingBackwardCompat {};
+
+template <>
+struct ViewCtorProp<void, ViewAllocateWithoutInitializingBackwardCompat> {};
+
+// NOTE This specialization is meant to be used as the
+// ViewAllocateWithoutInitializing alias below. All it does is add a
+// constructor that takes the label as single argument.
+template <>
+struct ViewCtorProp<WithoutInitializing_t, std::string,
+                    ViewAllocateWithoutInitializingBackwardCompat>
+    : ViewCtorProp<WithoutInitializing_t, std::string>,
+      ViewCtorProp<void, ViewAllocateWithoutInitializingBackwardCompat> {
+  ViewCtorProp(std::string label)
+      : ViewCtorProp<WithoutInitializing_t, std::string>(
+            WithoutInitializing_t(), std::move(label)) {}
+};
+} /* namespace Impl */
+
+/* [[deprecated("Use Kokkos::alloc(Kokkos::WithoutInitializing, label) instead")]] */
+using ViewAllocateWithoutInitializing =
+    Impl::ViewCtorProp<Impl::WithoutInitializing_t, std::string,
+                       Impl::ViewAllocateWithoutInitializingBackwardCompat>;
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp
new file mode 100644
index
0000000000000000000000000000000000000000..6915622352e47d25efa34ae687f3e4f190150974 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_ViewLayoutTiled.hpp @@ -0,0 +1,1452 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP
+#define KOKKOS_EXPERIMENTAL_VIEWLAYOUTTILE_HPP
+
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_View.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// View offset and mapping for tiled views
+
+// Register each rank's LayoutTiled instantiation (unused tile extents are 0,
+// last parameter "true" = power-of-two tiles) as a valid array layout.
+
+template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
+          unsigned ArgN1>
+struct is_array_layout<Kokkos::Experimental::LayoutTiled<
+    OuterP, InnerP, ArgN0, ArgN1, 0, 0, 0, 0, 0, 0, true> >
+    : public std::true_type {};
+
+template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
+          unsigned ArgN1, unsigned ArgN2>
+struct is_array_layout<Kokkos::Experimental::LayoutTiled<
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, 0, 0, 0, 0, 0, true> >
+    : public std::true_type {};
+
+template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
+          unsigned ArgN1, unsigned ArgN2, unsigned ArgN3>
+struct is_array_layout<Kokkos::Experimental::LayoutTiled<
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, 0, 0, 0, 0, true> >
+    : public std::true_type {};
+
+template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
+          unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4>
+struct is_array_layout<Kokkos::Experimental::LayoutTiled<
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, 0, 0, 0, true> >
+    : public std::true_type {};
+
+template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
+          unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4,
+          unsigned ArgN5>
+struct is_array_layout<Kokkos::Experimental::LayoutTiled<
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, 0, 0, true> >
+    : public std::true_type {};
+
+template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
+          unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4,
+          unsigned ArgN5, unsigned ArgN6>
+struct is_array_layout<Kokkos::Experimental::LayoutTiled<
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, 0, true> >
+    : public std::true_type {};
+
+template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
+          unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4,
+          unsigned ArgN5, unsigned ArgN6, unsigned ArgN7>
+struct is_array_layout<
+    Kokkos::Experimental::LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2,
+                                      ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> >
+    : public std::true_type {};
+
+template <class L>
+struct is_array_layout_tiled : public std::false_type {};
+
+template <Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, unsigned ArgN0,
+          unsigned ArgN1, unsigned ArgN2, unsigned ArgN3, unsigned ArgN4,
+          unsigned ArgN5, unsigned ArgN6, unsigned ArgN7, bool IsPowerTwo>
+struct is_array_layout_tiled<Kokkos::Experimental::LayoutTiled<
+    OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7,
+    IsPowerTwo> > : public std::true_type {
+};  // Last template parameter "true" meaning this currently only supports
+    // powers-of-two
+
+namespace Impl {
+
+// Offset map for tiled layouts, rank 2..8.  Tile extents are powers of two,
+// so index decomposition uses shifts (SHIFT_k) and masks rather than
+// division/modulo.
+template <class Dimension, class Layout>
+struct ViewOffset<
+    Dimension, Layout,
+    typename std::enable_if<((Dimension::rank <= 8) && (Dimension::rank >= 2) &&
+                             is_array_layout<Layout>::value &&
+                             is_array_layout_tiled<Layout>::value)>::type> {
+ public:
+  static constexpr Kokkos::Iterate outer_pattern = Layout::outer_pattern;
+  static constexpr Kokkos::Iterate inner_pattern = Layout::inner_pattern;
+
+  static constexpr int VORank = Dimension::rank;
+
+  // log2 of each tile extent (tile extents are powers of two).
+  static constexpr unsigned SHIFT_0 =
+      Kokkos::Impl::integral_power_of_two(Layout::N0);
+  static constexpr unsigned SHIFT_1 =
+      Kokkos::Impl::integral_power_of_two(Layout::N1);
+  static constexpr unsigned SHIFT_2 =
+      Kokkos::Impl::integral_power_of_two(Layout::N2);
static constexpr unsigned SHIFT_3 = + Kokkos::Impl::integral_power_of_two(Layout::N3); + static constexpr unsigned SHIFT_4 = + Kokkos::Impl::integral_power_of_two(Layout::N4); + static constexpr unsigned SHIFT_5 = + Kokkos::Impl::integral_power_of_two(Layout::N5); + static constexpr unsigned SHIFT_6 = + Kokkos::Impl::integral_power_of_two(Layout::N6); + static constexpr unsigned SHIFT_7 = + Kokkos::Impl::integral_power_of_two(Layout::N7); + static constexpr int MASK_0 = Layout::N0 - 1; + static constexpr int MASK_1 = Layout::N1 - 1; + static constexpr int MASK_2 = Layout::N2 - 1; + static constexpr int MASK_3 = Layout::N3 - 1; + static constexpr int MASK_4 = Layout::N4 - 1; + static constexpr int MASK_5 = Layout::N5 - 1; + static constexpr int MASK_6 = Layout::N6 - 1; + static constexpr int MASK_7 = Layout::N7 - 1; + + static constexpr unsigned SHIFT_2T = SHIFT_0 + SHIFT_1; + static constexpr unsigned SHIFT_3T = SHIFT_0 + SHIFT_1 + SHIFT_2; + static constexpr unsigned SHIFT_4T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3; + static constexpr unsigned SHIFT_5T = + SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4; + static constexpr unsigned SHIFT_6T = + SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5; + static constexpr unsigned SHIFT_7T = + SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5 + SHIFT_6; + static constexpr unsigned SHIFT_8T = SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + + SHIFT_4 + SHIFT_5 + SHIFT_6 + SHIFT_7; + + // Is an irregular layout that does not have uniform striding for each index. 
+ using is_mapping_plugin = std::true_type; + using is_regular = std::false_type; + + using size_type = size_t; + using dimension_type = Dimension; + using array_layout = Layout; + + dimension_type m_dim; + size_type m_tile_N0; // Num tiles dim 0 + size_type m_tile_N1; + size_type m_tile_N2; + size_type m_tile_N3; + size_type m_tile_N4; + size_type m_tile_N5; + size_type m_tile_N6; + size_type m_tile_N7; + + //---------------------------------------- + +#define DEBUG_OUTPUT_CHECK 0 + + // Rank 2 + template <typename I0, typename I1> + KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, + I1 const& i1) const { + auto tile_offset = + (outer_pattern == (Kokkos::Iterate::Left)) + ? (((i0 >> SHIFT_0) + m_tile_N0 * ((i1 >> SHIFT_1))) << SHIFT_2T) + : (((m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1))) << SHIFT_2T); + // ( num_tiles[1] * ti0 + ti1 ) * FTD + + auto local_offset = (inner_pattern == (Kokkos::Iterate::Left)) + ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0)) + : (((i0 & MASK_0) << SHIFT_1) + (i1 & MASK_1)); + // ( tile_dim[1] * li0 + li1 ) + +#if DEBUG_OUTPUT_CHECK + std::cout << "Am I Outer Left? " + << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; + std::cout << "Am I Inner Left? " + << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; + std::cout << "i0 = " << i0 << " i1 = " << i1 + << "\ntilei0 = " << (i0 >> SHIFT_0) + << " tilei1 = " << (i1 >> SHIFT_1) + << "locali0 = " << (i0 & MASK_0) + << "\nlocali1 = " << (i1 & MASK_1) << std::endl; +#endif + + return tile_offset + local_offset; + } + + // Rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, + I2 const& i2) const { + auto tile_offset = + (outer_pattern == Kokkos::Iterate::Left) + ? 
(((i0 >> SHIFT_0) + + m_tile_N0 * ((i1 >> SHIFT_1) + m_tile_N1 * (i2 >> SHIFT_2))) + << SHIFT_3T) + : ((m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + (i1 >> SHIFT_1)) + + (i2 >> SHIFT_2)) + << SHIFT_3T); + + auto local_offset = (inner_pattern == Kokkos::Iterate::Left) + ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + + ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1))) + : (((i0 & MASK_0) << (SHIFT_2 + SHIFT_1)) + + ((i1 & MASK_1) << (SHIFT_2)) + (i2 & MASK_2)); + +#if DEBUG_OUTPUT_CHECK + std::cout << "Am I Outer Left? " + << (outer_pattern == (Kokkos::Iterate::Left)) << std::endl; + std::cout << "Am I Inner Left? " + << (inner_pattern == (Kokkos::Iterate::Left)) << std::endl; + std::cout << "i0 = " << i0 << " i1 = " << i1 << " i2 = " << i2 + << "\ntilei0 = " << (i0 >> SHIFT_0) + << " tilei1 = " << (i1 >> SHIFT_1) + << " tilei2 = " << (i2 >> SHIFT_2) + << "\nlocali0 = " << (i0 & MASK_0) + << "locali1 = " << (i1 & MASK_1) << "locali2 = " << (i2 & MASK_2) + << std::endl; +#endif + + return tile_offset + local_offset; + } + + // Rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, + I2 const& i2, + I3 const& i3) const { + auto tile_offset = + (outer_pattern == Kokkos::Iterate::Left) + ? (((i0 >> SHIFT_0) + + m_tile_N0 * ((i1 >> SHIFT_1) + + m_tile_N1 * ((i2 >> SHIFT_2) + + m_tile_N2 * (i3 >> SHIFT_3)))) + << SHIFT_4T) + : ((m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + + (i1 >> SHIFT_1)) + + (i2 >> SHIFT_2)) + + (i3 >> SHIFT_3)) + << SHIFT_4T); + + auto local_offset = + (inner_pattern == Kokkos::Iterate::Left) + ? 
((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + + ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + + ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2))) + : (((i0 & MASK_0) << (SHIFT_3 + SHIFT_2 + SHIFT_1)) + + ((i1 & MASK_1) << (SHIFT_3 + SHIFT_2)) + + ((i2 & MASK_2) << (SHIFT_3)) + (i3 & MASK_3)); + + return tile_offset + local_offset; + } + + // Rank 5 + template <typename I0, typename I1, typename I2, typename I3, typename I4> + KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, + I2 const& i2, I3 const& i3, + I4 const& i4) const { + auto tile_offset = + (outer_pattern == Kokkos::Iterate::Left) + ? (((i0 >> SHIFT_0) + + m_tile_N0 * + ((i1 >> SHIFT_1) + + m_tile_N1 * ((i2 >> SHIFT_2) + + m_tile_N2 * ((i3 >> SHIFT_3) + + m_tile_N3 * (i4 >> SHIFT_4))))) + << SHIFT_5T) + : ((m_tile_N4 * + (m_tile_N3 * (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + + (i1 >> SHIFT_1)) + + (i2 >> SHIFT_2)) + + (i3 >> SHIFT_3)) + + (i4 >> SHIFT_4)) + << SHIFT_5T); + + auto local_offset = + (inner_pattern == Kokkos::Iterate::Left) + ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + + ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + + ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + + ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3))) + : (((i0 & MASK_0) << (SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + + ((i1 & MASK_1) << (SHIFT_4 + SHIFT_3 + SHIFT_2)) + + ((i2 & MASK_2) << (SHIFT_4 + SHIFT_3)) + + ((i3 & MASK_3) << (SHIFT_4)) + (i4 & MASK_4)); + + return tile_offset + local_offset; + } + + // Rank 6 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, + I2 const& i2, I3 const& i3, + I4 const& i4, + I5 const& i5) const { + auto tile_offset = + (outer_pattern == Kokkos::Iterate::Left) + ? 
(((i0 >> SHIFT_0) + + m_tile_N0 * + ((i1 >> SHIFT_1) + + m_tile_N1 * + ((i2 >> SHIFT_2) + + m_tile_N2 * + ((i3 >> SHIFT_3) + + m_tile_N3 * ((i4 >> SHIFT_4) + + m_tile_N4 * (i5 >> SHIFT_5)))))) + << SHIFT_6T) + : ((m_tile_N5 * + (m_tile_N4 * + (m_tile_N3 * + (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + + (i1 >> SHIFT_1)) + + (i2 >> SHIFT_2)) + + (i3 >> SHIFT_3)) + + (i4 >> SHIFT_4)) + + (i5 >> SHIFT_5)) + << SHIFT_6T); + + auto local_offset = + (inner_pattern == Kokkos::Iterate::Left) + ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + + ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + + ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + + ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + + ((i5 & MASK_5) + << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4))) + : (((i0 & MASK_0) + << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2 + SHIFT_1)) + + ((i1 & MASK_1) << (SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + + ((i2 & MASK_2) << (SHIFT_5 + SHIFT_4 + SHIFT_3)) + + ((i3 & MASK_3) << (SHIFT_5 + SHIFT_4)) + + ((i4 & MASK_4) << (SHIFT_5)) + (i5 & MASK_5)); + + return tile_offset + local_offset; + } + + // Rank 7 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, + I2 const& i2, I3 const& i3, + I4 const& i4, I5 const& i5, + I6 const& i6) const { + auto tile_offset = + (outer_pattern == Kokkos::Iterate::Left) + ? 
(((i0 >> SHIFT_0) + + m_tile_N0 * + ((i1 >> SHIFT_1) + + m_tile_N1 * + ((i2 >> SHIFT_2) + + m_tile_N2 * + ((i3 >> SHIFT_3) + + m_tile_N3 * + ((i4 >> SHIFT_4) + + m_tile_N4 * + ((i5 >> SHIFT_5) + + m_tile_N5 * (i6 >> SHIFT_6))))))) + << SHIFT_7T) + : ((m_tile_N6 * + (m_tile_N5 * + (m_tile_N4 * + (m_tile_N3 * + (m_tile_N2 * (m_tile_N1 * (i0 >> SHIFT_0) + + (i1 >> SHIFT_1)) + + (i2 >> SHIFT_2)) + + (i3 >> SHIFT_3)) + + (i4 >> SHIFT_4)) + + (i5 >> SHIFT_5)) + + (i6 >> SHIFT_6)) + << SHIFT_7T); + + auto local_offset = + (inner_pattern == Kokkos::Iterate::Left) + ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + + ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + + ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + + ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + + ((i5 & MASK_5) + << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + + ((i6 & MASK_6) + << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4 + SHIFT_5))) + : (((i0 & MASK_0) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + + SHIFT_2 + SHIFT_1)) + + ((i1 & MASK_1) + << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3 + SHIFT_2)) + + ((i2 & MASK_2) << (SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + + ((i3 & MASK_3) << (SHIFT_6 + SHIFT_5 + SHIFT_4)) + + ((i4 & MASK_4) << (SHIFT_6 + SHIFT_5)) + + ((i5 & MASK_5) << (SHIFT_6)) + (i6 & MASK_6)); + + return tile_offset + local_offset; + } + + // Rank 8 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0, I1 const& i1, + I2 const& i2, I3 const& i3, + I4 const& i4, I5 const& i5, + I6 const& i6, + I7 const& i7) const { + auto tile_offset = + (outer_pattern == Kokkos::Iterate::Left) + ? 
(((i0 >> SHIFT_0) + + m_tile_N0 * + ((i1 >> SHIFT_1) + + m_tile_N1 * + ((i2 >> SHIFT_2) + + m_tile_N2 * + ((i3 >> SHIFT_3) + + m_tile_N3 * + ((i4 >> SHIFT_4) + + m_tile_N4 * + ((i5 >> SHIFT_5) + + m_tile_N5 * + ((i6 >> SHIFT_6) + + m_tile_N6 * (i7 >> SHIFT_7)))))))) + << SHIFT_8T) + : ((m_tile_N7 * + (m_tile_N6 * + (m_tile_N5 * + (m_tile_N4 * + (m_tile_N3 * + (m_tile_N2 * + (m_tile_N1 * (i0 >> SHIFT_0) + + (i1 >> SHIFT_1)) + + (i2 >> SHIFT_2)) + + (i3 >> SHIFT_3)) + + (i4 >> SHIFT_4)) + + (i5 >> SHIFT_5)) + + (i6 >> SHIFT_6)) + + (i7 >> SHIFT_7)) + << SHIFT_8T); + + auto local_offset = + (inner_pattern == Kokkos::Iterate::Left) + ? ((i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0) + + ((i2 & MASK_2) << (SHIFT_0 + SHIFT_1)) + + ((i3 & MASK_3) << (SHIFT_0 + SHIFT_1 + SHIFT_2)) + + ((i4 & MASK_4) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3)) + + ((i5 & MASK_5) + << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + SHIFT_4)) + + ((i6 & MASK_6) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + + SHIFT_4 + SHIFT_5)) + + ((i7 & MASK_7) << (SHIFT_0 + SHIFT_1 + SHIFT_2 + SHIFT_3 + + SHIFT_4 + SHIFT_5 + SHIFT_6))) + : (((i0 & MASK_0) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + + SHIFT_3 + SHIFT_2 + SHIFT_1)) + + ((i1 & MASK_1) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + + SHIFT_3 + SHIFT_2)) + + ((i2 & MASK_2) + << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4 + SHIFT_3)) + + ((i3 & MASK_3) << (SHIFT_7 + SHIFT_6 + SHIFT_5 + SHIFT_4)) + + ((i4 & MASK_4) << (SHIFT_7 + SHIFT_6 + SHIFT_5)) + + ((i5 & MASK_5) << (SHIFT_7 + SHIFT_6)) + + ((i6 & MASK_6) << (SHIFT_7)) + (i7 & MASK_7)); + + return tile_offset + local_offset; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { + return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N2, m_dim.N3, + m_dim.N4, m_dim.N5, m_dim.N6, m_dim.N7); + } + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { + return m_dim.N0; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { 
+ return m_dim.N1; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { + return m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { + return m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { + return m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { + return m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { + return m_dim.N6; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { + return m_dim.N7; + } + + KOKKOS_INLINE_FUNCTION constexpr size_type size() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6 * m_dim.N7; + } + + // Strides are meaningless due to irregularity + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return 0; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 0; } + + // Stride with [ rank ] value is the total length + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[0] = 0; + if (0 < dimension_type::rank) { + s[1] = 0; + } + if (1 < dimension_type::rank) { + s[2] = 0; + } + if (2 < dimension_type::rank) { + s[3] = 0; + } + if (3 < dimension_type::rank) { + s[4] = 0; + } + if (4 < dimension_type::rank) { + s[5] = 0; + } + if (5 < dimension_type::rank) { + s[6] = 0; + } + if (6 < dimension_type::rank) { + s[7] = 0; + } + if (7 < dimension_type::rank) { + s[8] = 0; + } + } + + KOKKOS_INLINE_FUNCTION constexpr size_type span() 
const { + // Rank2: ( NumTile0 * ( NumTile1 ) ) * TileSize, etc + return (VORank == 2) + ? (m_tile_N0 * m_tile_N1) << SHIFT_2T + : (VORank == 3) + ? (m_tile_N0 * m_tile_N1 * m_tile_N2) << SHIFT_3T + : (VORank == 4) + ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * m_tile_N3) + << SHIFT_4T + : (VORank == 5) + ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * + m_tile_N3 * m_tile_N4) + << SHIFT_5T + : (VORank == 6) + ? (m_tile_N0 * m_tile_N1 * m_tile_N2 * + m_tile_N3 * m_tile_N4 * m_tile_N5) + << SHIFT_6T + : (VORank == 7) + ? (m_tile_N0 * m_tile_N1 * + m_tile_N2 * m_tile_N3 * + m_tile_N4 * m_tile_N5 * + m_tile_N6) + << SHIFT_7T + : (m_tile_N0 * m_tile_N1 * + m_tile_N2 * m_tile_N3 * + m_tile_N4 * m_tile_N5 * + m_tile_N6 * m_tile_N7) + << SHIFT_8T; + } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { + return true; + } + + //---------------------------------------- +#ifdef KOKKOS_IMPL_WINDOWS_CUDA + KOKKOS_FUNCTION ViewOffset() {} + KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { + m_dim = src.m_dim; + m_tile_N0 = src.m_tile_N0; + m_tile_N1 = src.m_tile_N1; + m_tile_N2 = src.m_tile_N2; + m_tile_N3 = src.m_tile_N3; + m_tile_N4 = src.m_tile_N4; + m_tile_N5 = src.m_tile_N5; + m_tile_N6 = src.m_tile_N6; + m_tile_N7 = src.m_tile_N7; + } + KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { + m_dim = src.m_dim; + m_tile_N0 = src.m_tile_N0; + m_tile_N1 = src.m_tile_N1; + m_tile_N2 = src.m_tile_N2; + m_tile_N3 = src.m_tile_N3; + m_tile_N4 = src.m_tile_N4; + m_tile_N5 = src.m_tile_N5; + m_tile_N6 = src.m_tile_N6; + m_tile_N7 = src.m_tile_N7; + return *this; + } +#else + KOKKOS_DEFAULTED_FUNCTION ~ViewOffset() = default; + KOKKOS_DEFAULTED_FUNCTION ViewOffset() = default; + KOKKOS_DEFAULTED_FUNCTION ViewOffset(const ViewOffset&) = default; + KOKKOS_DEFAULTED_FUNCTION ViewOffset& operator=(const ViewOffset&) = default; +#endif + + template <unsigned TrivialScalarSize> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + std::integral_constant<unsigned, 
TrivialScalarSize> const&, + array_layout const arg_layout) + : m_dim(arg_layout.dimension[0], arg_layout.dimension[1], + arg_layout.dimension[2], arg_layout.dimension[3], + arg_layout.dimension[4], arg_layout.dimension[5], + arg_layout.dimension[6], arg_layout.dimension[7]), + m_tile_N0((arg_layout.dimension[0] + MASK_0) >> + SHIFT_0 /* number of tiles in first dimension */), + m_tile_N1((arg_layout.dimension[1] + MASK_1) >> SHIFT_1), + m_tile_N2((VORank > 2) ? (arg_layout.dimension[2] + MASK_2) >> SHIFT_2 + : 0), + m_tile_N3((VORank > 3) ? (arg_layout.dimension[3] + MASK_3) >> SHIFT_3 + : 0), + m_tile_N4((VORank > 4) ? (arg_layout.dimension[4] + MASK_4) >> SHIFT_4 + : 0), + m_tile_N5((VORank > 5) ? (arg_layout.dimension[5] + MASK_5) >> SHIFT_5 + : 0), + m_tile_N6((VORank > 6) ? (arg_layout.dimension[6] + MASK_6) >> SHIFT_6 + : 0), + m_tile_N7((VORank > 7) ? (arg_layout.dimension[7] + MASK_7) >> SHIFT_7 + : 0) {} +}; + +// FIXME Remove the out-of-class definitions when we require C++17 +#define KOKKOS_ITERATE_VIEW_OFFSET_ENABLE \ + typename std::enable_if<((Dimension::rank <= 8) && (Dimension::rank >= 2) && \ + is_array_layout<Layout>::value && \ + is_array_layout_tiled<Layout>::value)>::type +template <class Dimension, class Layout> +constexpr Kokkos::Iterate ViewOffset< + Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::outer_pattern; +template <class Dimension, class Layout> +constexpr Kokkos::Iterate ViewOffset< + Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::inner_pattern; +template <class Dimension, class Layout> +constexpr int + ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::VORank; +template <class Dimension, class Layout> +constexpr unsigned + ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_0; +template <class Dimension, class Layout> +constexpr unsigned + ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_1; +template <class Dimension, class Layout> +constexpr unsigned + 
// Out-of-class definitions for the remaining static constexpr data members
// of ViewOffset (required pre-C++17 whenever the members are odr-used; in
// C++17 static constexpr members are implicitly inline).
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_2;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_3;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_4;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_5;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_6;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_7;
template <class Dimension, class Layout>
constexpr int
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_0;
template <class Dimension, class Layout>
constexpr int
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_1;
template <class Dimension, class Layout>
constexpr int
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_2;
template <class Dimension, class Layout>
constexpr int
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_3;
template <class Dimension, class Layout>
constexpr int
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_4;
template <class Dimension, class Layout>
constexpr int
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_5;
template <class Dimension, class Layout>
constexpr int
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_6;
template <class Dimension, class Layout>
constexpr int
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::MASK_7;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_2T;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_3T;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_4T;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_5T;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_6T;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_7T;
template <class Dimension, class Layout>
constexpr unsigned
    ViewOffset<Dimension, Layout, KOKKOS_ITERATE_VIEW_OFFSET_ENABLE>::SHIFT_8T;
#undef KOKKOS_ITERATE_VIEW_OFFSET_ENABLE

//----------------------------------------

// ViewMapping assign method needed in order to return a 'subview' tile as a
// proper View.  The outer iteration pattern determines the mapping of the
// pointer offset to the beginning of the requested tile.  The inner
// iteration pattern determines the layout of the tile's View to be returned.
// Rank 2
template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
          unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
          unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
          typename iType1>
class ViewMapping<
    typename std::enable_if<(N2 == 0 && N3 == 0 && N4 == 0 && N5 == 0 &&
                             N6 == 0 && N7 == 0)>::type  // void
    ,
    Kokkos::ViewTraits<
        T**,
        Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4,
                                          N5, N6, N7, true>,
        P...>,
    Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                      N6, N7, true>,
    iType0, iType1> {
 public:
  using src_layout =
      Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                        N6, N7, true>;
  using src_traits = Kokkos::ViewTraits<T**, src_layout, P...>;

  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
  // The returned tile View uses LayoutLeft/LayoutRight matching the tile's
  // inner iteration pattern; its extents are the compile-time tile extents.
  using array_layout =
      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                Kokkos::LayoutRight>::type;
  using traits = Kokkos::ViewTraits<T[N0][N1], array_layout, P...>;
  using type   = Kokkos::View<T[N0][N1], array_layout, P...>;

  // Point 'dst' at the start of tile (i_tile0, i_tile1) within 'src'.
  // The tile's linear index (ordered per the outer pattern) is scaled by the
  // tile footprint via the left-shift by SHIFT_2T.
  KOKKOS_INLINE_FUNCTION static void assign(
      ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src,
      const src_layout&, const iType0 i_tile0, const iType1 i_tile1) {
    using dst_map_type    = ViewMapping<traits, void>;
    using src_map_type    = ViewMapping<src_traits, void>;
    using dst_handle_type = typename dst_map_type::handle_type;
    using dst_offset_type = typename dst_map_type::offset_type;
    using src_offset_type = typename src_map_type::offset_type;

    dst = dst_map_type(
        dst_handle_type(
            src.m_impl_handle +
            (is_outer_left ? ((i_tile0 + src.m_impl_offset.m_tile_N0 * i_tile1)
                              << src_offset_type::SHIFT_2T)
                           : ((src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1)
                              << src_offset_type::SHIFT_2T))  // offset to start
                                                              // of the tile
            ),
        dst_offset_type());
  }
};

// Rank 3
template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
          unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
          unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
          typename iType1, typename iType2>
class ViewMapping<typename std::enable_if<(N3 == 0 && N4 == 0 && N5 == 0 &&
                                           N6 == 0 && N7 == 0)>::type  // void
                  ,
                  Kokkos::ViewTraits<
                      T***,
                      Kokkos::Experimental::LayoutTiled<
                          OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
                      P...>,
                  Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
                                                    N3, N4, N5, N6, N7, true>,
                  iType0, iType1, iType2> {
 public:
  using src_layout =
      Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                        N6, N7, true>;
  using src_traits = Kokkos::ViewTraits<T***, src_layout, P...>;

  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
  using array_layout =
      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                Kokkos::LayoutRight>::type;
  using traits = Kokkos::ViewTraits<T[N0][N1][N2], array_layout, P...>;
  using type   = Kokkos::View<T[N0][N1][N2], array_layout, P...>;

  // Point 'dst' at the start of tile (i_tile0, i_tile1, i_tile2); the tile's
  // linear index per the outer pattern is scaled by the footprint (SHIFT_3T).
  KOKKOS_INLINE_FUNCTION static void assign(
      ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src,
      const src_layout&, const iType0 i_tile0, const iType1 i_tile1,
      const iType2 i_tile2) {
    using dst_map_type    = ViewMapping<traits, void>;
    using src_map_type    = ViewMapping<src_traits, void>;
    using dst_handle_type = typename dst_map_type::handle_type;
    using dst_offset_type = typename dst_map_type::offset_type;
    using src_offset_type = typename src_map_type::offset_type;

    dst = dst_map_type(
        dst_handle_type(
            src.m_impl_handle +
            (is_outer_left
                 ? ((i_tile0 +
                     src.m_impl_offset.m_tile_N0 *
                         (i_tile1 + src.m_impl_offset.m_tile_N1 * i_tile2))
                    << src_offset_type::SHIFT_3T)
                 : ((src.m_impl_offset.m_tile_N2 *
                         (src.m_impl_offset.m_tile_N1 * i_tile0 + i_tile1) +
                     i_tile2)
                    << src_offset_type::SHIFT_3T)))  // offset to start of the
                                                     // tile
        ,
        dst_offset_type());
  }
};

// Rank 4
template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
          unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
          unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
          typename iType1, typename iType2, typename iType3>
class ViewMapping<typename std::enable_if<(N4 == 0 && N5 == 0 && N6 == 0 &&
                                           N7 == 0)>::type  // void
                  ,
                  Kokkos::ViewTraits<
                      T****,
                      Kokkos::Experimental::LayoutTiled<
                          OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>,
                      P...>,
                  Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2,
                                                    N3, N4, N5, N6, N7, true>,
                  iType0, iType1, iType2, iType3> {
 public:
  using src_layout =
      Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                        N6, N7, true>;
  using src_traits = Kokkos::ViewTraits<T****, src_layout, P...>;

  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
  using array_layout =
      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                Kokkos::LayoutRight>::type;
  using traits = Kokkos::ViewTraits<T[N0][N1][N2][N3], array_layout, P...>;
  using type   = Kokkos::View<T[N0][N1][N2][N3], array_layout, P...>;

  // Point 'dst' at the start of the requested rank-4 tile (SHIFT_4T scales
  // the tile's linear index by the tile footprint).
  KOKKOS_INLINE_FUNCTION static void assign(
      ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src,
      const src_layout&, const iType0 i_tile0, const iType1 i_tile1,
      const iType2 i_tile2, const iType3 i_tile3) {
    using dst_map_type    = ViewMapping<traits, void>;
    using src_map_type    = ViewMapping<src_traits, void>;
    using dst_handle_type = typename dst_map_type::handle_type;
    using dst_offset_type = typename dst_map_type::offset_type;
    using src_offset_type = typename src_map_type::offset_type;

    dst = dst_map_type(
        dst_handle_type(
            src.m_impl_handle +
            (is_outer_left
                 ? ((i_tile0 +
                     src.m_impl_offset.m_tile_N0 *
                         (i_tile1 + src.m_impl_offset.m_tile_N1 *
                                        (i_tile2 + src.m_impl_offset.m_tile_N2 *
                                                       i_tile3)))
                    << src_offset_type::SHIFT_4T)
                 : ((src.m_impl_offset.m_tile_N3 *
                         (src.m_impl_offset.m_tile_N2 *
                              (src.m_impl_offset.m_tile_N1 * i_tile0 +
                               i_tile1) +
                          i_tile2) +
                     i_tile3)
                    << src_offset_type::SHIFT_4T)))  // offset to start of the
                                                     // tile
        ,
        dst_offset_type());
  }
};

// Rank 5
template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
          unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4,
          unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0,
          typename iType1, typename iType2, typename iType3, typename iType4>
class ViewMapping<
    typename std::enable_if<(N5 == 0 && N6 == 0 && N7 == 0)>::type  // void
    ,
    Kokkos::ViewTraits<
        T*****,
        Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4,
                                          N5, N6, N7, true>,
        P...>,
    Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                      N6, N7, true>,
    iType0, iType1, iType2, iType3, iType4> {
 public:
  using src_layout =
      Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5,
                                        N6, N7, true>;
  using src_traits = Kokkos::ViewTraits<T*****, src_layout, P...>;

  static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left);
  static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left);
  using array_layout =
      typename std::conditional<is_inner_left, Kokkos::LayoutLeft,
                                Kokkos::LayoutRight>::type;
  using traits = Kokkos::ViewTraits<T[N0][N1][N2][N3][N4], array_layout, P...>;
  using type   = Kokkos::View<T[N0][N1][N2][N3][N4], array_layout, P...>;

  // Point 'dst' at the start of the requested rank-5 tile (SHIFT_5T scales
  // the tile's linear index by the tile footprint).
  KOKKOS_INLINE_FUNCTION static void assign(
      ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src,
      const src_layout&, const iType0 i_tile0, const iType1 i_tile1,
      const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4) {
    using dst_map_type    = ViewMapping<traits, void>;
    using src_map_type    = ViewMapping<src_traits, void>;
    using dst_handle_type = typename dst_map_type::handle_type;
    using dst_offset_type = typename dst_map_type::offset_type;
    using src_offset_type = typename src_map_type::offset_type;

    dst = dst_map_type(
        dst_handle_type(
            src.m_impl_handle +
            (is_outer_left
                 ? ((i_tile0 +
                     src.m_impl_offset.m_tile_N0 *
                         (i_tile1 +
                          src.m_impl_offset.m_tile_N1 *
                              (i_tile2 +
                               src.m_impl_offset.m_tile_N2 *
                                   (i_tile3 +
                                    src.m_impl_offset.m_tile_N3 * i_tile4))))
                    << src_offset_type::SHIFT_5T)
                 : ((src.m_impl_offset.m_tile_N4 *
                         (src.m_impl_offset.m_tile_N3 *
                              (src.m_impl_offset.m_tile_N2 *
                                   (src.m_impl_offset.m_tile_N1 * i_tile0 +
                                    i_tile1) +
                               i_tile2) +
                          i_tile3) +
                     i_tile4)
                    << src_offset_type::SHIFT_5T)))  // offset to start of the
                                                     // tile
        ,
        dst_offset_type());
  }
};
P, typename iType0, + typename iType1, typename iType2, typename iType3, typename iType4, + typename iType5> +class ViewMapping<typename std::enable_if<(N6 == 0 && N7 == 0)>::type // void + , + Kokkos::ViewTraits< + T******, + Kokkos::Experimental::LayoutTiled< + OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, + P...>, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + iType0, iType1, iType2, iType3, iType4, iType5> { + public: + using src_layout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + using src_traits = Kokkos::ViewTraits<T******, src_layout, P...>; + + static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); + static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); + using array_layout = + typename std::conditional<is_inner_left, Kokkos::LayoutLeft, + Kokkos::LayoutRight>::type; + using traits = + Kokkos::ViewTraits<T[N0][N1][N2][N3][N4][N5], array_layout, P...>; + using type = Kokkos::View<T[N0][N1][N2][N3][N4][N5], array_layout, P...>; + + KOKKOS_INLINE_FUNCTION static void assign( + ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src, + const src_layout&, const iType0 i_tile0, const iType1 i_tile1, + const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, + const iType5 i_tile5) { + using dst_map_type = ViewMapping<traits, void>; + using src_map_type = ViewMapping<src_traits, void>; + using dst_handle_type = typename dst_map_type::handle_type; + using dst_offset_type = typename dst_map_type::offset_type; + using src_offset_type = typename src_map_type::offset_type; + + dst = dst_map_type( + dst_handle_type( + src.m_impl_handle + + (is_outer_left + ? 
((i_tile0 + + src.m_impl_offset.m_tile_N0 * + (i_tile1 + + src.m_impl_offset.m_tile_N1 * + (i_tile2 + + src.m_impl_offset.m_tile_N2 * + (i_tile3 + + src.m_impl_offset.m_tile_N3 * + (i_tile4 + src.m_impl_offset.m_tile_N4 * + i_tile5))))) + << src_offset_type::SHIFT_6T) + : ((src.m_impl_offset.m_tile_N5 * + (src.m_impl_offset.m_tile_N4 * + (src.m_impl_offset.m_tile_N3 * + (src.m_impl_offset.m_tile_N2 * + (src.m_impl_offset.m_tile_N1 * i_tile0 + + i_tile1) + + i_tile2) + + i_tile3) + + i_tile4) + + i_tile5) + << src_offset_type::SHIFT_6T))) // offset to start of the + // tile + , + dst_offset_type()); + } +}; + +// Rank 7 +template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, + unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, + unsigned N5, unsigned N6, unsigned N7, class... P, typename iType0, + typename iType1, typename iType2, typename iType3, typename iType4, + typename iType5, typename iType6> +class ViewMapping<typename std::enable_if<(N7 == 0)>::type // void + , + Kokkos::ViewTraits< + T*******, + Kokkos::Experimental::LayoutTiled< + OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, + P...>, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + iType0, iType1, iType2, iType3, iType4, iType5, iType6> { + public: + using src_layout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + using src_traits = Kokkos::ViewTraits<T*******, src_layout, P...>; + + static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); + static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); + using array_layout = + typename std::conditional<is_inner_left, Kokkos::LayoutLeft, + Kokkos::LayoutRight>::type; + using traits = + Kokkos::ViewTraits<T[N0][N1][N2][N3][N4][N5][N6], array_layout, P...>; + using type = Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6], array_layout, P...>; + + KOKKOS_INLINE_FUNCTION static void assign( + 
ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src, + const src_layout&, const iType0 i_tile0, const iType1 i_tile1, + const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, + const iType5 i_tile5, const iType6 i_tile6) { + using dst_map_type = ViewMapping<traits, void>; + using src_map_type = ViewMapping<src_traits, void>; + using dst_handle_type = typename dst_map_type::handle_type; + using dst_offset_type = typename dst_map_type::offset_type; + using src_offset_type = typename src_map_type::offset_type; + + dst = dst_map_type( + dst_handle_type( + src.m_impl_handle + + (is_outer_left + ? ((i_tile0 + + src.m_impl_offset.m_tile_N0 * + (i_tile1 + + src.m_impl_offset.m_tile_N1 * + (i_tile2 + + src.m_impl_offset.m_tile_N2 * + (i_tile3 + + src.m_impl_offset.m_tile_N3 * + (i_tile4 + + src.m_impl_offset.m_tile_N4 * + (i_tile5 + + src.m_impl_offset.m_tile_N5 * + i_tile6)))))) + << src_offset_type::SHIFT_7T) + : ((src.m_impl_offset.m_tile_N6 * + (src.m_impl_offset.m_tile_N5 * + (src.m_impl_offset.m_tile_N4 * + (src.m_impl_offset.m_tile_N3 * + (src.m_impl_offset.m_tile_N2 * + (src.m_impl_offset.m_tile_N1 * + i_tile0 + + i_tile1) + + i_tile2) + + i_tile3) + + i_tile4) + + i_tile5) + + i_tile6) + << src_offset_type::SHIFT_7T))) // offset to start of the + // tile + , + dst_offset_type()); + } +}; + +// Rank 8 +template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, + unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, + unsigned N5, unsigned N6, unsigned N7, class... 
P, typename iType0, + typename iType1, typename iType2, typename iType3, typename iType4, + typename iType5, typename iType6, typename iType7> +class ViewMapping<typename std::enable_if<(N0 != 0 && N1 != 0 && N2 != 0 && + N3 != 0 && N4 != 0 && N5 != 0 && + N6 != 0 && N7 != 0)>::type // void + , + Kokkos::ViewTraits< + T********, + Kokkos::Experimental::LayoutTiled< + OuterP, InnerP, N0, N1, N2, N3, N4, N5, N6, N7, true>, + P...>, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + iType0, iType1, iType2, iType3, iType4, iType5, iType6, + iType7> { + public: + using src_layout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + using src_traits = Kokkos::ViewTraits<T********, src_layout, P...>; + + static constexpr bool is_outer_left = (OuterP == Kokkos::Iterate::Left); + static constexpr bool is_inner_left = (InnerP == Kokkos::Iterate::Left); + using array_layout = + typename std::conditional<is_inner_left, Kokkos::LayoutLeft, + Kokkos::LayoutRight>::type; + using traits = + Kokkos::ViewTraits<T[N0][N1][N2][N3][N4][N5][N6][N7], array_layout, P...>; + using type = + Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6][N7], array_layout, P...>; + + KOKKOS_INLINE_FUNCTION static void assign( + ViewMapping<traits, void>& dst, const ViewMapping<src_traits, void>& src, + const src_layout&, const iType0 i_tile0, const iType1 i_tile1, + const iType2 i_tile2, const iType3 i_tile3, const iType4 i_tile4, + const iType5 i_tile5, const iType6 i_tile6, const iType7 i_tile7) { + using dst_map_type = ViewMapping<traits, void>; + using src_map_type = ViewMapping<src_traits, void>; + using dst_handle_type = typename dst_map_type::handle_type; + using dst_offset_type = typename dst_map_type::offset_type; + using src_offset_type = typename src_map_type::offset_type; + + dst = dst_map_type( + dst_handle_type( + src.m_impl_handle + + (is_outer_left + ? 
((i_tile0 + + src.m_impl_offset.m_tile_N0 * + (i_tile1 + + src.m_impl_offset.m_tile_N1 * + (i_tile2 + + src.m_impl_offset.m_tile_N2 * + (i_tile3 + + src.m_impl_offset.m_tile_N3 * + (i_tile4 + + src.m_impl_offset.m_tile_N4 * + (i_tile5 + + src.m_impl_offset.m_tile_N5 * + (i_tile6 + + src.m_impl_offset.m_tile_N6 * + i_tile7))))))) + << src_offset_type::SHIFT_8T) + : ((src.m_impl_offset.m_tile_N7 * + (src.m_impl_offset.m_tile_N6 * + (src.m_impl_offset.m_tile_N5 * + (src.m_impl_offset.m_tile_N4 * + (src.m_impl_offset.m_tile_N3 * + (src.m_impl_offset.m_tile_N2 * + (src.m_impl_offset.m_tile_N1 * + i_tile0 + + i_tile1) + + i_tile2) + + i_tile3) + + i_tile4) + + i_tile5) + + i_tile6) + + i_tile7) + << src_offset_type::SHIFT_8T))) // offset to start of the + // tile + , + dst_offset_type()); + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------- + +namespace Kokkos { + +// Rank 2 +template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, + unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, + unsigned N5, unsigned N6, unsigned N7, class... P> +KOKKOS_INLINE_FUNCTION Kokkos::View< + T[N0][N1], + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type, + P...> +tile_subview(const Kokkos::View< + T**, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + P...>& src, + const size_t i_tile0, const size_t i_tile1) { + // Force the specialized ViewMapping for extracting a tile + // by using the first subview argument as the layout. 
+ using array_layout = + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type; + using SrcLayout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + + return Kokkos::View<T[N0][N1], array_layout, P...>(src, SrcLayout(), i_tile0, + i_tile1); +} + +// Rank 3 +template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, + unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, + unsigned N5, unsigned N6, unsigned N7, class... P> +KOKKOS_INLINE_FUNCTION Kokkos::View< + T[N0][N1][N2], + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type, + P...> +tile_subview(const Kokkos::View< + T***, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + P...>& src, + const size_t i_tile0, const size_t i_tile1, const size_t i_tile2) { + // Force the specialized ViewMapping for extracting a tile + // by using the first subview argument as the layout. + using array_layout = + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type; + using SrcLayout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + + return Kokkos::View<T[N0][N1][N2], array_layout, P...>( + src, SrcLayout(), i_tile0, i_tile1, i_tile2); +} + +// Rank 4 +template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, + unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, + unsigned N5, unsigned N6, unsigned N7, class... 
P> +KOKKOS_INLINE_FUNCTION Kokkos::View< + T[N0][N1][N2][N3], + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type, + P...> +tile_subview(const Kokkos::View< + T****, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + P...>& src, + const size_t i_tile0, const size_t i_tile1, const size_t i_tile2, + const size_t i_tile3) { + // Force the specialized ViewMapping for extracting a tile + // by using the first subview argument as the layout. + using array_layout = + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type; + using SrcLayout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + + return Kokkos::View<T[N0][N1][N2][N3], array_layout, P...>( + src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3); +} + +// Rank 5 +template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, + unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, + unsigned N5, unsigned N6, unsigned N7, class... P> +KOKKOS_INLINE_FUNCTION Kokkos::View< + T[N0][N1][N2][N3][N4], + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type, + P...> +tile_subview(const Kokkos::View< + T*****, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + P...>& src, + const size_t i_tile0, const size_t i_tile1, const size_t i_tile2, + const size_t i_tile3, const size_t i_tile4) { + // Force the specialized ViewMapping for extracting a tile + // by using the first subview argument as the layout. 
+ using array_layout = + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type; + using SrcLayout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + + return Kokkos::View<T[N0][N1][N2][N3][N4], array_layout, P...>( + src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4); +} + +// Rank 6 +template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, + unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, + unsigned N5, unsigned N6, unsigned N7, class... P> +KOKKOS_INLINE_FUNCTION Kokkos::View< + T[N0][N1][N2][N3][N4][N5], + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type, + P...> +tile_subview(const Kokkos::View< + T******, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + P...>& src, + const size_t i_tile0, const size_t i_tile1, const size_t i_tile2, + const size_t i_tile3, const size_t i_tile4, const size_t i_tile5) { + // Force the specialized ViewMapping for extracting a tile + // by using the first subview argument as the layout. + using array_layout = + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type; + using SrcLayout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + + return Kokkos::View<T[N0][N1][N2][N3][N4][N5], array_layout, P...>( + src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5); +} + +// Rank 7 +template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, + unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, + unsigned N5, unsigned N6, unsigned N7, class... 
P> +KOKKOS_INLINE_FUNCTION Kokkos::View< + T[N0][N1][N2][N3][N4][N5][N6], + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type, + P...> +tile_subview(const Kokkos::View< + T*******, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + P...>& src, + const size_t i_tile0, const size_t i_tile1, const size_t i_tile2, + const size_t i_tile3, const size_t i_tile4, const size_t i_tile5, + const size_t i_tile6) { + // Force the specialized ViewMapping for extracting a tile + // by using the first subview argument as the layout. + using array_layout = + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type; + using SrcLayout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + + return Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6], array_layout, P...>( + src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, + i_tile6); +} + +// Rank 8 +template <typename T, Kokkos::Iterate OuterP, Kokkos::Iterate InnerP, + unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, + unsigned N5, unsigned N6, unsigned N7, class... P> +KOKKOS_INLINE_FUNCTION Kokkos::View< + T[N0][N1][N2][N3][N4][N5][N6][N7], + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type, + P...> +tile_subview(const Kokkos::View< + T********, + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, + N3, N4, N5, N6, N7, true>, + P...>& src, + const size_t i_tile0, const size_t i_tile1, const size_t i_tile2, + const size_t i_tile3, const size_t i_tile4, const size_t i_tile5, + const size_t i_tile6, const size_t i_tile7) { + // Force the specialized ViewMapping for extracting a tile + // by using the first subview argument as the layout. 
+ using array_layout = + typename std::conditional<(InnerP == Kokkos::Iterate::Left), + Kokkos::LayoutLeft, Kokkos::LayoutRight>::type; + using SrcLayout = + Kokkos::Experimental::LayoutTiled<OuterP, InnerP, N0, N1, N2, N3, N4, N5, + N6, N7, true>; + + return Kokkos::View<T[N0][N1][N2][N3][N4][N5][N6][N7], array_layout, P...>( + src, SrcLayout(), i_tile0, i_tile1, i_tile2, i_tile3, i_tile4, i_tile5, + i_tile6, i_tile7); +} + +} /* namespace Kokkos */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_EXPERIENTAL_VIEWLAYOUTTILE_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a380a306931f4150e95b6f433c8bb076b091c456 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -0,0 +1,3903 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP +#define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP + +#include <type_traits> +#include <initializer_list> + +#include <Kokkos_Core_fwd.hpp> +#include <Kokkos_Pair.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_Extents.hpp> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_ViewTracker.hpp> +#include <impl/Kokkos_ViewCtor.hpp> +#include <impl/Kokkos_Atomic_View.hpp> +#include <impl/Kokkos_Tools.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <unsigned I, size_t... Args> +struct variadic_size_t { + enum : size_t { value = KOKKOS_INVALID_INDEX }; +}; + +template <size_t Val, size_t... 
Args> +struct variadic_size_t<0, Val, Args...> { + enum : size_t { value = Val }; +}; + +template <unsigned I, size_t Val, size_t... Args> +struct variadic_size_t<I, Val, Args...> { + enum : size_t { value = variadic_size_t<I - 1, Args...>::value }; +}; + +template <size_t... Args> +struct rank_dynamic; + +template <> +struct rank_dynamic<> { + enum : unsigned { value = 0 }; +}; + +template <size_t Val, size_t... Args> +struct rank_dynamic<Val, Args...> { + enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic<Args...>::value }; +}; + +#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ + template <size_t V, unsigned> \ + struct ViewDimension##R { \ + static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + }; \ + template <size_t V, unsigned RD> \ + constexpr size_t ViewDimension##R<V, RD>::ArgN##R; \ + template <size_t V, unsigned RD> \ + constexpr size_t ViewDimension##R<V, RD>::N##R; \ + template <unsigned RD> \ + struct ViewDimension##R<0u, RD> { \ + static constexpr size_t ArgN##R = 0; \ + typename std::conditional<(RD < 3), size_t, unsigned>::type N##R; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ + }; \ + template <unsigned RD> \ + constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; + +KOKKOS_IMPL_VIEW_DIMENSION(0) +KOKKOS_IMPL_VIEW_DIMENSION(1) +KOKKOS_IMPL_VIEW_DIMENSION(2) +KOKKOS_IMPL_VIEW_DIMENSION(3) +KOKKOS_IMPL_VIEW_DIMENSION(4) +KOKKOS_IMPL_VIEW_DIMENSION(5) +KOKKOS_IMPL_VIEW_DIMENSION(6) +KOKKOS_IMPL_VIEW_DIMENSION(7) + +#undef KOKKOS_IMPL_VIEW_DIMENSION + 
+// MSVC does not do empty base class optimization by default. +// Per standard it is required for standard layout types +template <size_t... Vals> +struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension + : public ViewDimension0<variadic_size_t<0u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension1<variadic_size_t<1u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension2<variadic_size_t<2u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension3<variadic_size_t<3u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension4<variadic_size_t<4u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension5<variadic_size_t<5u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension6<variadic_size_t<6u, Vals...>::value, + rank_dynamic<Vals...>::value>, + public ViewDimension7<variadic_size_t<7u, Vals...>::value, + rank_dynamic<Vals...>::value> { + using D0 = ViewDimension0<variadic_size_t<0U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D1 = ViewDimension1<variadic_size_t<1U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D2 = ViewDimension2<variadic_size_t<2U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D3 = ViewDimension3<variadic_size_t<3U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D4 = ViewDimension4<variadic_size_t<4U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D5 = ViewDimension5<variadic_size_t<5U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D6 = ViewDimension6<variadic_size_t<6U, Vals...>::value, + rank_dynamic<Vals...>::value>; + using D7 = ViewDimension7<variadic_size_t<7U, Vals...>::value, + rank_dynamic<Vals...>::value>; + + using D0::ArgN0; + using D1::ArgN1; + using D2::ArgN2; + using D3::ArgN3; + using D4::ArgN4; + using D5::ArgN5; + using D6::ArgN6; + using D7::ArgN7; + + using D0::N0; + using D1::N1; + using D2::N2; + using D3::N3; + using D4::N4; + using 
D5::N5; + using D6::N6; + using D7::N7; + + enum : unsigned { rank = sizeof...(Vals) }; + enum : unsigned { rank_dynamic = Impl::rank_dynamic<Vals...>::value }; + + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; + ViewDimension& operator=(const ViewDimension&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, + size_t n5, size_t n6, size_t n7) + : D0(n0), D1(n1), D2(n2), D3(n3), D4(n4), D5(n5), D6(n6), D7(n7) {} + + KOKKOS_INLINE_FUNCTION + constexpr size_t extent(const unsigned r) const noexcept { + return r == 0 + ? N0 + : (r == 1 + ? N1 + : (r == 2 + ? N2 + : (r == 3 + ? N3 + : (r == 4 + ? N4 + : (r == 5 + ? N5 + : (r == 6 + ? N6 + : (r == 7 ? N7 + : 0))))))); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return r == 0 + ? ArgN0 + : (r == 1 + ? ArgN1 + : (r == 2 + ? ArgN2 + : (r == 3 + ? ArgN3 + : (r == 4 + ? ArgN4 + : (r == 5 + ? ArgN5 + : (r == 6 + ? ArgN6 + : (r == 7 ? ArgN7 + : 0))))))); + } + + template <size_t N> + struct prepend { + using type = ViewDimension<N, Vals...>; + }; + + template <size_t N> + struct append { + using type = ViewDimension<Vals..., N>; + }; +}; + +template <class A, class B> +struct ViewDimensionJoin; + +template <size_t... A, size_t... B> +struct ViewDimensionJoin<ViewDimension<A...>, ViewDimension<B...>> { + using type = ViewDimension<A..., B...>; +}; + +//---------------------------------------------------------------------------- + +template <class DstDim, class SrcDim> +struct ViewDimensionAssignable; + +template <size_t... DstArgs, size_t... 
SrcArgs> +struct ViewDimensionAssignable<ViewDimension<DstArgs...>, + ViewDimension<SrcArgs...>> { + using dst = ViewDimension<DstArgs...>; + using src = ViewDimension<SrcArgs...>; + + enum { + value = unsigned(dst::rank) == unsigned(src::rank) && + ( + // Compile time check that potential static dimensions match + ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) + ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) + : true) && + ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) + ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) + : true) && + ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) + ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) + : true) && + ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) + ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) + : true) && + ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) + ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) + : true) && + ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) + ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) + : true) && + ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) + ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) + : true) && + ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) + ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) + : true)) + }; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +struct ALL_t { + KOKKOS_INLINE_FUNCTION + constexpr const ALL_t& operator()() const { return *this; } + + KOKKOS_INLINE_FUNCTION + constexpr bool operator==(const ALL_t&) const { return true; } +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +template <class T> +struct is_integral_extent_type { + enum : bool { value = std::is_same<T, Kokkos::Impl::ALL_t>::value ? 
1 : 0 }; +}; + +template <class iType> +struct is_integral_extent_type<std::pair<iType, iType>> { + enum : bool { value = std::is_integral<iType>::value ? 1 : 0 }; +}; + +template <class iType> +struct is_integral_extent_type<Kokkos::pair<iType, iType>> { + enum : bool { value = std::is_integral<iType>::value ? 1 : 0 }; +}; + +// Assuming '2 == initializer_list<iType>::size()' +template <class iType> +struct is_integral_extent_type<std::initializer_list<iType>> { + enum : bool { value = std::is_integral<iType>::value ? 1 : 0 }; +}; + +template <unsigned I, class... Args> +struct is_integral_extent { + // get_type is void when sizeof...(Args) <= I + using type = typename std::remove_cv<typename std::remove_reference< + typename Kokkos::Impl::get_type<I, Args...>::type>::type>::type; + + enum : bool { value = is_integral_extent_type<type>::value }; + + static_assert(value || std::is_integral<type>::value || + std::is_same<type, void>::value, + "subview argument must be either integral or integral extent"); +}; + +// Rules for subview arguments and layouts matching + +template <class LayoutDest, class LayoutSrc, int RankDest, int RankSrc, + int CurrentArg, class... SubViewArgs> +struct SubviewLegalArgsCompileTime; + +// Rules which allow LayoutLeft to LayoutLeft assignment + +template <int RankDest, int RankSrc, int CurrentArg, class Arg, + class... 
SubViewArgs> +struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, + RankDest, RankSrc, CurrentArg, Arg, + SubViewArgs...> { + enum { + value = (((CurrentArg == RankDest - 1) && + (Kokkos::Impl::is_integral_extent_type<Arg>::value)) || + ((CurrentArg >= RankDest) && (std::is_integral<Arg>::value)) || + ((CurrentArg < RankDest) && + (std::is_same<Arg, Kokkos::Impl::ALL_t>::value)) || + ((CurrentArg == 0) && + (Kokkos::Impl::is_integral_extent_type<Arg>::value))) && + (SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, + RankDest, RankSrc, CurrentArg + 1, + SubViewArgs...>::value) + }; +}; + +template <int RankDest, int RankSrc, int CurrentArg, class Arg> +struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, + RankDest, RankSrc, CurrentArg, Arg> { + enum { + value = ((CurrentArg == RankDest - 1) || (std::is_integral<Arg>::value)) && + (CurrentArg == RankSrc - 1) + }; +}; + +// Rules which allow LayoutRight to LayoutRight assignment + +template <int RankDest, int RankSrc, int CurrentArg, class Arg, + class... 
SubViewArgs> +struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, + RankDest, RankSrc, CurrentArg, Arg, + SubViewArgs...> { + enum { + value = (((CurrentArg == RankSrc - RankDest) && + (Kokkos::Impl::is_integral_extent_type<Arg>::value)) || + ((CurrentArg < RankSrc - RankDest) && + (std::is_integral<Arg>::value)) || + ((CurrentArg >= RankSrc - RankDest) && + (std::is_same<Arg, Kokkos::Impl::ALL_t>::value))) && + (SubviewLegalArgsCompileTime<Kokkos::LayoutRight, + Kokkos::LayoutRight, RankDest, RankSrc, + CurrentArg + 1, SubViewArgs...>::value) + }; +}; + +template <int RankDest, int RankSrc, int CurrentArg, class Arg> +struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, + RankDest, RankSrc, CurrentArg, Arg> { + enum { + value = ((CurrentArg == RankSrc - 1) && + (std::is_same<Arg, Kokkos::Impl::ALL_t>::value)) + }; +}; + +// Rules which allow assignment to LayoutStride + +template <int RankDest, int RankSrc, int CurrentArg, class... SubViewArgs> +struct SubviewLegalArgsCompileTime<Kokkos::LayoutStride, Kokkos::LayoutLeft, + RankDest, RankSrc, CurrentArg, + SubViewArgs...> { + enum : bool { value = true }; +}; + +template <int RankDest, int RankSrc, int CurrentArg, class... SubViewArgs> +struct SubviewLegalArgsCompileTime<Kokkos::LayoutStride, Kokkos::LayoutRight, + RankDest, RankSrc, CurrentArg, + SubViewArgs...> { + enum : bool { value = true }; +}; + +template <int RankDest, int RankSrc, int CurrentArg, class... SubViewArgs> +struct SubviewLegalArgsCompileTime<Kokkos::LayoutStride, Kokkos::LayoutStride, + RankDest, RankSrc, CurrentArg, + SubViewArgs...> { + enum : bool { value = true }; +}; + +template <unsigned DomainRank, unsigned RangeRank> +struct SubviewExtents { + private: + // Cannot declare zero-length arrays + // '+' is used to silence GCC 7.2.0 -Wduplicated-branches warning when + // RangeRank=1 + enum { InternalRangeRank = RangeRank ? 
RangeRank : +1u }; + + size_t m_begin[DomainRank]; + size_t m_length[InternalRangeRank]; + unsigned m_index[InternalRangeRank]; + + template <size_t... DimArgs> + KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned, unsigned, + const ViewDimension<DimArgs...>&) { + return true; + } + + template <class T, size_t... DimArgs, class... Args> + KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned domain_rank, + unsigned range_rank, + const ViewDimension<DimArgs...>& dim, + const T& val, Args... args) { + const size_t v = static_cast<size_t>(val); + + m_begin[domain_rank] = v; + + return set(domain_rank + 1, range_rank, dim, args...) +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + && (v < dim.extent(domain_rank)) +#endif + ; + } + + // ALL_t + template <size_t... DimArgs, class... Args> + KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned domain_rank, + unsigned range_rank, + const ViewDimension<DimArgs...>& dim, + const Kokkos::Impl::ALL_t, + Args... args) { + m_begin[domain_rank] = 0; + m_length[range_rank] = dim.extent(domain_rank); + m_index[range_rank] = domain_rank; + + return set(domain_rank + 1, range_rank + 1, dim, args...); + } + + // std::pair range + template <class T, size_t... DimArgs, class... Args> + KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned domain_rank, + unsigned range_rank, + const ViewDimension<DimArgs...>& dim, + const std::pair<T, T>& val, + Args... args) { + const size_t b = static_cast<size_t>(val.first); + const size_t e = static_cast<size_t>(val.second); + + m_begin[domain_rank] = b; + m_length[range_rank] = e - b; + m_index[range_rank] = domain_rank; + + return set(domain_rank + 1, range_rank + 1, dim, args...) +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + && (e <= b + dim.extent(domain_rank)) +#endif + ; + } + + // Kokkos::pair range + template <class T, size_t... DimArgs, class... Args> + KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned domain_rank, + unsigned range_rank, + const ViewDimension<DimArgs...>& dim, + const Kokkos::pair<T, T>& val, + Args... 
args) { + const size_t b = static_cast<size_t>(val.first); + const size_t e = static_cast<size_t>(val.second); + + m_begin[domain_rank] = b; + m_length[range_rank] = e - b; + m_index[range_rank] = domain_rank; + + return set(domain_rank + 1, range_rank + 1, dim, args...) +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + && (e <= b + dim.extent(domain_rank)) +#endif + ; + } + + // { begin , end } range + template <class T, size_t... DimArgs, class... Args> + KOKKOS_FORCEINLINE_FUNCTION bool set(unsigned domain_rank, + unsigned range_rank, + const ViewDimension<DimArgs...>& dim, + const std::initializer_list<T>& val, + Args... args) { + const size_t b = static_cast<size_t>(val.begin()[0]); + const size_t e = static_cast<size_t>(val.begin()[1]); + + m_begin[domain_rank] = b; + m_length[range_rank] = e - b; + m_index[range_rank] = domain_rank; + + return set(domain_rank + 1, range_rank + 1, dim, args...) +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + && (val.size() == 2) && (e <= b + dim.extent(domain_rank)) +#endif + ; + } + + //------------------------------ + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + + template <size_t... DimArgs> + void error(char*, int, unsigned, unsigned, + const ViewDimension<DimArgs...>&) const {} + + template <class T, size_t... DimArgs, class... Args> + void error(char* buf, int buf_len, unsigned domain_rank, unsigned range_rank, + const ViewDimension<DimArgs...>& dim, const T& val, + Args... args) const { + const int n = std::min( + buf_len, + snprintf(buf, buf_len, " %lu < %lu %c", static_cast<unsigned long>(val), + static_cast<unsigned long>(dim.extent(domain_rank)), + int(sizeof...(Args) ? ',' : ')'))); + + error(buf + n, buf_len - n, domain_rank + 1, range_rank, dim, args...); + } + + // std::pair range + template <size_t... DimArgs, class... Args> + void error(char* buf, int buf_len, unsigned domain_rank, unsigned range_rank, + const ViewDimension<DimArgs...>& dim, const Kokkos::Impl::ALL_t, + Args... 
args) const { + const int n = std::min(buf_len, snprintf(buf, buf_len, " Kokkos::ALL %c", + int(sizeof...(Args) ? ',' : ')'))); + + error(buf + n, buf_len - n, domain_rank + 1, range_rank + 1, dim, args...); + } + + // std::pair range + template <class T, size_t... DimArgs, class... Args> + void error(char* buf, int buf_len, unsigned domain_rank, unsigned range_rank, + const ViewDimension<DimArgs...>& dim, const std::pair<T, T>& val, + Args... args) const { + // d <= e - b + const int n = std::min( + buf_len, snprintf(buf, buf_len, " %lu <= %lu - %lu %c", + static_cast<unsigned long>(dim.extent(domain_rank)), + static_cast<unsigned long>(val.second), + static_cast<unsigned long>(val.first), + int(sizeof...(Args) ? ',' : ')'))); + + error(buf + n, buf_len - n, domain_rank + 1, range_rank + 1, dim, args...); + } + + // Kokkos::pair range + template <class T, size_t... DimArgs, class... Args> + void error(char* buf, int buf_len, unsigned domain_rank, unsigned range_rank, + const ViewDimension<DimArgs...>& dim, + const Kokkos::pair<T, T>& val, Args... args) const { + // d <= e - b + const int n = std::min( + buf_len, snprintf(buf, buf_len, " %lu <= %lu - %lu %c", + static_cast<unsigned long>(dim.extent(domain_rank)), + static_cast<unsigned long>(val.second), + static_cast<unsigned long>(val.first), + int(sizeof...(Args) ? ',' : ')'))); + + error(buf + n, buf_len - n, domain_rank + 1, range_rank + 1, dim, args...); + } + + // { begin , end } range + template <class T, size_t... DimArgs, class... Args> + void error(char* buf, int buf_len, unsigned domain_rank, unsigned range_rank, + const ViewDimension<DimArgs...>& dim, + const std::initializer_list<T>& val, Args... 
args) const { + // d <= e - b + int n = 0; + if (val.size() == 2) { + n = std::min(buf_len, + snprintf(buf, buf_len, " %lu <= %lu - %lu %c", + static_cast<unsigned long>(dim.extent(domain_rank)), + static_cast<unsigned long>(val.begin()[0]), + static_cast<unsigned long>(val.begin()[1]), + int(sizeof...(Args) ? ',' : ')'))); + } else { + n = std::min(buf_len, snprintf(buf, buf_len, " { ... }.size() == %u %c", + unsigned(val.size()), + int(sizeof...(Args) ? ',' : ')'))); + } + + error(buf + n, buf_len - n, domain_rank + 1, range_rank + 1, dim, args...); + } + +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + template <size_t... DimArgs, class... Args> + KOKKOS_FORCEINLINE_FUNCTION void error(const ViewDimension<DimArgs...>& dim, + Args... args) const { + enum { LEN = 1024 }; + char buffer[LEN]; + + const int n = snprintf(buffer, LEN, "Kokkos::subview bounds error ("); + error(buffer + n, LEN - n, 0, 0, dim, args...); + + Kokkos::Impl::throw_runtime_exception(std::string(buffer)); + } +#else + template <size_t... DimArgs, class... Args> + KOKKOS_FORCEINLINE_FUNCTION void error(const ViewDimension<DimArgs...>&, + Args...) const { + Kokkos::abort("Kokkos::subview bounds error"); + } +#endif + +#else + + template <size_t... DimArgs, class... Args> + KOKKOS_FORCEINLINE_FUNCTION void error(const ViewDimension<DimArgs...>&, + Args...) const {} + +#endif + + public: + template <size_t... DimArgs, class... Args> + KOKKOS_INLINE_FUNCTION SubviewExtents(const ViewDimension<DimArgs...>& dim, + Args... args) { + static_assert(DomainRank == sizeof...(DimArgs), ""); + static_assert(DomainRank == sizeof...(Args), ""); + + // Verifies that all arguments, up to 8, are integral types, + // integral extents, or don't exist. 
+ static_assert( + RangeRank == unsigned(is_integral_extent<0, Args...>::value) + + unsigned(is_integral_extent<1, Args...>::value) + + unsigned(is_integral_extent<2, Args...>::value) + + unsigned(is_integral_extent<3, Args...>::value) + + unsigned(is_integral_extent<4, Args...>::value) + + unsigned(is_integral_extent<5, Args...>::value) + + unsigned(is_integral_extent<6, Args...>::value) + + unsigned(is_integral_extent<7, Args...>::value), + ""); + + if (RangeRank == 0) { + m_length[0] = 0; + m_index[0] = ~0u; + } + + if (!set(0, 0, dim, args...)) error(dim, args...); + } + + template <typename iType> + KOKKOS_FORCEINLINE_FUNCTION constexpr size_t domain_offset( + const iType i) const { + return unsigned(i) < DomainRank ? m_begin[i] : 0; + } + + template <typename iType> + KOKKOS_FORCEINLINE_FUNCTION constexpr size_t range_extent( + const iType i) const { + return unsigned(i) < InternalRangeRank ? m_length[i] : 0; + } + + template <typename iType> + KOKKOS_FORCEINLINE_FUNCTION constexpr unsigned range_index( + const iType i) const { + return unsigned(i) < InternalRangeRank ? m_index[i] : ~0u; + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Given a value type and dimension generate the View data type */ +template <class T, class Dim> +struct ViewDataType; + +template <class T> +struct ViewDataType<T, ViewDimension<>> { + using type = T; +}; + +template <class T, size_t... Args> +struct ViewDataType<T, ViewDimension<0, Args...>> { + using type = typename ViewDataType<T*, ViewDimension<Args...>>::type; +}; + +template <class T, size_t N, size_t... Args> +struct ViewDataType<T, ViewDimension<N, Args...>> { + using type = typename ViewDataType<T, ViewDimension<Args...>>::type[N]; +}; + +/**\brief Analysis of View data type. 
+ * + * Data type conforms to one of the following patterns : + * {const} value_type [][#][#][#] + * {const} value_type ***[#][#][#] + * Where the sum of counts of '*' and '[#]' is at most ten. + * + * Provide alias for ViewDimension<...> and value_type. + */ +template <class T> +struct ViewArrayAnalysis { + using value_type = T; + using const_value_type = typename std::add_const<T>::type; + using non_const_value_type = typename std::remove_const<T>::type; + using static_dimension = ViewDimension<>; + using dynamic_dimension = ViewDimension<>; + using dimension = ViewDimension<>; +}; + +template <class T, size_t N> +struct ViewArrayAnalysis<T[N]> { + private: + using nested = ViewArrayAnalysis<T>; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using static_dimension = + typename nested::static_dimension::template prepend<N>::type; + + using dynamic_dimension = typename nested::dynamic_dimension; + + using dimension = + typename ViewDimensionJoin<dynamic_dimension, static_dimension>::type; +}; + +template <class T> +struct ViewArrayAnalysis<T[]> { + private: + using nested = ViewArrayAnalysis<T>; + using nested_dimension = typename nested::dimension; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin<dynamic_dimension, static_dimension>::type; +}; + +template <class T> +struct ViewArrayAnalysis<T*> { + private: + using nested = ViewArrayAnalysis<T>; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; 
+ using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin<dynamic_dimension, static_dimension>::type; +}; + +template <class DataType, class ArrayLayout, class ValueType> +struct ViewDataAnalysis { + private: + using array_analysis = ViewArrayAnalysis<DataType>; + + // ValueType is opportunity for partial specialization. + // Must match array analysis when this default template is used. + static_assert( + std::is_same<ValueType, + typename array_analysis::non_const_value_type>::value, + ""); + + public: + using specialize = void; // No specialization + + using dimension = typename array_analysis::dimension; + using value_type = typename array_analysis::value_type; + using const_value_type = typename array_analysis::const_value_type; + using non_const_value_type = typename array_analysis::non_const_value_type; + + // Generate analogous multidimensional array specification type. + using type = typename ViewDataType<value_type, dimension>::type; + using const_type = typename ViewDataType<const_value_type, dimension>::type; + using non_const_type = + typename ViewDataType<non_const_value_type, dimension>::type; + + // Generate "flattened" multidimensional array specification type. 
+ using scalar_array_type = type; + using const_scalar_array_type = const_type; + using non_const_scalar_array_type = non_const_type; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template <class Dimension, class Layout, typename Enable = void> +struct ViewOffset { + using is_mapping_plugin = std::false_type; +}; + +//---------------------------------------------------------------------------- +// LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding +template <class Dimension> +struct ViewOffset< + Dimension, Kokkos::LayoutLeft, + typename std::enable_if<(1 >= Dimension::rank || + 0 == Dimension::rank_dynamic)>::type> { + using is_mapping_plugin = std::true_type; + using is_regular = std::true_type; + + using size_type = size_t; + using dimension_type = Dimension; + using array_layout = Kokkos::LayoutLeft; + + dimension_type m_dim; + + //---------------------------------------- + + // rank 1 + template <typename I0> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0) const { + return i0; + } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1) const { + return i0 + m_dim.N0 * i1; + } + + // rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2) const { + return i0 + m_dim.N0 * (i1 + m_dim.N1 * i2); + } + + // rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3) const { + return i0 + m_dim.N0 * (i1 + m_dim.N1 * (i2 + m_dim.N2 * i3)); + } + + // rank 5 + template <typename I0, typename I1, typename I2, 
typename I3, typename I4> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3, + I4 const& i4) const { + return i0 + + m_dim.N0 * (i1 + m_dim.N1 * (i2 + m_dim.N2 * (i3 + m_dim.N3 * i4))); + } + + // rank 6 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5) const { + return i0 + + m_dim.N0 * + (i1 + + m_dim.N1 * + (i2 + m_dim.N2 * (i3 + m_dim.N3 * (i4 + m_dim.N4 * i5)))); + } + + // rank 7 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6) const { + return i0 + + m_dim.N0 * + (i1 + m_dim.N1 * + (i2 + m_dim.N2 * + (i3 + m_dim.N3 * + (i4 + m_dim.N4 * + (i5 + m_dim.N5 * i6))))); + } + + // rank 8 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6, I7 const& i7) const { + return i0 + + m_dim.N0 * + (i1 + + m_dim.N1 * + (i2 + m_dim.N2 * + (i3 + m_dim.N3 * + (i4 + m_dim.N4 * + (i5 + m_dim.N5 * + (i6 + m_dim.N6 * + i7)))))); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + constexpr array_layout layout() const { + return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N3, m_dim.N4, + m_dim.N5, m_dim.N6, m_dim.N7); + } + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { + return m_dim.N0; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { + return m_dim.N1; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { + 
return m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { + return m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { + return m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { + return m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { + return m_dim.N6; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { + return m_dim.N7; + } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6 * m_dim.N7; + } + + /* Span of the range space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6 * m_dim.N7; + } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { + return true; + } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { + return m_dim.N0; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { + return m_dim.N0 * m_dim.N1; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6; + } + + // Stride with [ rank ] value is the total length + template <typename 
iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[0] = 1; + if (0 < dimension_type::rank) { + s[1] = m_dim.N0; + } + if (1 < dimension_type::rank) { + s[2] = s[1] * m_dim.N1; + } + if (2 < dimension_type::rank) { + s[3] = s[2] * m_dim.N2; + } + if (3 < dimension_type::rank) { + s[4] = s[3] * m_dim.N3; + } + if (4 < dimension_type::rank) { + s[5] = s[4] * m_dim.N4; + } + if (5 < dimension_type::rank) { + s[6] = s[5] * m_dim.N5; + } + if (6 < dimension_type::rank) { + s[7] = s[6] * m_dim.N6; + } + if (7 < dimension_type::rank) { + s[8] = s[7] * m_dim.N7; + } + } + + //---------------------------------------- + + // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions + // correct and errors out during compilation. Same for the other places where + // I changed this. +#ifdef KOKKOS_IMPL_WINDOWS_CUDA + KOKKOS_FUNCTION ViewOffset() : m_dim(dimension_type()) {} + KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { m_dim = src.m_dim; } + KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { + m_dim = src.m_dim; + return *this; + } +#else + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; + ViewOffset& operator=(const ViewOffset&) = default; +#endif + + template <unsigned TrivialScalarSize> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + std::integral_constant<unsigned, TrivialScalarSize> const&, + Kokkos::LayoutLeft const& arg_layout) + : m_dim(arg_layout.dimension[0], 0, 0, 0, 0, 0, 0, 0) {} + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs) + : m_dim(rhs.m_dim.N0, rhs.m_dim.N1, rhs.m_dim.N2, rhs.m_dim.N3, + rhs.m_dim.N4, rhs.m_dim.N5, rhs.m_dim.N6, rhs.m_dim.N7) { + static_assert(int(DimRHS::rank) == int(dimension_type::rank), + "ViewOffset assignment requires equal rank"); + // Also requires equal static dimensions ... 
+ } + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs) + : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { + static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || + (DimRHS::rank == 1 && dimension_type::rank == 1 && + dimension_type::rank_dynamic == 1), + "ViewOffset LayoutLeft and LayoutRight are only compatible " + "when rank <= 1"); + } + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutStride, void>& rhs) + : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { + if (rhs.m_stride.S0 != 1) { + Kokkos::abort( + "Kokkos::Impl::ViewOffset assignment of LayoutLeft from LayoutStride " + " requires stride == 1"); + } + } + + //---------------------------------------- + // Subview construction + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>&, + const SubviewExtents<DimRHS::rank, dimension_type::rank>& sub) + : m_dim(sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0) { + static_assert((0 == dimension_type::rank_dynamic) || + (1 == dimension_type::rank && + 1 == dimension_type::rank_dynamic && 1 <= DimRHS::rank), + "ViewOffset subview construction requires compatible rank"); + } +}; + +//---------------------------------------------------------------------------- +// LayoutLeft AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding +template <class Dimension> +struct ViewOffset< + Dimension, Kokkos::LayoutLeft, + typename std::enable_if<(1 < Dimension::rank && + 0 < Dimension::rank_dynamic)>::type> { + using is_mapping_plugin = std::true_type; + using is_regular = std::true_type; + + using size_type = size_t; + using dimension_type = Dimension; + using array_layout = Kokkos::LayoutLeft; + + dimension_type m_dim; + size_type m_stride; + + //---------------------------------------- + + // rank 1 + template <typename I0> + KOKKOS_INLINE_FUNCTION constexpr 
size_type operator()(I0 const& i0) const { + return i0; + } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1) const { + return i0 + m_stride * i1; + } + + // rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2) const { + return i0 + m_stride * (i1 + m_dim.N1 * i2); + } + + // rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3) const { + return i0 + m_stride * (i1 + m_dim.N1 * (i2 + m_dim.N2 * i3)); + } + + // rank 5 + template <typename I0, typename I1, typename I2, typename I3, typename I4> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3, + I4 const& i4) const { + return i0 + + m_stride * (i1 + m_dim.N1 * (i2 + m_dim.N2 * (i3 + m_dim.N3 * i4))); + } + + // rank 6 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5) const { + return i0 + + m_stride * + (i1 + + m_dim.N1 * + (i2 + m_dim.N2 * (i3 + m_dim.N3 * (i4 + m_dim.N4 * i5)))); + } + + // rank 7 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6) const { + return i0 + + m_stride * + (i1 + m_dim.N1 * + (i2 + m_dim.N2 * + (i3 + m_dim.N3 * + (i4 + m_dim.N4 * + (i5 + m_dim.N5 * i6))))); + } + + // rank 8 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + 
KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6, I7 const& i7) const { + return i0 + + m_stride * + (i1 + + m_dim.N1 * + (i2 + m_dim.N2 * + (i3 + m_dim.N3 * + (i4 + m_dim.N4 * + (i5 + m_dim.N5 * + (i6 + m_dim.N6 * + i7)))))); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + constexpr array_layout layout() const { + return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N3, m_dim.N4, + m_dim.N5, m_dim.N6, m_dim.N7); + } + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { + return m_dim.N0; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { + return m_dim.N1; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { + return m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { + return m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { + return m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { + return m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { + return m_dim.N6; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { + return m_dim.N7; + } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6 * m_dim.N7; + } + + /* Span of the range space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const { + return (m_dim.N0 > size_type(0) ? 
m_stride : size_type(0)) * m_dim.N1 * + m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7; + } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { + return m_stride == m_dim.N0; + } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { + return m_stride; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { + return m_stride * m_dim.N1; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { + return m_stride * m_dim.N1 * m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { + return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { + return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { + return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { + return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6; + } + + // Stride with [ rank ] value is the total length + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + s[0] = 1; + if (0 < dimension_type::rank) { + s[1] = m_stride; + } + if (1 < dimension_type::rank) { + s[2] = s[1] * m_dim.N1; + } + if (2 < dimension_type::rank) { + s[3] = s[2] * m_dim.N2; + } + if (3 < dimension_type::rank) { + s[4] = s[3] * m_dim.N3; + } + if (4 < dimension_type::rank) { + s[5] = s[4] * m_dim.N4; + } + if (5 < dimension_type::rank) { + s[6] = s[5] * m_dim.N5; + } + if (6 < dimension_type::rank) { + s[7] = s[6] * m_dim.N6; + } + if (7 < dimension_type::rank) { + s[8] = s[7] * m_dim.N7; + } + } + + //---------------------------------------- + + private: + template <unsigned TrivialScalarSize> + struct Padding { + enum { + div = TrivialScalarSize == 0 + ? 
0 + : Kokkos::Impl::MEMORY_ALIGNMENT / + (TrivialScalarSize ? TrivialScalarSize : 1) + }; + enum { + mod = TrivialScalarSize == 0 + ? 0 + : Kokkos::Impl::MEMORY_ALIGNMENT % + (TrivialScalarSize ? TrivialScalarSize : 1) + }; + + // If memory alignment is a multiple of the trivial scalar size then attempt + // to align. + enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 }; + enum { + div_ok = (div != 0) ? div : 1 + }; // To valid modulo zero in constexpr + + KOKKOS_INLINE_FUNCTION + static constexpr size_t stride(size_t const N) { + return ((align != 0) && + ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && + ((N % div_ok) != 0)) + ? N + align - (N % div_ok) + : N; + } + }; + + public: + // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions + // correct and errors out during compilation. Same for the other places where + // I changed this. +#ifdef KOKKOS_IMPL_WINDOWS_CUDA + KOKKOS_FUNCTION ViewOffset() : m_dim(dimension_type()), m_stride(0) {} + KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { + m_dim = src.m_dim; + m_stride = src.m_stride; + } + KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { + m_dim = src.m_dim; + m_stride = src.m_stride; + return *this; + } +#else + + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; + ViewOffset& operator=(const ViewOffset&) = default; +#endif + + /* Enable padding for trivial scalar types with non-zero trivial scalar size + */ + template <unsigned TrivialScalarSize> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + std::integral_constant<unsigned, TrivialScalarSize> const&, + Kokkos::LayoutLeft const& arg_layout) + : m_dim(arg_layout.dimension[0], arg_layout.dimension[1], + arg_layout.dimension[2], arg_layout.dimension[3], + arg_layout.dimension[4], arg_layout.dimension[5], + arg_layout.dimension[6], arg_layout.dimension[7]), + m_stride(Padding<TrivialScalarSize>::stride(arg_layout.dimension[0])) {} + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION 
constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs) + : m_dim(rhs.m_dim.N0, rhs.m_dim.N1, rhs.m_dim.N2, rhs.m_dim.N3, + rhs.m_dim.N4, rhs.m_dim.N5, rhs.m_dim.N6, rhs.m_dim.N7), + m_stride(rhs.stride_1()) { + static_assert(int(DimRHS::rank) == int(dimension_type::rank), + "ViewOffset assignment requires equal rank"); + // Also requires equal static dimensions ... + } + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutStride, void>& rhs) + : m_dim(rhs.m_dim.N0, rhs.m_dim.N1, rhs.m_dim.N2, rhs.m_dim.N3, + rhs.m_dim.N4, rhs.m_dim.N5, rhs.m_dim.N6, rhs.m_dim.N7), + m_stride(rhs.stride_1()) { + if (rhs.m_stride.S0 != 1) { + Kokkos::abort( + "Kokkos::Impl::ViewOffset assignment of LayoutLeft from LayoutStride " + "requires stride == 1"); + } + } + + //---------------------------------------- + // Subview construction + // This subview must be 2 == rank and 2 == rank_dynamic + // due to only having stride #0. + // The source dimension #0 must be non-zero for stride-one leading dimension. + // At most subsequent dimension can be non-zero. + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs, + const SubviewExtents<DimRHS::rank, dimension_type::rank>& sub) + : m_dim(sub.range_extent(0), sub.range_extent(1), sub.range_extent(2), + sub.range_extent(3), sub.range_extent(4), sub.range_extent(5), + sub.range_extent(6), sub.range_extent(7)), + m_stride( + (1 == sub.range_index(1) + ? rhs.stride_1() + : (2 == sub.range_index(1) + ? rhs.stride_2() + : (3 == sub.range_index(1) + ? rhs.stride_3() + : (4 == sub.range_index(1) + ? rhs.stride_4() + : (5 == sub.range_index(1) + ? rhs.stride_5() + : (6 == sub.range_index(1) + ? rhs.stride_6() + : (7 == sub.range_index(1) + ? 
rhs.stride_7() + : 0)))))))) { + // static_assert( ( 2 == dimension_type::rank ) && + // ( 2 == dimension_type::rank_dynamic ) && + // ( 2 <= DimRHS::rank ) + // , "ViewOffset subview construction requires compatible rank" + // ); + } +}; + +//---------------------------------------------------------------------------- +// LayoutRight AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding +template <class Dimension> +struct ViewOffset< + Dimension, Kokkos::LayoutRight, + typename std::enable_if<(1 >= Dimension::rank || + 0 == Dimension::rank_dynamic)>::type> { + using is_mapping_plugin = std::true_type; + using is_regular = std::true_type; + + using size_type = size_t; + using dimension_type = Dimension; + using array_layout = Kokkos::LayoutRight; + + dimension_type m_dim; + + //---------------------------------------- + + // rank 1 + template <typename I0> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0) const { + return i0; + } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1) const { + return i1 + m_dim.N1 * i0; + } + + // rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2) const { + return i2 + m_dim.N2 * (i1 + m_dim.N1 * (i0)); + } + + // rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3) const { + return i3 + m_dim.N3 * (i2 + m_dim.N2 * (i1 + m_dim.N1 * (i0))); + } + + // rank 5 + template <typename I0, typename I1, typename I2, typename I3, typename I4> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3, + I4 const& i4) const { + return i4 + m_dim.N4 * + (i3 + m_dim.N3 * (i2 + m_dim.N2 * (i1 + m_dim.N1 * (i0)))); + } + + // rank 6 + 
template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5) const { + return i5 + + m_dim.N5 * + (i4 + + m_dim.N4 * + (i3 + m_dim.N3 * (i2 + m_dim.N2 * (i1 + m_dim.N1 * (i0))))); + } + + // rank 7 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6) const { + return i6 + + m_dim.N6 * + (i5 + + m_dim.N5 * + (i4 + + m_dim.N4 * + (i3 + m_dim.N3 * + (i2 + m_dim.N2 * (i1 + m_dim.N1 * (i0)))))); + } + + // rank 8 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6, I7 const& i7) const { + return i7 + + m_dim.N7 * + (i6 + + m_dim.N6 * + (i5 + + m_dim.N5 * + (i4 + + m_dim.N4 * + (i3 + + m_dim.N3 * + (i2 + m_dim.N2 * (i1 + m_dim.N1 * (i0))))))); + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + constexpr array_layout layout() const { + return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N3, m_dim.N4, + m_dim.N5, m_dim.N6, m_dim.N7); + } + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { + return m_dim.N0; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { + return m_dim.N1; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { + return m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { + return m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { + return m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() 
const { + return m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { + return m_dim.N6; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { + return m_dim.N7; + } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6 * m_dim.N7; + } + + /* Span of the range space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6 * m_dim.N7; + } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { + return true; + } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { + return m_dim.N7; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { + return m_dim.N7 * m_dim.N6; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { + return m_dim.N7 * m_dim.N6 * m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { + return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { + return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { + return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { + return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 * + m_dim.N1; + } + + // Stride with [ rank ] value is the total length + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + size_type n = 1; + if (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; + } + if (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; + } + if (5 < dimension_type::rank) { + s[5] = 
n; + n *= m_dim.N5; + } + if (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; + } + if (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; + } + if (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; + } + if (1 < dimension_type::rank) { + s[1] = n; + n *= m_dim.N1; + } + if (0 < dimension_type::rank) { + s[0] = n; + } + s[dimension_type::rank] = n * m_dim.N0; + } + + //---------------------------------------- + // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions + // correct and errors out during compilation. Same for the other places where + // I changed this. + +#ifdef KOKKOS_IMPL_WINDOWS_CUDA + KOKKOS_FUNCTION ViewOffset() : m_dim(dimension_type()) {} + KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { m_dim = src.m_dim; } + KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { + m_dim = src.m_dim; + return *this; + } +#else + + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; + ViewOffset& operator=(const ViewOffset&) = default; +#endif + + template <unsigned TrivialScalarSize> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + std::integral_constant<unsigned, TrivialScalarSize> const&, + Kokkos::LayoutRight const& arg_layout) + : m_dim(arg_layout.dimension[0], 0, 0, 0, 0, 0, 0, 0) {} + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs) + : m_dim(rhs.m_dim.N0, rhs.m_dim.N1, rhs.m_dim.N2, rhs.m_dim.N3, + rhs.m_dim.N4, rhs.m_dim.N5, rhs.m_dim.N6, rhs.m_dim.N7) { + static_assert(int(DimRHS::rank) == int(dimension_type::rank), + "ViewOffset assignment requires equal rank"); + // Also requires equal static dimensions ... 
+ } + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs) + : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { + static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || + (DimRHS::rank == 1 && dimension_type::rank == 1 && + dimension_type::rank_dynamic == 1), + "ViewOffset LayoutRight and LayoutLeft are only compatible " + "when rank <= 1"); + } + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutStride, void>& rhs) + : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) {} + + //---------------------------------------- + // Subview construction + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutRight, void>&, + const SubviewExtents<DimRHS::rank, dimension_type::rank>& sub) + : m_dim(sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0) { + static_assert((0 == dimension_type::rank_dynamic) || + (1 == dimension_type::rank && + 1 == dimension_type::rank_dynamic && 1 <= DimRHS::rank), + "ViewOffset subview construction requires compatible rank"); + } +}; + +//---------------------------------------------------------------------------- +// LayoutRight AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding +template <class Dimension> +struct ViewOffset< + Dimension, Kokkos::LayoutRight, + typename std::enable_if<(1 < Dimension::rank && + 0 < Dimension::rank_dynamic)>::type> { + using is_mapping_plugin = std::true_type; + using is_regular = std::true_type; + + using size_type = size_t; + using dimension_type = Dimension; + using array_layout = Kokkos::LayoutRight; + + dimension_type m_dim; + size_type m_stride; + + //---------------------------------------- + + // rank 1 + template <typename I0> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0) const { + return i0; + } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_INLINE_FUNCTION constexpr size_type 
operator()(I0 const& i0, + I1 const& i1) const { + return i1 + i0 * m_stride; + } + + // rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2) const { + return i2 + m_dim.N2 * (i1) + i0 * m_stride; + } + + // rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3) const { + return i3 + m_dim.N3 * (i2 + m_dim.N2 * (i1)) + i0 * m_stride; + } + + // rank 5 + template <typename I0, typename I1, typename I2, typename I3, typename I4> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3, + I4 const& i4) const { + return i4 + m_dim.N4 * (i3 + m_dim.N3 * (i2 + m_dim.N2 * (i1))) + + i0 * m_stride; + } + + // rank 6 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5) const { + return i5 + + m_dim.N5 * + (i4 + m_dim.N4 * (i3 + m_dim.N3 * (i2 + m_dim.N2 * (i1)))) + + i0 * m_stride; + } + + // rank 7 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6) const { + return i6 + + m_dim.N6 * + (i5 + m_dim.N5 * + (i4 + m_dim.N4 * + (i3 + m_dim.N3 * (i2 + m_dim.N2 * (i1))))) + + i0 * m_stride; + } + + // rank 8 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6, I7 const& i7) 
const { + return i7 + + m_dim.N7 * + (i6 + + m_dim.N6 * + (i5 + + m_dim.N5 * + (i4 + m_dim.N4 * + (i3 + m_dim.N3 * (i2 + m_dim.N2 * (i1)))))) + + i0 * m_stride; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + constexpr array_layout layout() const { + return array_layout(m_dim.N0, m_dim.N1, m_dim.N2, m_dim.N3, m_dim.N4, + m_dim.N5, m_dim.N6, m_dim.N7); + } + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { + return m_dim.N0; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { + return m_dim.N1; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { + return m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { + return m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { + return m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { + return m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { + return m_dim.N6; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { + return m_dim.N7; + } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6 * m_dim.N7; + } + + /* Span of the range space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const { + return size() > 0 ? 
m_dim.N0 * m_stride : 0; + } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { + return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * + m_dim.N2 * m_dim.N1; + } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1; } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { + return m_dim.N7; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { + return m_dim.N7 * m_dim.N6; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { + return m_dim.N7 * m_dim.N6 * m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { + return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { + return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { + return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { + return m_stride; + } + + // Stride with [ rank ] value is the total length + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + size_type n = 1; + if (7 < dimension_type::rank) { + s[7] = n; + n *= m_dim.N7; + } + if (6 < dimension_type::rank) { + s[6] = n; + n *= m_dim.N6; + } + if (5 < dimension_type::rank) { + s[5] = n; + n *= m_dim.N5; + } + if (4 < dimension_type::rank) { + s[4] = n; + n *= m_dim.N4; + } + if (3 < dimension_type::rank) { + s[3] = n; + n *= m_dim.N3; + } + if (2 < dimension_type::rank) { + s[2] = n; + n *= m_dim.N2; + } + if (1 < dimension_type::rank) { + s[1] = n; + } + if (0 < dimension_type::rank) { + s[0] = m_stride; + } + s[dimension_type::rank] = m_stride * m_dim.N0; + } + + //---------------------------------------- + + private: + template <unsigned TrivialScalarSize> + struct Padding { + enum { + div = TrivialScalarSize == 0 + ? 
0 + : Kokkos::Impl::MEMORY_ALIGNMENT / + (TrivialScalarSize ? TrivialScalarSize : 1) + }; + enum { + mod = TrivialScalarSize == 0 + ? 0 + : Kokkos::Impl::MEMORY_ALIGNMENT % + (TrivialScalarSize ? TrivialScalarSize : 1) + }; + + // If memory alignment is a multiple of the trivial scalar size then attempt + // to align. + enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 }; + enum { + div_ok = (div != 0) ? div : 1 + }; // To valid modulo zero in constexpr + + KOKKOS_INLINE_FUNCTION + static constexpr size_t stride(size_t const N) { + return ((align != 0) && + ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && + ((N % div_ok) != 0)) + ? N + align - (N % div_ok) + : N; + } + }; + + public: + // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions + // correct and errors out during compilation. Same for the other places where + // I changed this. + +#ifdef KOKKOS_IMPL_WINDOWS_CUDA + KOKKOS_FUNCTION ViewOffset() : m_dim(dimension_type()), m_stride(0) {} + KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { + m_dim = src.m_dim; + m_stride = src.m_stride; + } + KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { + m_dim = src.m_dim; + m_stride = src.m_stride; + return *this; + } +#else + + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; + ViewOffset& operator=(const ViewOffset&) = default; +#endif + + /* Enable padding for trivial scalar types with non-zero trivial scalar size. + */ + template <unsigned TrivialScalarSize> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + std::integral_constant<unsigned, TrivialScalarSize> const&, + Kokkos::LayoutRight const& arg_layout) + : m_dim(arg_layout.dimension[0], arg_layout.dimension[1], + arg_layout.dimension[2], arg_layout.dimension[3], + arg_layout.dimension[4], arg_layout.dimension[5], + arg_layout.dimension[6], arg_layout.dimension[7]), + m_stride( + Padding<TrivialScalarSize>:: + stride(/* 2 <= rank */ + m_dim.N1 * + (dimension_type::rank == 2 + ? 
1 + : m_dim.N2 * + (dimension_type::rank == 3 + ? 1 + : m_dim.N3 * + (dimension_type::rank == 4 + ? 1 + : m_dim.N4 * + (dimension_type::rank == + 5 + ? 1 + : m_dim.N5 * + (dimension_type:: + rank == + 6 + ? 1 + : m_dim.N6 * + (dimension_type:: + rank == + 7 + ? 1 + : m_dim + .N7)))))))) { + } + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs) + : m_dim(rhs.m_dim.N0, rhs.m_dim.N1, rhs.m_dim.N2, rhs.m_dim.N3, + rhs.m_dim.N4, rhs.m_dim.N5, rhs.m_dim.N6, rhs.m_dim.N7), + m_stride(rhs.stride_0()) { + static_assert(int(DimRHS::rank) == int(dimension_type::rank), + "ViewOffset assignment requires equal rank"); + // Also requires equal static dimensions ... + } + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutStride, void>& rhs) + : m_dim(rhs.m_dim.N0, rhs.m_dim.N1, rhs.m_dim.N2, rhs.m_dim.N3, + rhs.m_dim.N4, rhs.m_dim.N5, rhs.m_dim.N6, rhs.m_dim.N7), + m_stride(rhs.stride_0()) { + if (((dimension_type::rank == 2) + ? rhs.m_stride.S1 + : ((dimension_type::rank == 3) + ? rhs.m_stride.S2 + : ((dimension_type::rank == 4) + ? rhs.m_stride.S3 + : ((dimension_type::rank == 5) + ? rhs.m_stride.S4 + : ((dimension_type::rank == 6) + ? rhs.m_stride.S5 + : ((dimension_type::rank == 7) + ? 
rhs.m_stride.S6 + : rhs.m_stride.S7)))))) != 1) { + Kokkos::abort( + "Kokkos::Impl::ViewOffset assignment of LayoutRight from " + "LayoutStride requires right-most stride == 1"); + } + } + + //---------------------------------------- + // Subview construction + // Last dimension must be non-zero + + template <class DimRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs, + const SubviewExtents<DimRHS::rank, dimension_type::rank>& sub) + : m_dim(sub.range_extent(0), sub.range_extent(1), sub.range_extent(2), + sub.range_extent(3), sub.range_extent(4), sub.range_extent(5), + sub.range_extent(6), sub.range_extent(7)), + m_stride( + 0 == sub.range_index(0) + ? rhs.stride_0() + : (1 == sub.range_index(0) + ? rhs.stride_1() + : (2 == sub.range_index(0) + ? rhs.stride_2() + : (3 == sub.range_index(0) + ? rhs.stride_3() + : (4 == sub.range_index(0) + ? rhs.stride_4() + : (5 == sub.range_index(0) + ? rhs.stride_5() + : (6 == sub.range_index(0) + ? rhs.stride_6() + : 0))))))) { + /* // This subview must be 2 == rank and 2 == rank_dynamic + // due to only having stride #0. + // The source dimension #0 must be non-zero for stride-one leading + dimension. + // At most subsequent dimension can be non-zero. 
+ + static_assert( (( 2 == dimension_type::rank ) && + ( 2 <= DimRHS::rank )) || + () + , "ViewOffset subview construction requires compatible + rank" ); + */ + } +}; + +//---------------------------------------------------------------------------- +/* Strided array layout only makes sense for 0 < rank */ +/* rank = 0 included for DynRankView case */ + +template <unsigned Rank> +struct ViewStride; + +template <> +struct ViewStride<0> { + enum { S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + + ViewStride() = default; + ViewStride(const ViewStride&) = default; + ViewStride& operator=(const ViewStride&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride(size_t, size_t, size_t, size_t, size_t, size_t, size_t, + size_t) {} +}; + +template <> +struct ViewStride<1> { + size_t S0; + enum { S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + + ViewStride() = default; + ViewStride(const ViewStride&) = default; + ViewStride& operator=(const ViewStride&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride(size_t aS0, size_t, size_t, size_t, size_t, size_t, + size_t, size_t) + : S0(aS0) {} +}; + +template <> +struct ViewStride<2> { + size_t S0, S1; + enum { S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + + ViewStride() = default; + ViewStride(const ViewStride&) = default; + ViewStride& operator=(const ViewStride&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride(size_t aS0, size_t aS1, size_t, size_t, size_t, size_t, + size_t, size_t) + : S0(aS0), S1(aS1) {} +}; + +template <> +struct ViewStride<3> { + size_t S0, S1, S2; + enum { S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + + ViewStride() = default; + ViewStride(const ViewStride&) = default; + ViewStride& operator=(const ViewStride&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t, size_t, + size_t, size_t, size_t) + : S0(aS0), S1(aS1), S2(aS2) {} +}; + +template <> +struct ViewStride<4> { + size_t S0, S1, S2, 
S3; + enum { S4 = 0, S5 = 0, S6 = 0, S7 = 0 }; + + ViewStride() = default; + ViewStride(const ViewStride&) = default; + ViewStride& operator=(const ViewStride&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, size_t, + size_t, size_t, size_t) + : S0(aS0), S1(aS1), S2(aS2), S3(aS3) {} +}; + +template <> +struct ViewStride<5> { + size_t S0, S1, S2, S3, S4; + enum { S5 = 0, S6 = 0, S7 = 0 }; + + ViewStride() = default; + ViewStride(const ViewStride&) = default; + ViewStride& operator=(const ViewStride&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, + size_t aS4, size_t, size_t, size_t) + : S0(aS0), S1(aS1), S2(aS2), S3(aS3), S4(aS4) {} +}; + +template <> +struct ViewStride<6> { + size_t S0, S1, S2, S3, S4, S5; + enum { S6 = 0, S7 = 0 }; + + ViewStride() = default; + ViewStride(const ViewStride&) = default; + ViewStride& operator=(const ViewStride&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, + size_t aS4, size_t aS5, size_t, size_t) + : S0(aS0), S1(aS1), S2(aS2), S3(aS3), S4(aS4), S5(aS5) {} +}; + +template <> +struct ViewStride<7> { + size_t S0, S1, S2, S3, S4, S5, S6; + enum { S7 = 0 }; + + ViewStride() = default; + ViewStride(const ViewStride&) = default; + ViewStride& operator=(const ViewStride&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, + size_t aS4, size_t aS5, size_t aS6, size_t) + : S0(aS0), S1(aS1), S2(aS2), S3(aS3), S4(aS4), S5(aS5), S6(aS6) {} +}; + +template <> +struct ViewStride<8> { + size_t S0, S1, S2, S3, S4, S5, S6, S7; + + ViewStride() = default; + ViewStride(const ViewStride&) = default; + ViewStride& operator=(const ViewStride&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, + size_t aS4, size_t aS5, size_t aS6, size_t aS7) + : S0(aS0), 
+ S1(aS1), + S2(aS2), + S3(aS3), + S4(aS4), + S5(aS5), + S6(aS6), + S7(aS7) {} +}; + +template <class Dimension> +struct ViewOffset<Dimension, Kokkos::LayoutStride, void> { + private: + using stride_type = ViewStride<Dimension::rank>; + + public: + using is_mapping_plugin = std::true_type; + using is_regular = std::true_type; + + using size_type = size_t; + using dimension_type = Dimension; + using array_layout = Kokkos::LayoutStride; + + dimension_type m_dim; + stride_type m_stride; + + //---------------------------------------- + + // rank 1 + template <typename I0> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0) const { + return i0 * m_stride.S0; + } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1) const { + return i0 * m_stride.S0 + i1 * m_stride.S1; + } + + // rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2) const { + return i0 * m_stride.S0 + i1 * m_stride.S1 + i2 * m_stride.S2; + } + + // rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3) const { + return i0 * m_stride.S0 + i1 * m_stride.S1 + i2 * m_stride.S2 + + i3 * m_stride.S3; + } + + // rank 5 + template <typename I0, typename I1, typename I2, typename I3, typename I4> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()(I0 const& i0, + I1 const& i1, + I2 const& i2, + I3 const& i3, + I4 const& i4) const { + return i0 * m_stride.S0 + i1 * m_stride.S1 + i2 * m_stride.S2 + + i3 * m_stride.S3 + i4 * m_stride.S4; + } + + // rank 6 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 
const& i5) const { + return i0 * m_stride.S0 + i1 * m_stride.S1 + i2 * m_stride.S2 + + i3 * m_stride.S3 + i4 * m_stride.S4 + i5 * m_stride.S5; + } + + // rank 7 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6) const { + return i0 * m_stride.S0 + i1 * m_stride.S1 + i2 * m_stride.S2 + + i3 * m_stride.S3 + i4 * m_stride.S4 + i5 * m_stride.S5 + + i6 * m_stride.S6; + } + + // rank 8 + template <typename I0, typename I1, typename I2, typename I3, typename I4, + typename I5, typename I6, typename I7> + KOKKOS_INLINE_FUNCTION constexpr size_type operator()( + I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, + I5 const& i5, I6 const& i6, I7 const& i7) const { + return i0 * m_stride.S0 + i1 * m_stride.S1 + i2 * m_stride.S2 + + i3 * m_stride.S3 + i4 * m_stride.S4 + i5 * m_stride.S5 + + i6 * m_stride.S6 + i7 * m_stride.S7; + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + constexpr array_layout layout() const { + return array_layout(m_dim.N0, m_stride.S0, m_dim.N1, m_stride.S1, m_dim.N2, + m_stride.S2, m_dim.N3, m_stride.S3, m_dim.N4, + m_stride.S4, m_dim.N5, m_stride.S5, m_dim.N6, + m_stride.S6, m_dim.N7, m_stride.S7); + } + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { + return m_dim.N0; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { + return m_dim.N1; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { + return m_dim.N2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { + return m_dim.N3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { + return m_dim.N4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { + return m_dim.N5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() 
const { + return m_dim.N6; + } + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { + return m_dim.N7; + } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const { + return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * + m_dim.N6 * m_dim.N7; + } + + private: + KOKKOS_INLINE_FUNCTION + static constexpr size_type Max(size_type lhs, size_type rhs) { + return lhs < rhs ? rhs : lhs; + } + + public: + /* Span of the range space, largest stride * dimension */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const { + return size() == size_type(0) + ? size_type(0) + : Max(m_dim.N0 * m_stride.S0, + Max(m_dim.N1 * m_stride.S1, + Max(m_dim.N2 * m_stride.S2, + Max(m_dim.N3 * m_stride.S3, + Max(m_dim.N4 * m_stride.S4, + Max(m_dim.N5 * m_stride.S5, + Max(m_dim.N6 * m_stride.S6, + m_dim.N7 * m_stride.S7))))))); + } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { + return span() == size(); + } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { + return m_stride.S0; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { + return m_stride.S1; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { + return m_stride.S2; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { + return m_stride.S3; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { + return m_stride.S4; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { + return m_stride.S5; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { + return m_stride.S6; + } + KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { + return m_stride.S7; + } + + // Stride with [ rank ] value is the total length + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + if (0 < dimension_type::rank) { + s[0] = m_stride.S0; + } + if (1 < dimension_type::rank) { + 
s[1] = m_stride.S1; + } + if (2 < dimension_type::rank) { + s[2] = m_stride.S2; + } + if (3 < dimension_type::rank) { + s[3] = m_stride.S3; + } + if (4 < dimension_type::rank) { + s[4] = m_stride.S4; + } + if (5 < dimension_type::rank) { + s[5] = m_stride.S5; + } + if (6 < dimension_type::rank) { + s[6] = m_stride.S6; + } + if (7 < dimension_type::rank) { + s[7] = m_stride.S7; + } + s[dimension_type::rank] = span(); + } + + //---------------------------------------- + // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions + // correct and errors out during compilation. Same for the other places where + // I changed this. + +#ifdef KOKKOS_IMPL_WINDOWS_CUDA + KOKKOS_FUNCTION ViewOffset() + : m_dim(dimension_type()), m_stride(stride_type()) {} + KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { + m_dim = src.m_dim; + m_stride = src.m_stride; + } + KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { + m_dim = src.m_dim; + m_stride = src.m_stride; + return *this; + } +#else + + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; + ViewOffset& operator=(const ViewOffset&) = default; +#endif + + KOKKOS_INLINE_FUNCTION + constexpr ViewOffset(std::integral_constant<unsigned, 0> const&, + Kokkos::LayoutStride const& rhs) + : m_dim(rhs.dimension[0], rhs.dimension[1], rhs.dimension[2], + rhs.dimension[3], rhs.dimension[4], rhs.dimension[5], + rhs.dimension[6], rhs.dimension[7]), + m_stride(rhs.stride[0], rhs.stride[1], rhs.stride[2], rhs.stride[3], + rhs.stride[4], rhs.stride[5], rhs.stride[6], rhs.stride[7]) {} + + template <class DimRHS, class LayoutRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, LayoutRHS, void>& rhs) + : m_dim(rhs.m_dim.N0, rhs.m_dim.N1, rhs.m_dim.N2, rhs.m_dim.N3, + rhs.m_dim.N4, rhs.m_dim.N5, rhs.m_dim.N6, rhs.m_dim.N7), + m_stride(rhs.stride_0(), rhs.stride_1(), rhs.stride_2(), rhs.stride_3(), + rhs.stride_4(), rhs.stride_5(), rhs.stride_6(), + rhs.stride_7()) { + 
static_assert(int(DimRHS::rank) == int(dimension_type::rank), + "ViewOffset assignment requires equal rank"); + // Also requires equal static dimensions ... + } + + //---------------------------------------- + // Subview construction + + private: + template <class DimRHS, class LayoutRHS> + KOKKOS_INLINE_FUNCTION static constexpr size_t stride( + unsigned r, const ViewOffset<DimRHS, LayoutRHS, void>& rhs) { + return r > 7 + ? 0 + : (r == 0 + ? rhs.stride_0() + : (r == 1 + ? rhs.stride_1() + : (r == 2 + ? rhs.stride_2() + : (r == 3 + ? rhs.stride_3() + : (r == 4 + ? rhs.stride_4() + : (r == 5 + ? rhs.stride_5() + : (r == 6 + ? rhs.stride_6() + : rhs.stride_7()))))))); + } + + public: + template <class DimRHS, class LayoutRHS> + KOKKOS_INLINE_FUNCTION constexpr ViewOffset( + const ViewOffset<DimRHS, LayoutRHS, void>& rhs, + const SubviewExtents<DimRHS::rank, dimension_type::rank>& sub) + // range_extent(r) returns 0 when dimension_type::rank <= r + : m_dim(sub.range_extent(0), sub.range_extent(1), sub.range_extent(2), + sub.range_extent(3), sub.range_extent(4), sub.range_extent(5), + sub.range_extent(6), sub.range_extent(7)) + // range_index(r) returns ~0u when dimension_type::rank <= r + , + m_stride( + stride(sub.range_index(0), rhs), stride(sub.range_index(1), rhs), + stride(sub.range_index(2), rhs), stride(sub.range_index(3), rhs), + stride(sub.range_index(4), rhs), stride(sub.range_index(5), rhs), + stride(sub.range_index(6), rhs), stride(sub.range_index(7), rhs)) {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief ViewDataHandle provides the type of the 'data handle' which the view + * uses to access data with the [] operator. It also provides + * an allocate function and a function to extract a raw ptr from the + * data handle. 
ViewDataHandle also defines an enum ReferenceAble which
+ *          specifies whether references/pointers to elements can be taken and a
+ *          'return_type' which is what the view operators will give back.
+ *          Specialisation of this object allows three things depending
+ *          on ViewTraits and compiler options:
+ *          (i)  Use special allocator (e.g. huge pages/small pages and pinned
+ *          memory) (ii)  Use special data handle type (e.g. add Cuda Texture Object)
+ *          (iii) Use special access intrinsics (e.g. texture fetch and
+ *          non-caching loads)
+ */
+template <class Traits, class Enable = void>
+struct ViewDataHandle {
+  using value_type  = typename Traits::value_type;
+  using handle_type = typename Traits::value_type*;
+  using return_type = typename Traits::value_type&;
+  using track_type  = Kokkos::Impl::SharedAllocationTracker;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign(value_type* arg_data_ptr,
+                            track_type const& /*arg_tracker*/) {
+    return handle_type(arg_data_ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign(handle_type const arg_data_ptr, size_t offset) {
+    return handle_type(arg_data_ptr + offset);
+  }
+};
+
+// Specialization: atomic element access (memory_traits::is_atomic).
+template <class Traits>
+struct ViewDataHandle<
+    Traits, typename std::enable_if<(
+                std::is_same<typename Traits::non_const_value_type,
+                             typename Traits::value_type>::value &&
+                std::is_same<typename Traits::specialize, void>::value &&
+                Traits::memory_traits::is_atomic)>::type> {
+  using value_type  = typename Traits::value_type;
+  using handle_type = typename Kokkos::Impl::AtomicViewDataHandle<Traits>;
+  using return_type = typename Kokkos::Impl::AtomicDataElement<Traits>;
+  using track_type  = Kokkos::Impl::SharedAllocationTracker;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign(value_type* arg_data_ptr,
+                            track_type const& /*arg_tracker*/) {
+    return handle_type(arg_data_ptr);
+  }
+
+  template <class SrcHandleType>
+  KOKKOS_INLINE_FUNCTION static handle_type assign(
+      const SrcHandleType& arg_handle, size_t offset) {
+    return handle_type(arg_handle + offset);
+  }
+};
+
+// Specialization: restrict-qualified, not aligned, not atomic (and, under
+// CUDA, not a Cuda/CudaUVM memory space).
+template <class Traits>
+struct ViewDataHandle<
+    Traits, typename std::enable_if<(
+                std::is_same<typename Traits::specialize, void>::value &&
+                (!Traits::memory_traits::is_aligned) &&
+                Traits::memory_traits::is_restrict
+#ifdef KOKKOS_ENABLE_CUDA
+                && (!(std::is_same<typename Traits::memory_space,
+                                   Kokkos::CudaSpace>::value ||
+                      std::is_same<typename Traits::memory_space,
+                                   Kokkos::CudaUVMSpace>::value))
+#endif
+                && (!Traits::memory_traits::is_atomic))>::type> {
+  using value_type  = typename Traits::value_type;
+  using handle_type = typename Traits::value_type* KOKKOS_RESTRICT;
+  using return_type = typename Traits::value_type& KOKKOS_RESTRICT;
+  using track_type  = Kokkos::Impl::SharedAllocationTracker;
+
+  KOKKOS_INLINE_FUNCTION
+  static value_type* assign(value_type* arg_data_ptr,
+                            track_type const& /*arg_tracker*/) {
+    return (value_type*)(arg_data_ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static value_type* assign(handle_type const arg_data_ptr, size_t offset) {
+    return (value_type*)(arg_data_ptr + offset);
+  }
+};
+
+// Specialization: aligned, not restrict, not atomic.  Both assign overloads
+// verify alignment of the incoming pointer and abort on mismatch.
+template <class Traits>
+struct ViewDataHandle<
+    Traits, typename std::enable_if<(
+                std::is_same<typename Traits::specialize, void>::value &&
+                Traits::memory_traits::is_aligned &&
+                (!Traits::memory_traits::is_restrict)
+#ifdef KOKKOS_ENABLE_CUDA
+                && (!(std::is_same<typename Traits::memory_space,
+                                   Kokkos::CudaSpace>::value ||
+                      std::is_same<typename Traits::memory_space,
+                                   Kokkos::CudaUVMSpace>::value))
+#endif
+                && (!Traits::memory_traits::is_atomic))>::type> {
+  using value_type = typename Traits::value_type;
+  // typedef work-around for intel compilers error #3186: expected typedef
+  // declaration
+  // NOLINTNEXTLINE(modernize-use-using)
+  typedef value_type* KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT)
+      handle_type;
+  using return_type = typename Traits::value_type&;
+  using track_type  = Kokkos::Impl::SharedAllocationTracker;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign(value_type* arg_data_ptr,
+                            track_type const& /*arg_tracker*/) {
+    if (reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT) {
+      Kokkos::abort(
+          "Assigning NonAligned View or Pointer to Kokkos::View with Aligned "
+          "attribute");
+    }
+    return handle_type(arg_data_ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign(handle_type const arg_data_ptr, size_t offset) {
+    if (reinterpret_cast<uintptr_t>(arg_data_ptr + offset) %
+        Impl::MEMORY_ALIGNMENT) {
+      Kokkos::abort(
+          "Assigning NonAligned View or Pointer to Kokkos::View with Aligned "
+          "attribute");
+    }
+    return handle_type(arg_data_ptr + offset);
+  }
+};
+
+// Specialization: aligned AND restrict (not atomic).
+template <class Traits>
+struct ViewDataHandle<
+    Traits,
+    typename std::enable_if<(
+        std::is_same<typename Traits::specialize, void>::value &&
+        Traits::memory_traits::is_aligned && Traits::memory_traits::is_restrict
+#ifdef KOKKOS_ENABLE_CUDA
+        && (!(std::is_same<typename Traits::memory_space,
+              Kokkos::CudaSpace>::value ||
+              std::is_same<typename Traits::memory_space,
+              Kokkos::CudaUVMSpace>::value))
+#endif
+        && (!Traits::memory_traits::is_atomic))>::type> {
+  using value_type = typename Traits::value_type;
+  // typedef work-around for intel compilers error #3186: expected typedef
+  // declaration
+  // NOLINTNEXTLINE(modernize-use-using)
+  typedef value_type* KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT)
+      handle_type;
+  using return_type = typename Traits::value_type& KOKKOS_RESTRICT;
+  using track_type  = Kokkos::Impl::SharedAllocationTracker;
+
+  KOKKOS_INLINE_FUNCTION
+  static value_type* assign(value_type* arg_data_ptr,
+                            track_type const& /*arg_tracker*/) {
+    if (reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT) {
+      Kokkos::abort(
+          "Assigning NonAligned View or Pointer to Kokkos::View with Aligned "
+          "attribute");
+    }
+    return (value_type*)(arg_data_ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static value_type* assign(handle_type const arg_data_ptr, size_t offset) {
+    if (reinterpret_cast<uintptr_t>(arg_data_ptr + offset) %
+        Impl::MEMORY_ALIGNMENT) {
+      Kokkos::abort(
+          "Assigning NonAligned View or Pointer to Kokkos::View with Aligned "
+          "attribute");
+    }
+    return (value_type*)(arg_data_ptr + offset);
+  }
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+/*
+ *  The construction, assignment to default, and destruction
+ *  are merged into a single functor.
+ *  Primarily to work around an unresolved CUDA back-end bug
+ *  that would lose the destruction cuda device function when
+ *  called from the shared memory tracking destruction.
+ *  Secondarily to have two fewer partial specializations.
+ */
+template <class ExecSpace, class ValueType,
+          bool IsScalar = std::is_scalar<ValueType>::value>
+struct ViewValueFunctor;
+
+// Non-scalar case: runs placement-new construction or explicit destruction
+// per element, selected by the 'destroy' flag.
+template <class ExecSpace, class ValueType>
+struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
+  using PolicyType = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>>;
+  using Exec       = typename ExecSpace::execution_space;
+
+  Exec space;
+  ValueType* ptr;
+  size_t n;
+  bool destroy;
+  std::string name;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const {
+    if (destroy) {
+      (ptr + i)->~ValueType();
+    }  // KOKKOS_IMPL_CUDA_CLANG_WORKAROUND this line causes ptax error
+       // __cxa_begin_catch in nested_view unit-test
+    else {
+      new (ptr + i) ValueType();
+    }
+  }
+
+  ViewValueFunctor()                        = default;
+  ViewValueFunctor(const ViewValueFunctor&) = default;
+  ViewValueFunctor& operator=(const ViewValueFunctor&) = default;
+
+  ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr,
+                   size_t const arg_n, std::string arg_name)
+      : space(arg_space),
+        ptr(arg_ptr),
+        n(arg_n),
+        destroy(false),
+
name(std::move(arg_name)) {}
+
+  // Runs either destruction (arg == true) or construction (arg == false)
+  // over [0, n) — in parallel with profiling hooks and an optional CUDA
+  // prefetch when not already inside a parallel region, serially otherwise.
+  void execute(bool arg) {
+    destroy = arg;
+    PolicyType policy(0, n);
+    std::string functor_name;
+    if (!space.in_parallel()) {
+      uint64_t kpID = 0;
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        functor_name =
+            (destroy ? "Kokkos::View::destruction [" + name + "]"
+                     : "Kokkos::View::initialization [" + name + "]");
+        Kokkos::Tools::Impl::begin_parallel_for(policy, *this, functor_name,
+                                                kpID);
+      }
+#ifdef KOKKOS_ENABLE_CUDA
+      if (std::is_same<ExecSpace, Kokkos::Cuda>::value) {
+        Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n,
+                                            true);
+      }
+#endif
+      const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
+          *this, policy);
+      closure.execute();
+      space.fence();
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Tools::Impl::end_parallel_for(policy, *this, functor_name,
+                                              kpID);
+      }
+    } else {
+      for (size_t i = 0; i < n; ++i) operator()(i);
+    }
+  }
+
+  void construct_shared_allocation() { execute(false); }
+
+  void destroy_shared_allocation() { execute(true); }
+};
+
+// Scalar case: elements are value-initialized; no destruction is required.
+template <class ExecSpace, class ValueType>
+struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
+  using PolicyType = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>>;
+
+  ExecSpace space;
+  ValueType* ptr;
+  size_t n;
+  std::string name;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t i) const { ptr[i] = ValueType(); }
+
+  ViewValueFunctor()                        = default;
+  ViewValueFunctor(const ViewValueFunctor&) = default;
+  ViewValueFunctor& operator=(const ViewValueFunctor&) = default;
+
+  ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr,
+                   size_t const arg_n, std::string arg_name)
+      : space(arg_space), ptr(arg_ptr), n(arg_n), name(std::move(arg_name)) {}
+
+  void construct_shared_allocation() {
+    if (!space.in_parallel()) {
+      uint64_t kpID = 0;
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Profiling::beginParallelFor(
+            "Kokkos::View::initialization [" + name + "]", 0, &kpID);
+      }
+#ifdef KOKKOS_ENABLE_CUDA
+      if (std::is_same<ExecSpace, Kokkos::Cuda>::value) {
+        Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n,
+                                            true);
+      }
+#endif
+      const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
+          *this, PolicyType(0, n));
+      closure.execute();
+      space.fence();
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Profiling::endParallelFor(kpID);
+      }
+    } else {
+      for (size_t i = 0; i < n; ++i) operator()(i);
+    }
+  }
+
+  void destroy_shared_allocation() {}
+};
+
+//----------------------------------------------------------------------------
+/** \brief  View mapping for non-specialized data type and standard layout */
+template <class Traits>
+class ViewMapping<
+    Traits,
+    typename std::enable_if<(
+        std::is_same<typename Traits::specialize, void>::value &&
+        ViewOffset<typename Traits::dimension, typename Traits::array_layout,
+                   void>::is_mapping_plugin::value)>::type> {
+ public:
+  using offset_type = ViewOffset<typename Traits::dimension,
+                                 typename Traits::array_layout, void>;
+
+  using handle_type = typename ViewDataHandle<Traits>::handle_type;
+
+  handle_type m_impl_handle;
+  offset_type m_impl_offset;
+
+ private:
+  template <class, class...>
+  friend class ViewMapping;
+
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping(const handle_type& arg_handle, const offset_type& arg_offset)
+      : m_impl_handle(arg_handle), m_impl_offset(arg_offset) {}
+
+ public:
+  using printable_label_typedef = void;
+  enum { is_managed = Traits::is_managed };
+
+  //----------------------------------------
+  // Domain dimensions
+
+  enum { Rank = Traits::dimension::rank };
+
+  template <typename iType>
+  KOKKOS_INLINE_FUNCTION constexpr size_t extent(const iType& r) const {
+    return m_impl_offset.m_dim.extent(r);
+  }
+
+  static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent(
+      const unsigned r) noexcept {
+    using dim_type = typename offset_type::dimension_type;
+    return dim_type::static_extent(r);
+  }
+
+
KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout()
+      const {
+    return m_impl_offset.layout();
+  }
+
+  // Per-rank extent accessors; all forward to the offset object.
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const {
+    return m_impl_offset.dimension_0();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const {
+    return m_impl_offset.dimension_1();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const {
+    return m_impl_offset.dimension_2();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const {
+    return m_impl_offset.dimension_3();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const {
+    return m_impl_offset.dimension_4();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const {
+    return m_impl_offset.dimension_5();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const {
+    return m_impl_offset.dimension_6();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const {
+    return m_impl_offset.dimension_7();
+  }
+
+  // Is a regular layout with uniform striding for each index.
+  using is_regular = typename offset_type::is_regular;
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const {
+    return m_impl_offset.stride_0();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const {
+    return m_impl_offset.stride_1();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const {
+    return m_impl_offset.stride_2();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const {
+    return m_impl_offset.stride_3();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const {
+    return m_impl_offset.stride_4();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const {
+    return m_impl_offset.stride_5();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const {
+    return m_impl_offset.stride_6();
+  }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const {
+    return m_impl_offset.stride_7();
+  }
+
+  template <typename iType>
+  KOKKOS_INLINE_FUNCTION void stride(iType* const s) const {
+    m_impl_offset.stride(s);
+  }
+
+  //----------------------------------------
+  // Range span
+
+  /** \brief  Span of the mapped range */
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const {
+    return m_impl_offset.span();
+  }
+
+  /** \brief  Is the mapped range span contiguous */
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const {
+    return m_impl_offset.span_is_contiguous();
+  }
+
+  using reference_type = typename ViewDataHandle<Traits>::return_type;
+  using pointer_type   = typename Traits::value_type*;
+
+  /** \brief  Query raw pointer to memory */
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const {
+    return m_impl_handle;
+  }
+
+  //----------------------------------------
+  // The View class performs all rank and bounds checking before
+  // calling these element reference methods.
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference() const { return m_impl_handle[0]; }
+
+  template <typename I0>
+  KOKKOS_FORCEINLINE_FUNCTION
+      typename std::enable_if<(std::is_integral<I0>::value &&
+                               // if layout is neither stride nor irregular,
+                               // then just use the handle directly
+                               !(std::is_same<typename Traits::array_layout,
+                                              Kokkos::LayoutStride>::value ||
+                                 !is_regular::value)),
+                              reference_type>::type
+      reference(const I0& i0) const {
+    return m_impl_handle[i0];
+  }
+
+  template <typename I0>
+  KOKKOS_FORCEINLINE_FUNCTION
+      typename std::enable_if<(std::is_integral<I0>::value &&
+                               // if the layout is strided or irregular, then
+                               // we have to use the offset
+                               (std::is_same<typename Traits::array_layout,
+                                             Kokkos::LayoutStride>::value ||
+                                !is_regular::value)),
+                              reference_type>::type
+      reference(const I0& i0) const {
+    return m_impl_handle[m_impl_offset(i0)];
+  }
+
+  template <typename I0, typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0& i0,
+                                                       const I1& i1) const {
+    return m_impl_handle[m_impl_offset(i0, i1)];
+  }
+
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0& i0,
+                                                       const I1& i1,
+                                                       const I2& i2) const {
+    return m_impl_handle[m_impl_offset(i0, i1, i2)];
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type
+  reference(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const {
+    return m_impl_handle[m_impl_offset(i0, i1, i2, i3)];
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3, typename I4>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type reference(const I0& i0,
+                                                       const I1& i1,
+                                                       const I2& i2,
+                                                       const I3& i3,
+                                                       const I4& i4) const {
+    return m_impl_handle[m_impl_offset(i0, i1, i2, i3, i4)];
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3, typename I4,
+            typename I5>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type
+  reference(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
+            const I4& i4, const I5& i5) const {
+    return m_impl_handle[m_impl_offset(i0, i1, i2, i3, i4, i5)];
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3, typename I4,
+            typename I5, typename I6>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type
+  reference(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
+            const I4& i4, const I5& i5, const I6& i6) const {
+    return m_impl_handle[m_impl_offset(i0, i1, i2, i3, i4, i5, i6)];
+  }
+
+  template <typename I0, typename I1, typename I2, typename I3, typename I4,
+            typename I5, typename I6, typename I7>
+  KOKKOS_FORCEINLINE_FUNCTION reference_type
+  reference(const I0& i0, const I1& i1, const I2& i2, const I3& i3,
+            const I4& i4, const I5& i5, const I6& i6, const I7& i7) const {
+    return m_impl_handle[m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)];
+  }
+
+  //----------------------------------------
+
+ private:
+  enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ };
+  enum { MemorySpanSize = sizeof(typename Traits::value_type) };
+
+ public:
+  /** \brief  Span, in bytes, of the referenced memory */
+  KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const {
+    return (m_impl_offset.span() * sizeof(typename Traits::value_type) +
+            MemorySpanMask) &
+           ~size_t(MemorySpanMask);
+  }
+
+  //----------------------------------------
+
+  KOKKOS_DEFAULTED_FUNCTION ~ViewMapping() = default;
+  KOKKOS_INLINE_FUNCTION ViewMapping() : m_impl_handle(), m_impl_offset() {}
+
+  KOKKOS_DEFAULTED_FUNCTION ViewMapping(const ViewMapping&) = default;
+  KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(const ViewMapping&) =
+      default;
+
+  KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default;
+  KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(ViewMapping&&) = default;
+
+  //----------------------------------------
+
+  /**\brief  Span, in bytes, of the required memory */
+  KOKKOS_INLINE_FUNCTION
+  static constexpr size_t memory_span(
+      typename Traits::array_layout const& arg_layout) {
+    using padding = std::integral_constant<unsigned int, 0>;
+    return (offset_type(padding(), arg_layout).span() * MemorySpanSize +
+            MemorySpanMask) &
+           ~size_t(MemorySpanMask);
+  }
+
+  /**\brief  Wrap a span of memory */
+  template <class... P>
+  KOKKOS_INLINE_FUNCTION ViewMapping(
+      Kokkos::Impl::ViewCtorProp<P...> const& arg_prop,
+      typename Traits::array_layout const& arg_layout)
+      : m_impl_handle(
+            ((Kokkos::Impl::ViewCtorProp<void, pointer_type> const&)arg_prop)
+                .value),
+        m_impl_offset(std::integral_constant<unsigned, 0>(), arg_layout) {}
+
+  /**\brief  Assign data */
+  KOKKOS_INLINE_FUNCTION
+  void assign_data(pointer_type arg_ptr) {
+    m_impl_handle = handle_type(arg_ptr);
+  }
+
+  //----------------------------------------
+  /*  Allocate and construct mapped array.
+   *  Allocate via shared allocation record and
+   *  return that record for allocation tracking.
+   */
+  template <class... P>
+  Kokkos::Impl::SharedAllocationRecord<>* allocate_shared(
+      Kokkos::Impl::ViewCtorProp<P...> const& arg_prop,
+      typename Traits::array_layout const& arg_layout) {
+    using alloc_prop = Kokkos::Impl::ViewCtorProp<P...>;
+
+    using execution_space = typename alloc_prop::execution_space;
+    using memory_space    = typename Traits::memory_space;
+    using value_type      = typename Traits::value_type;
+    using functor_type    = ViewValueFunctor<execution_space, value_type>;
+    using record_type =
+        Kokkos::Impl::SharedAllocationRecord<memory_space, functor_type>;
+
+    // Query the mapping for byte-size of allocation.
+    // If padding is allowed then pass in sizeof value type
+    // for padding computation.
+    using padding = std::integral_constant<
+        unsigned int, alloc_prop::allow_padding ? sizeof(value_type) : 0>;
+
+    m_impl_offset = offset_type(padding(), arg_layout);
+
+    const size_t alloc_size =
+        (m_impl_offset.span() * MemorySpanSize + MemorySpanMask) &
+        ~size_t(MemorySpanMask);
+    const std::string& alloc_name =
+        static_cast<Kokkos::Impl::ViewCtorProp<void, std::string> const&>(
+            arg_prop)
+            .value;
+    // Create shared memory tracking record with allocate memory from the memory
+    // space
+    record_type* const record = record_type::allocate(
+        static_cast<Kokkos::Impl::ViewCtorProp<void, memory_space> const&>(
+            arg_prop)
+            .value,
+        alloc_name, alloc_size);
+
+    m_impl_handle = handle_type(reinterpret_cast<pointer_type>(record->data()));
+
+    // Only initialize if the allocation is non-zero.
+    // May be zero if one of the dimensions is zero.
+    if (alloc_size && alloc_prop::initialize) {
+      // Assume destruction is only required when construction is requested.
+      // The ViewValueFunctor has both value construction and destruction
+      // operators.
+      record->m_destroy = functor_type(
+          static_cast<Kokkos::Impl::ViewCtorProp<void, execution_space> const&>(
+              arg_prop)
+              .value,
+          (value_type*)m_impl_handle, m_impl_offset.span(), alloc_name);
+
+      // Construct values
+      record->m_destroy.construct_shared_allocation();
+    }
+
+    return record;
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/** \brief  Assign compatible default mappings */
+
+template <class DstTraits, class SrcTraits>
+class ViewMapping<
+    DstTraits, SrcTraits,
+    typename std::enable_if<(
+        !(std::is_same<typename SrcTraits::array_layout, LayoutStride>::
+              value) &&  // Added to have a new specialization for SrcType of
+                         // LayoutStride
+        // default mappings
+        std::is_same<typename DstTraits::specialize, void>::value &&
+        std::is_same<typename SrcTraits::specialize, void>::value &&
+        (
+            // same layout
+            std::is_same<typename DstTraits::array_layout,
+                         typename
SrcTraits::array_layout>::value ||
+            // known layout
+            ((std::is_same<typename DstTraits::array_layout,
+                           Kokkos::LayoutLeft>::value ||
+              std::is_same<typename DstTraits::array_layout,
+                           Kokkos::LayoutRight>::value ||
+              std::is_same<typename DstTraits::array_layout,
+                           Kokkos::LayoutStride>::value) &&
+             (std::is_same<typename SrcTraits::array_layout,
+                           Kokkos::LayoutLeft>::value ||
+              std::is_same<typename SrcTraits::array_layout,
+                           Kokkos::LayoutRight>::value ||
+              std::is_same<typename SrcTraits::array_layout,
+                           Kokkos::LayoutStride>::value))))>::type> {
+ private:
+  enum {
+    is_assignable_space =
+#if 1
+        Kokkos::Impl::MemorySpaceAccess<
+            typename DstTraits::memory_space,
+            typename SrcTraits::memory_space>::assignable
+  };
+#else
+        std::is_same<typename DstTraits::memory_space,
+                     typename SrcTraits::memory_space>::value
+  };
+#endif
+
+  enum {
+    is_assignable_value_type =
+        std::is_same<typename DstTraits::value_type,
+                     typename SrcTraits::value_type>::value ||
+        std::is_same<typename DstTraits::value_type,
+                     typename SrcTraits::const_value_type>::value
+  };
+
+  enum {
+    is_assignable_dimension =
+        ViewDimensionAssignable<typename DstTraits::dimension,
+                                typename SrcTraits::dimension>::value
+  };
+
+  enum {
+    is_assignable_layout =
+        std::is_same<typename DstTraits::array_layout,
+                     typename SrcTraits::array_layout>::value ||
+        std::is_same<typename DstTraits::array_layout,
+                     Kokkos::LayoutStride>::value ||
+        (DstTraits::dimension::rank == 0) ||
+        (DstTraits::dimension::rank == 1 &&
+         DstTraits::dimension::rank_dynamic == 1)
+  };
+
+ public:
+  enum {
+    is_assignable_data_type =
+        is_assignable_value_type && is_assignable_dimension
+  };
+  enum {
+    is_assignable = is_assignable_space && is_assignable_value_type &&
+                    is_assignable_dimension && is_assignable_layout
+  };
+
+  using TrackType = Kokkos::Impl::SharedAllocationTracker;
+  using DstType   = ViewMapping<DstTraits, void>;
+  using SrcType   = ViewMapping<SrcTraits, void>;
+
+  KOKKOS_INLINE_FUNCTION
+  static void assign(DstType& dst, const SrcType& src,
+                     const TrackType& src_track) {
+    static_assert(is_assignable_space,
+                  "View assignment must have compatible spaces");
+
+    static_assert(
+        is_assignable_value_type,
+        "View assignment must have same value type or const = non-const");
+
+    static_assert(is_assignable_dimension,
+                  "View assignment must have compatible dimensions");
+
+    static_assert(
+        is_assignable_layout,
+        "View assignment must have compatible layout or have rank <= 1");
+
+    using dst_offset_type = typename DstType::offset_type;
+
+    // When the destination fixes dimensions at compile time that the source
+    // carries at runtime, verify they match before assigning.
+    if (size_t(DstTraits::dimension::rank_dynamic) <
+        size_t(SrcTraits::dimension::rank_dynamic)) {
+      using dst_dim = typename DstTraits::dimension;
+      bool assignable = ((1 > DstTraits::dimension::rank_dynamic &&
+                          1 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN0 == src.dimension_0()
+                             : true) &&
+                        ((2 > DstTraits::dimension::rank_dynamic &&
+                          2 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN1 == src.dimension_1()
+                             : true) &&
+                        ((3 > DstTraits::dimension::rank_dynamic &&
+                          3 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN2 == src.dimension_2()
+                             : true) &&
+                        ((4 > DstTraits::dimension::rank_dynamic &&
+                          4 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN3 == src.dimension_3()
+                             : true) &&
+                        ((5 > DstTraits::dimension::rank_dynamic &&
+                          5 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN4 == src.dimension_4()
+                             : true) &&
+                        ((6 > DstTraits::dimension::rank_dynamic &&
+                          6 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN5 == src.dimension_5()
+                             : true) &&
+                        ((7 > DstTraits::dimension::rank_dynamic &&
+                          7 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN6 == src.dimension_6()
+                             : true) &&
+                        ((8 > DstTraits::dimension::rank_dynamic &&
+                          8 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN7 == src.dimension_7()
+                             : true);
+      if (!assignable)
+        Kokkos::abort(
+            "View Assignment: trying to assign runtime dimension to non "
+            "matching compile time dimension.");
+    }
+    dst.m_impl_offset = dst_offset_type(src.m_impl_offset);
+    dst.m_impl_handle = Kokkos::Impl::ViewDataHandle<DstTraits>::assign(
+        src.m_impl_handle, src_track);
+  }
+};
+
+//----------------------------------------------------------------------------
+// Create new specialization for SrcType of LayoutStride. Runtime check for
+// compatible layout
+template <class DstTraits, class SrcTraits>
+class ViewMapping<
+    DstTraits, SrcTraits,
+    typename std::enable_if<(
+        std::is_same<typename SrcTraits::array_layout,
+                     Kokkos::LayoutStride>::value &&
+        std::is_same<typename DstTraits::specialize, void>::value &&
+        std::is_same<typename SrcTraits::specialize, void>::value &&
+        (
+            // same layout
+            std::is_same<typename DstTraits::array_layout,
+                         typename SrcTraits::array_layout>::value ||
+            // known layout
+            (std::is_same<typename DstTraits::array_layout,
+                          Kokkos::LayoutLeft>::value ||
+             std::is_same<typename DstTraits::array_layout,
+                          Kokkos::LayoutRight>::value ||
+             std::is_same<typename DstTraits::array_layout,
+                          Kokkos::LayoutStride>::value)))>::type> {
+ private:
+  enum {
+    is_assignable_space = Kokkos::Impl::MemorySpaceAccess<
+        typename DstTraits::memory_space,
+        typename SrcTraits::memory_space>::assignable
+  };
+
+  enum {
+    is_assignable_value_type =
+        std::is_same<typename DstTraits::value_type,
+                     typename SrcTraits::value_type>::value ||
+        std::is_same<typename DstTraits::value_type,
+                     typename SrcTraits::const_value_type>::value
+  };
+
+  enum {
+    is_assignable_dimension =
+        ViewDimensionAssignable<typename DstTraits::dimension,
+                                typename SrcTraits::dimension>::value
+  };
+
+ public:
+  enum {
+    is_assignable_data_type =
+        is_assignable_value_type && is_assignable_dimension
+  };
+  enum {
+    is_assignable = is_assignable_space && is_assignable_value_type &&
+                    is_assignable_dimension
+  };
+
+  using TrackType = Kokkos::Impl::SharedAllocationTracker;
+  using DstType   = ViewMapping<DstTraits, void>;
+  using SrcType   = ViewMapping<SrcTraits, void>;
+
+  // Runtime check: a strided source is assignable to a LayoutLeft/LayoutRight
+  // destination only if its strides match that layout's expected strides.
+  KOKKOS_INLINE_FUNCTION
+  static bool assignable_layout_check(DstType&,
+                                      const SrcType& src)  // Runtime check
+  {
+    size_t strides[9];
+    bool assignable = true;
+    src.stride(strides);
+    size_t exp_stride = 1;
+    if (std::is_same<typename DstTraits::array_layout,
+                     Kokkos::LayoutLeft>::value) {
+      for (int i = 0; i < src.Rank; i++) {
+        if (i > 0) exp_stride *= src.extent(i - 1);
+        if (strides[i] != exp_stride) {
+          assignable = false;
+          break;
+        }
+      }
+    } else if (std::is_same<typename DstTraits::array_layout,
+                            Kokkos::LayoutRight>::value) {
+      for (int i = src.Rank - 1; i >= 0; i--) {
+        if (i < src.Rank - 1) exp_stride *= src.extent(i + 1);
+        if (strides[i] != exp_stride) {
+          assignable = false;
+          break;
+        }
+      }
+    }
+    return assignable;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void assign(DstType& dst, const SrcType& src,
+                     const TrackType& src_track) {
+    static_assert(is_assignable_space,
+                  "View assignment must have compatible spaces");
+
+    static_assert(
+        is_assignable_value_type,
+        "View assignment must have same value type or const = non-const");
+
+    static_assert(is_assignable_dimension,
+                  "View assignment must have compatible dimensions");
+
+    bool assignable_layout = assignable_layout_check(dst, src);  // Runtime
+                                                                 // check
+    if (!assignable_layout)
+      Kokkos::abort("View assignment must have compatible layouts\n");
+
+    using dst_offset_type = typename DstType::offset_type;
+
+    if (size_t(DstTraits::dimension::rank_dynamic) <
+        size_t(SrcTraits::dimension::rank_dynamic)) {
+      using dst_dim = typename DstTraits::dimension;
+      bool assignable = ((1 > DstTraits::dimension::rank_dynamic &&
+                          1 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN0 == src.dimension_0()
+                             : true) &&
+                        ((2 > DstTraits::dimension::rank_dynamic &&
+                          2 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN1 == src.dimension_1()
+                             : true) &&
+                        ((3 > DstTraits::dimension::rank_dynamic &&
+                          3 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN2 == src.dimension_2()
+                             : true) &&
+                        ((4 > DstTraits::dimension::rank_dynamic &&
+                          4 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN3 == src.dimension_3()
+                             : true) &&
+                        ((5 > DstTraits::dimension::rank_dynamic &&
+                          5 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN4 == src.dimension_4()
+                             : true) &&
+                        ((6 > DstTraits::dimension::rank_dynamic &&
+                          6 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN5 == src.dimension_5()
+                             : true) &&
+                        ((7 > DstTraits::dimension::rank_dynamic &&
+                          7 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN6 == src.dimension_6()
+                             : true) &&
+                        ((8 > DstTraits::dimension::rank_dynamic &&
+                          8 <= SrcTraits::dimension::rank_dynamic)
+                             ? dst_dim::ArgN7 == src.dimension_7()
+                             : true);
+      if (!assignable)
+        Kokkos::abort(
+            "View Assignment: trying to assign runtime dimension to non "
+            "matching compile time dimension.");
+    }
+    dst.m_impl_offset = dst_offset_type(src.m_impl_offset);
+    dst.m_impl_handle = Kokkos::Impl::ViewDataHandle<DstTraits>::assign(
+        src.m_impl_handle, src_track);
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Subview mapping.
+// Deduce destination view type from source view traits and subview arguments
+
+template <class, class ValueType, class Exts, class... Args>
+struct SubViewDataTypeImpl;
+
+/* base case */
+template <class ValueType>
+struct SubViewDataTypeImpl<void, ValueType, Kokkos::Experimental::Extents<>> {
+  using type = ValueType;
+};
+
+/* for integral args, subview doesn't have that dimension */
+template <class ValueType, ptrdiff_t Ext, ptrdiff_t... Exts, class Integral,
+          class...
Args>
+struct SubViewDataTypeImpl<
+    typename std::enable_if<
+        std::is_integral<typename std::decay<Integral>::type>::value>::type,
+    ValueType, Kokkos::Experimental::Extents<Ext, Exts...>, Integral, Args...>
+    : SubViewDataTypeImpl<void, ValueType,
+                          Kokkos::Experimental::Extents<Exts...>, Args...> {};
+
+/* for ALL slice, subview has the same dimension */
+template <class ValueType, ptrdiff_t Ext, ptrdiff_t... Exts, class... Args>
+struct SubViewDataTypeImpl<void, ValueType,
+                           Kokkos::Experimental::Extents<Ext, Exts...>, ALL_t,
+                           Args...>
+    : SubViewDataTypeImpl<void, typename ApplyExtent<ValueType, Ext>::type,
+                          Kokkos::Experimental::Extents<Exts...>, Args...> {};
+
+/* for pair-style slice, subview has dynamic dimension, since pair doesn't give
+ * static sizes */
+/* Since we don't allow interleaving of dynamic and static extents, make all of
+ * the dimensions to the left dynamic  */
+template <class ValueType, ptrdiff_t Ext, ptrdiff_t... Exts, class PairLike,
+          class... Args>
+struct SubViewDataTypeImpl<
+    typename std::enable_if<is_pair_like<PairLike>::value>::type, ValueType,
+    Kokkos::Experimental::Extents<Ext, Exts...>, PairLike, Args...>
+    : SubViewDataTypeImpl<
+          void, typename make_all_extents_into_pointers<ValueType>::type*,
+          Kokkos::Experimental::Extents<Exts...>, Args...> {};
+
+template <class ValueType, class Exts, class... Args>
+struct SubViewDataType : SubViewDataTypeImpl<void, ValueType, Exts, Args...> {};
+
+//----------------------------------------------------------------------------
+
+// Subview mapping: deduces the subview's layout, data type, and View type
+// from the source traits and the per-dimension subview arguments.
+template <class SrcTraits, class... Args>
+class ViewMapping<
+    typename std::enable_if<(
+        std::is_same<typename SrcTraits::specialize, void>::value &&
+        (std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutLeft>::value ||
+         std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutRight>::value ||
+         std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutStride>::value))>::type,
+    SrcTraits, Args...> {
+ private:
+  static_assert(SrcTraits::rank == sizeof...(Args),
+                "Subview mapping requires one argument for each dimension of "
+                "source View");
+
+  enum {
+    RZ = false,
+    R0 = bool(is_integral_extent<0, Args...>::value),
+    R1 = bool(is_integral_extent<1, Args...>::value),
+    R2 = bool(is_integral_extent<2, Args...>::value),
+    R3 = bool(is_integral_extent<3, Args...>::value),
+    R4 = bool(is_integral_extent<4, Args...>::value),
+    R5 = bool(is_integral_extent<5, Args...>::value),
+    R6 = bool(is_integral_extent<6, Args...>::value),
+    R7 = bool(is_integral_extent<7, Args...>::value)
+  };
+
+  enum {
+    rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) +
+           unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7)
+  };
+
+  // Whether right-most rank is a range.
+  enum {
+    R0_rev =
+        (0 == SrcTraits::rank
+             ? RZ
+             : (1 == SrcTraits::rank
+                    ? R0
+                    : (2 == SrcTraits::rank
+                           ? R1
+                           : (3 == SrcTraits::rank
+                                  ? R2
+                                  : (4 == SrcTraits::rank
+                                         ? R3
+                                         : (5 == SrcTraits::rank
+                                                ? R4
+                                                : (6 == SrcTraits::rank
+                                                       ? R5
+                                                       : (7 == SrcTraits::rank
+                                                              ? R6
+                                                              : R7))))))))
+  };
+
+  // Subview's layout
+  using array_layout = typename std::conditional<
+      ( /* Same array layout IF */
+        (rank == 0) /* output rank zero */
+        || SubviewLegalArgsCompileTime<typename SrcTraits::array_layout,
+                                       typename SrcTraits::array_layout, rank,
+                                       SrcTraits::rank, 0, Args...>::value ||
+        // OutputRank 1 or 2, InputLayout Left, Interval 0
+        // because single stride one or second index has a stride.
+        (rank <= 2 && R0 &&
+         std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutLeft>::value)  // replace with input rank
+        ||
+        // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
+        // because single stride one or second index has a stride.
+        (rank <= 2 && R0_rev &&
+         std::is_same<typename SrcTraits::array_layout,
+                      Kokkos::LayoutRight>::value)  // replace input rank
+        ),
+      typename SrcTraits::array_layout, Kokkos::LayoutStride>::type;
+
+  using value_type = typename SrcTraits::value_type;
+
+  using data_type =
+      typename SubViewDataType<value_type,
+                               typename Kokkos::Impl::ParseViewExtents<
+                                   typename SrcTraits::data_type>::type,
+                               Args...>::type;
+
+ public:
+  using traits_type = Kokkos::ViewTraits<data_type, array_layout,
+                                         typename SrcTraits::device_type,
+                                         typename SrcTraits::memory_traits>;
+
+  using type =
+      Kokkos::View<data_type, array_layout, typename SrcTraits::device_type,
+                   typename SrcTraits::memory_traits>;
+
+  template <class MemoryTraits>
+  struct apply {
+    static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");
+
+    using traits_type =
+        Kokkos::ViewTraits<data_type, array_layout,
+                           typename SrcTraits::device_type, MemoryTraits>;
+
+    using type = Kokkos::View<data_type, array_layout,
+                              typename SrcTraits::device_type, MemoryTraits>;
+  };
+
+  // The presumed type is 'ViewMapping< traits_type , void >'
+  // However, a compatible ViewMapping is acceptable.
+  template <class DstTraits>
+  KOKKOS_INLINE_FUNCTION static void assign(
+      ViewMapping<DstTraits, void>& dst,
+      ViewMapping<SrcTraits, void> const& src, Args... args) {
+    static_assert(ViewMapping<DstTraits, traits_type, void>::is_assignable,
+                  "Subview destination type must be compatible with subview "
+                  "derived type");
+
+    using DstType = ViewMapping<DstTraits, void>;
+
+    using dst_offset_type = typename DstType::offset_type;
+
+    const SubviewExtents<SrcTraits::rank, rank> extents(src.m_impl_offset.m_dim,
+                                                        args...);
+
+    dst.m_impl_offset = dst_offset_type(src.m_impl_offset, extents);
+
+    // The handle is advanced to the subview's origin element.
+    dst.m_impl_handle = ViewDataHandle<DstTraits>::assign(
+        src.m_impl_handle,
+        src.m_impl_offset(extents.domain_offset(0), extents.domain_offset(1),
+                          extents.domain_offset(2), extents.domain_offset(3),
+                          extents.domain_offset(4), extents.domain_offset(5),
+                          extents.domain_offset(6), extents.domain_offset(7)));
+  }
+};
+
+//----------------------------------------------------------------------------
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Recursively verify each index against the extent of its rank R.
+template <unsigned, class MapType>
+KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType&) {
+  return true;
+}
+
+template <unsigned R, class MapType, class iType, class... Args>
+KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType& map,
+                                                        const iType& i,
+                                                        Args... args) {
+  return (size_t(i) < map.extent(R)) &&
+         view_verify_operator_bounds<R + 1>(map, args...);
+}
+
+template <unsigned, class MapType>
+inline void view_error_operator_bounds(char*, int, const MapType&) {}
+
+// NOTE(review): the "%ld" conversions below receive 'unsigned long' values
+// (via static_cast<unsigned long>) — "%lu" would match; confirm upstream.
+template <unsigned R, class MapType, class iType, class... Args>
+inline void view_error_operator_bounds(char* buf, int len, const MapType& map,
+                                       const iType& i, Args... args) {
+  const int n = snprintf(
+      buf, len, " %ld < %ld %c", static_cast<unsigned long>(i),
+      static_cast<unsigned long>(map.extent(R)), (sizeof...(Args) ?
',' : ')')); + view_error_operator_bounds<R + 1>(buf + n, len - n, map, args...); +} + +#if !defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + +/* Check #3: is the View managed as determined by the MemoryTraits? */ +template <class MapType, bool is_managed = (MapType::is_managed != 0)> +struct OperatorBoundsErrorOnDevice; + +template <class MapType> +struct OperatorBoundsErrorOnDevice<MapType, false> { + KOKKOS_INLINE_FUNCTION + static void run(MapType const&) { Kokkos::abort("View bounds error"); } +}; + +template <class MapType> +struct OperatorBoundsErrorOnDevice<MapType, true> { + KOKKOS_INLINE_FUNCTION + static void run(MapType const& map) { + SharedAllocationHeader const* const header = + SharedAllocationHeader::get_header((void*)(map.data())); + char const* const label = header->label(); + enum { LEN = 128 }; + char msg[LEN]; + char const* const first_part = "View bounds error of view "; + char* p = msg; + char* const end = msg + LEN - 1; + for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) { + *p = *p2; + } + for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) { + *p = *p2; + } + *p = '\0'; + Kokkos::abort(msg); + } +}; + +/* Check #2: does the ViewMapping have the printable_label_typedef defined? + See above that only the non-specialized standard-layout ViewMapping has + this defined by default. 
+ The existence of this alias indicates the existence of MapType::is_managed + */ +template <class T, class Enable = void> +struct has_printable_label_typedef : public std::false_type {}; + +template <class T> +struct has_printable_label_typedef<T, + void_t<typename T::printable_label_typedef>> + : public std::true_type {}; + +template <class MapType> +KOKKOS_INLINE_FUNCTION void operator_bounds_error_on_device(MapType const&, + std::false_type) { + Kokkos::abort("View bounds error"); +} + +template <class MapType> +KOKKOS_INLINE_FUNCTION void operator_bounds_error_on_device(MapType const& map, + std::true_type) { + OperatorBoundsErrorOnDevice<MapType>::run(map); +} + +#endif // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + +template <class MemorySpace, class ViewType, class MapType, class... Args> +KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds( + Kokkos::Impl::ViewTracker<ViewType> const& tracker, const MapType& map, + Args... args) { + if (!view_verify_operator_bounds<0>(map, args...)) { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + enum { LEN = 1024 }; + char buffer[LEN]; + const std::string label = + tracker.m_tracker.template get_label<MemorySpace>(); + int n = + snprintf(buffer, LEN, "View bounds error of view %s (", label.c_str()); + view_error_operator_bounds<0>(buffer + n, LEN - n, map, args...); + Kokkos::Impl::throw_runtime_exception(std::string(buffer)); +#else + /* Check #1: is there a SharedAllocationRecord? + (we won't use it, but if its not there then there isn't + a corresponding SharedAllocationHeader containing a label). + This check should cover the case of Views that don't + have the Unmanaged trait but were initialized by pointer. 
*/ + if (tracker.m_tracker.has_record()) { + operator_bounds_error_on_device<MapType>( + map, has_printable_label_typedef<MapType>()); + } else { + Kokkos::abort("View bounds error"); + } +#endif + } +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9cfe9d79144dbcf7457b50b300f691a7a8b6504e --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_ViewTracker.hpp @@ -0,0 +1,130 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEW_TRACKER_HPP +#define KOKKOS_VIEW_TRACKER_HPP + +namespace Kokkos { + +template <class DataType, class... Properties> +class View; + +namespace Impl { + +/* + * \class ViewTracker + * \brief template class to wrap the shared allocation tracker + * + * \section This class is templated on the View and provides + * constructors that match the view. The constructors and assignments + * from view will externalize the logic needed to enable/disable + * ref counting to provide a single gate to enable further developments + * which may hing on the same logic. 
+ * + */ +template <class ParentView> +struct ViewTracker { + using track_type = Kokkos::Impl::SharedAllocationTracker; + using view_traits = typename ParentView::traits; + + track_type m_tracker; + + KOKKOS_INLINE_FUNCTION + ViewTracker() : m_tracker() {} + + KOKKOS_INLINE_FUNCTION + ViewTracker(const ViewTracker& vt) noexcept + : m_tracker(vt.m_tracker, view_traits::is_managed) {} + + KOKKOS_INLINE_FUNCTION + explicit ViewTracker(const ParentView& vt) noexcept : m_tracker() { + assign(vt); + } + + template <class RT, class... RP> + KOKKOS_INLINE_FUNCTION explicit ViewTracker( + const View<RT, RP...>& vt) noexcept + : m_tracker() { + assign(vt); + } + + template <class RT, class... RP> + KOKKOS_INLINE_FUNCTION void assign(const View<RT, RP...>& vt) noexcept { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + if (view_traits::is_managed && + Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_enabled()) { + m_tracker.assign_direct(vt.m_track.m_tracker); + } else { + m_tracker.assign_force_disable(vt.m_track.m_tracker); + } +#else + m_tracker.assign_force_disable(vt.m_track.m_tracker); +#endif + } + + KOKKOS_INLINE_FUNCTION + ViewTracker& operator=(const ViewTracker& rhs) noexcept { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + if (view_traits::is_managed && + Kokkos::Impl::SharedAllocationRecord<void, void>::tracking_enabled()) { + m_tracker.assign_direct(rhs.m_tracker); + } else { + m_tracker.assign_force_disable(rhs.m_tracker); + } +#else + m_tracker.assign_force_disable(rhs.m_tracker); +#endif + return *this; + } + + KOKKOS_INLINE_FUNCTION + explicit ViewTracker(const track_type& tt) noexcept + : m_tracker(tt, view_traits::is_managed) {} +}; + +} // namespace Impl + +} // namespace Kokkos + +#endif // KOKKOS_VIEW_TRACKER_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..2eb8fc9e3b820cc5d93dcf01b049ede170735d1a --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp @@ -0,0 +1,123 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_EXPERIMENTAL_VIEWUNIFORMTYPE_HPP +#define KOKKOS_EXPERIMENTAL_VIEWUNIFORMTYPE_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { +namespace Impl { +template <class ScalarType, int Rank> +struct ViewScalarToDataType { + using type = typename ViewScalarToDataType<ScalarType, Rank - 1>::type *; +}; + +template <class ScalarType> +struct ViewScalarToDataType<ScalarType, 0> { + using type = ScalarType; +}; + +template <class LayoutType, int Rank> +struct ViewUniformLayout { + using array_layout = LayoutType; +}; + +template <class LayoutType> +struct ViewUniformLayout<LayoutType, 0> { + using array_layout = Kokkos::LayoutLeft; +}; + +template <> +struct ViewUniformLayout<Kokkos::LayoutRight, 1> { + using array_layout = Kokkos::LayoutLeft; +}; + +template <class ViewType, int Traits> +struct ViewUniformType { + using data_type = typename ViewType::data_type; + using const_data_type = + typename std::add_const<typename ViewType::data_type>::type; + using runtime_data_type = + typename ViewScalarToDataType<typename ViewType::value_type, + ViewType::rank>::type; + using runtime_const_data_type = typename ViewScalarToDataType< + typename std::add_const<typename ViewType::value_type>::type, + ViewType::rank>::type; + + using array_layout = + typename ViewUniformLayout<typename ViewType::array_layout, + ViewType::rank>::array_layout; + + using device_type = typename ViewType::device_type; + using anonymous_device_type = + typename Kokkos::Device<typename device_type::execution_space, + Kokkos::AnonymousSpace>; + + using memory_traits = typename Kokkos::MemoryTraits<Traits>; + using type = + Kokkos::View<data_type, array_layout, device_type, memory_traits>; + using const_type = + Kokkos::View<const_data_type, array_layout, device_type, memory_traits>; + using runtime_type = + 
Kokkos::View<runtime_data_type, array_layout, device_type, memory_traits>; + using runtime_const_type = Kokkos::View<runtime_const_data_type, array_layout, + device_type, memory_traits>; + + using nomemspace_type = Kokkos::View<data_type, array_layout, + anonymous_device_type, memory_traits>; + using const_nomemspace_type = + Kokkos::View<const_data_type, array_layout, anonymous_device_type, + memory_traits>; + using runtime_nomemspace_type = + Kokkos::View<runtime_data_type, array_layout, anonymous_device_type, + memory_traits>; + using runtime_const_nomemspace_type = + Kokkos::View<runtime_const_data_type, array_layout, anonymous_device_type, + memory_traits>; +}; +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp b/packages/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4af26dcc91db449b58d460ef9f1e809da2bc5b9d --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp @@ -0,0 +1,237 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> + +#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_VOLATILE_LOAD_HPP) +#define KOKKOS_VOLATILE_LOAD_HPP + +#if defined(__GNUC__) /* GNU C */ || defined(__GNUG__) /* GNU C++ */ || \ + defined(__clang__) + +#define KOKKOS_IMPL_MAY_ALIAS __attribute__((__may_alias__)) + +#else + +#define KOKKOS_IMPL_MAY_ALIAS + +#endif + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION T volatile_load(T const volatile* const src_ptr) { + typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64; // NOLINT(modernize-use-using) + typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32; // NOLINT(modernize-use-using) + typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16; // NOLINT(modernize-use-using) + typedef uint8_t KOKKOS_IMPL_MAY_ALIAS T8; // NOLINT(modernize-use-using) + + enum { + NUM_8 
= sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const volatile* const ptr; + T64 const volatile* const ptr64; + T32 const volatile* const ptr32; + T16 const volatile* const ptr16; + T8 const volatile* const ptr8; + } src = {src_ptr}; + + T result; + + union { + T* const ptr; + T64* const ptr64; + T32* const ptr32; + T16* const ptr16; + T8* const ptr8; + } dst = {&result}; + + for (int i = 0; i < NUM_64; ++i) { + dst.ptr64[i] = src.ptr64[i]; + } + + if (NUM_64 * 2 < NUM_32) { + dst.ptr32[NUM_64 * 2] = src.ptr32[NUM_64 * 2]; + } + + if (NUM_32 * 2 < NUM_16) { + dst.ptr16[NUM_32 * 2] = src.ptr16[NUM_32 * 2]; + } + + if (NUM_16 * 2 < NUM_8) { + dst.ptr8[NUM_16 * 2] = src.ptr8[NUM_16 * 2]; + } + + return result; +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION void volatile_store( + T volatile* const dst_ptr, T const volatile* const src_ptr) { + typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64; // NOLINT(modernize-use-using) + typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32; // NOLINT(modernize-use-using) + typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16; // NOLINT(modernize-use-using) + typedef uint8_t KOKKOS_IMPL_MAY_ALIAS T8; // NOLINT(modernize-use-using) + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const volatile* const ptr; + T64 const volatile* const ptr64; + T32 const volatile* const ptr32; + T16 const volatile* const ptr16; + T8 const volatile* const ptr8; + } src = {src_ptr}; + + union { + T volatile* const ptr; + T64 volatile* const ptr64; + T32 volatile* const ptr32; + T16 volatile* const ptr16; + T8 volatile* const ptr8; + } dst = {dst_ptr}; + + for (int i = 0; i < NUM_64; ++i) { + dst.ptr64[i] = src.ptr64[i]; + } + + if (NUM_64 * 2 < NUM_32) { + dst.ptr32[NUM_64 * 2] = src.ptr32[NUM_64 * 2]; + } + + if (NUM_32 * 2 < NUM_16) { + dst.ptr16[NUM_32 * 2] = src.ptr16[NUM_32 * 2]; + } + + if (NUM_16 * 2 < NUM_8) { + dst.ptr8[NUM_16 * 2] = 
src.ptr8[NUM_16 * 2]; + } +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION void volatile_store(T volatile* const dst_ptr, + T const* const src_ptr) { + typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64; // NOLINT(modernize-use-using) + typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32; // NOLINT(modernize-use-using) + typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16; // NOLINT(modernize-use-using) + typedef uint8_t KOKKOS_IMPL_MAY_ALIAS T8; // NOLINT(modernize-use-using) + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const* const ptr; + T64 const* const ptr64; + T32 const* const ptr32; + T16 const* const ptr16; + T8 const* const ptr8; + } src = {src_ptr}; + + union { + T volatile* const ptr; + T64 volatile* const ptr64; + T32 volatile* const ptr32; + T16 volatile* const ptr16; + T8 volatile* const ptr8; + } dst = {dst_ptr}; + + for (int i = 0; i < NUM_64; ++i) { + dst.ptr64[i] = src.ptr64[i]; + } + + if (NUM_64 * 2 < NUM_32) { + dst.ptr32[NUM_64 * 2] = src.ptr32[NUM_64 * 2]; + } + + if (NUM_32 * 2 < NUM_16) { + dst.ptr16[NUM_32 * 2] = src.ptr16[NUM_32 * 2]; + } + + if (NUM_16 * 2 < NUM_8) { + dst.ptr8[NUM_16 * 2] = src.ptr8[NUM_16 * 2]; + } +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION void volatile_store(T volatile* dst_ptr, + T const volatile& src) { + volatile_store(dst_ptr, &src); +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION void volatile_store(T volatile* dst_ptr, + T const& src) { + volatile_store(dst_ptr, &src); +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION T safe_load(T const* const ptr) { +#if !defined(__MIC__) + return *ptr; +#else + return volatile_load(ptr); +#endif +} + +} // namespace Kokkos + +#undef KOKKOS_IMPL_MAY_ALIAS + +#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp b/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..04507b0984a78b12ea16c8c1ee33c4fc99f24b08 --- 
/dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp @@ -0,0 +1,747 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#define DEBUG_PRINT 0 + +#include <iostream> +#include <sstream> +#include <algorithm> + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Core.hpp> +#include <Kokkos_hwloc.hpp> +#include <impl/Kokkos_Error.hpp> + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace hwloc { + +/* Return 0 if asynchronous, 1 if synchronous and include process. */ +unsigned thread_mapping(const char* const label, const bool allow_async, + unsigned& thread_count, unsigned& use_numa_count, + unsigned& use_cores_per_numa, + std::pair<unsigned, unsigned> threads_coord[]) { + const bool hwloc_avail = Kokkos::hwloc::available(); + const unsigned avail_numa_count = + hwloc_avail ? hwloc::get_available_numa_count() : 1; + const unsigned avail_cores_per_numa = + hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count; + const unsigned avail_threads_per_core = + hwloc_avail ? hwloc::get_available_threads_per_core() : 1; + + // (numa,core) coordinate of the process: + const std::pair<unsigned, unsigned> proc_coord = + Kokkos::hwloc::get_this_thread_coordinate(); + + //------------------------------------------------------------------------ + // Defaults for unspecified inputs: + + if (!use_numa_count) { + // Default to use all NUMA regions + use_numa_count = !thread_count + ? avail_numa_count + : (thread_count < avail_numa_count ? thread_count + : avail_numa_count); + } + + if (!use_cores_per_numa) { + // Default to use all but one core if asynchronous, all cores if + // synchronous. + const unsigned threads_per_numa = thread_count / use_numa_count; + + use_cores_per_numa = + !threads_per_numa + ? avail_cores_per_numa - (allow_async ? 1 : 0) + : (threads_per_numa < avail_cores_per_numa ? 
threads_per_numa + : avail_cores_per_numa); + } + + if (!thread_count) { + thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core; + } + + //------------------------------------------------------------------------ + // Input verification: + + const bool valid_numa = use_numa_count <= avail_numa_count; + const bool valid_cores = + use_cores_per_numa && use_cores_per_numa <= avail_cores_per_numa; + const bool valid_threads = + thread_count && thread_count <= use_numa_count * use_cores_per_numa * + avail_threads_per_core; + const bool balanced_numa = !(thread_count % use_numa_count); + const bool balanced_cores = + !(thread_count % (use_numa_count * use_cores_per_numa)); + + const bool valid_input = valid_numa && valid_cores && valid_threads && + balanced_numa && balanced_cores; + + if (!valid_input) { + std::ostringstream msg; + + msg << label << " HWLOC ERROR(s)"; + + if (!valid_threads) { + msg << " : thread_count(" << thread_count << ") exceeds capacity(" + << use_numa_count * use_cores_per_numa * avail_threads_per_core + << ")"; + } + if (!valid_numa) { + msg << " : use_numa_count(" << use_numa_count << ") exceeds capacity(" + << avail_numa_count << ")"; + } + if (!valid_cores) { + msg << " : use_cores_per_numa(" << use_cores_per_numa + << ") exceeds capacity(" << avail_cores_per_numa << ")"; + } + if (!balanced_numa) { + msg << " : thread_count(" << thread_count << ") imbalanced among numa(" + << use_numa_count << ")"; + } + if (!balanced_cores) { + msg << " : thread_count(" << thread_count << ") imbalanced among cores(" + << use_numa_count * use_cores_per_numa << ")"; + } + + Kokkos::Impl::throw_runtime_exception(msg.str()); + } + + const unsigned thread_spawn_synchronous = + (allow_async && 1 < thread_count && + (use_numa_count < avail_numa_count || + use_cores_per_numa < avail_cores_per_numa)) + ? 
0 /* asyncronous */ + : 1 /* synchronous, threads_coord[0] is process core */; + + // Determine binding coordinates for to-be-spawned threads so that + // threads may be bound to cores as they are spawned. + + const unsigned threads_per_core = + thread_count / (use_numa_count * use_cores_per_numa); + + if (thread_spawn_synchronous) { + // Working synchronously and include process core as threads_coord[0]. + // Swap the NUMA coordinate of the process core with 0 + // Swap the CORE coordinate of the process core with 0 + for (unsigned i = 0, inuma = avail_numa_count - use_numa_count; + inuma < avail_numa_count; ++inuma) { + const unsigned numa_coord = 0 == inuma + ? proc_coord.first + : (proc_coord.first == inuma ? 0 : inuma); + for (unsigned icore = avail_cores_per_numa - use_cores_per_numa; + icore < avail_cores_per_numa; ++icore) { + const unsigned core_coord = + 0 == icore ? proc_coord.second + : (proc_coord.second == icore ? 0 : icore); + for (unsigned ith = 0; ith < threads_per_core; ++ith, ++i) { + threads_coord[i].first = numa_coord; + threads_coord[i].second = core_coord; + } + } + } + } else if (use_numa_count < avail_numa_count) { + // Working asynchronously and omit the process' NUMA region from the pool. + // Swap the NUMA coordinate of the process core with ( ( avail_numa_count - + // use_numa_count ) - 1 ) + const unsigned numa_coord_swap = (avail_numa_count - use_numa_count) - 1; + for (unsigned i = 0, inuma = avail_numa_count - use_numa_count; + inuma < avail_numa_count; ++inuma) { + const unsigned numa_coord = + proc_coord.first == inuma ? 
numa_coord_swap : inuma; + for (unsigned icore = avail_cores_per_numa - use_cores_per_numa; + icore < avail_cores_per_numa; ++icore) { + const unsigned core_coord = icore; + for (unsigned ith = 0; ith < threads_per_core; ++ith, ++i) { + threads_coord[i].first = numa_coord; + threads_coord[i].second = core_coord; + } + } + } + } else if (use_cores_per_numa < avail_cores_per_numa) { + // Working asynchronously and omit the process' core from the pool. + // Swap the CORE coordinate of the process core with ( ( + // avail_cores_per_numa - use_cores_per_numa ) - 1 ) + const unsigned core_coord_swap = + (avail_cores_per_numa - use_cores_per_numa) - 1; + for (unsigned i = 0, inuma = avail_numa_count - use_numa_count; + inuma < avail_numa_count; ++inuma) { + const unsigned numa_coord = inuma; + for (unsigned icore = avail_cores_per_numa - use_cores_per_numa; + icore < avail_cores_per_numa; ++icore) { + const unsigned core_coord = + proc_coord.second == icore ? core_coord_swap : icore; + for (unsigned ith = 0; ith < threads_per_core; ++ith, ++i) { + threads_coord[i].first = numa_coord; + threads_coord[i].second = core_coord; + } + } + } + } + + return thread_spawn_synchronous; +} + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +#if defined(KOKKOS_ENABLE_HWLOC) + +#include <iostream> +#include <sstream> +#include <stdexcept> + +/*--------------------------------------------------------------------------*/ +/* Third Party Libraries */ + +/* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */ +#include <hwloc.h> + +#define REQUIRED_HWLOC_API_VERSION 0x000010300 + +#if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION +#error \ + "Requires http://www.open-mpi.org/projects/hwloc/ Version 1.3 or greater" +#endif + +/*--------------------------------------------------------------------------*/ + 
+namespace Kokkos { +namespace hwloc { +namespace { + +#if DEBUG_PRINT + +inline void print_bitmap(std::ostream& s, const hwloc_const_bitmap_t bitmap) { + s << "{"; + for (int i = hwloc_bitmap_first(bitmap); - 1 != i; + i = hwloc_bitmap_next(bitmap, i)) { + s << " " << i; + } + s << " }"; +} + +#endif + +enum { MAX_CORE = 1024 }; + +std::pair<unsigned, unsigned> s_core_topology(0, 0); +unsigned s_core_capacity(0); +hwloc_topology_t s_hwloc_topology(0); +hwloc_bitmap_t s_hwloc_location(0); +hwloc_bitmap_t s_process_binding(0); +hwloc_bitmap_t s_core[MAX_CORE]; +bool s_can_bind_threads(true); + +struct Sentinel { + ~Sentinel(); + Sentinel(); +}; + +bool sentinel() { + static Sentinel self; + + if (0 == s_hwloc_topology) { + std::cerr << "Kokkos::hwloc ERROR : Called after return from main()" + << std::endl; + std::cerr.flush(); + } + + return 0 != s_hwloc_topology; +} + +Sentinel::~Sentinel() { + hwloc_topology_destroy(s_hwloc_topology); + hwloc_bitmap_free(s_process_binding); + hwloc_bitmap_free(s_hwloc_location); + + s_core_topology.first = 0; + s_core_topology.second = 0; + s_core_capacity = 0; + s_hwloc_topology = 0; + s_hwloc_location = 0; + s_process_binding = 0; +} + +Sentinel::Sentinel() { +#if defined(__MIC__) + static const bool remove_core_0 = true; +#else + static const bool remove_core_0 = false; +#endif + + s_core_topology = std::pair<unsigned, unsigned>(0, 0); + s_core_capacity = 0; + s_hwloc_topology = 0; + s_hwloc_location = 0; + s_process_binding = 0; + + for (unsigned i = 0; i < MAX_CORE; ++i) s_core[i] = 0; + + hwloc_topology_init(&s_hwloc_topology); + hwloc_topology_load(s_hwloc_topology); + + s_hwloc_location = hwloc_bitmap_alloc(); + s_process_binding = hwloc_bitmap_alloc(); + + hwloc_get_cpubind(s_hwloc_topology, s_process_binding, HWLOC_CPUBIND_PROCESS); + + if (hwloc_bitmap_iszero(s_process_binding)) { + if (Kokkos::show_warnings()) { + std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL " + "processing units" + << 
std::endl; + } + const int pu_depth = hwloc_get_type_depth(s_hwloc_topology, HWLOC_OBJ_PU); + int num_pu = 1; + if (pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN) { + num_pu = hwloc_get_nbobjs_by_depth(s_hwloc_topology, pu_depth); + } else { + if (Kokkos::show_warnings()) { + std::cerr << "WARNING: Cannot detect number of processing units -- " + "ASSUMING 1 (serial)." + << std::endl; + } + num_pu = 1; + } + hwloc_bitmap_set_range(s_process_binding, 0, num_pu - 1); + s_can_bind_threads = false; + } + + if (remove_core_0) { + const hwloc_obj_t core = + hwloc_get_obj_by_type(s_hwloc_topology, HWLOC_OBJ_CORE, 0); + + if (hwloc_bitmap_intersects(s_process_binding, core->cpuset)) { + hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc(); + + hwloc_bitmap_andnot(s_process_no_core_zero, s_process_binding, + core->cpuset); + + bool ok = + 0 == hwloc_set_cpubind(s_hwloc_topology, s_process_no_core_zero, + HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT); + + if (ok) { + hwloc_get_cpubind(s_hwloc_topology, s_process_binding, + HWLOC_CPUBIND_PROCESS); + + ok = 0 != + hwloc_bitmap_isequal(s_process_binding, s_process_no_core_zero); + } + + hwloc_bitmap_free(s_process_no_core_zero); + + if (Kokkos::show_warnings() && !ok) { + std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move " + "process off of core #0" + << std::endl; + } + } + } + + // Choose a hwloc object type for the NUMA level, which may not exist. + + hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX; + + { + // Object types to search, in order. 
+ static const hwloc_obj_type_t candidate_root_type[] = { + HWLOC_OBJ_NODE /* NUMA region */ + , + HWLOC_OBJ_SOCKET /* hardware socket */ + , + HWLOC_OBJ_MACHINE /* local machine */ + }; + + enum { + CANDIDATE_ROOT_TYPE_COUNT = + sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) + }; + + for (int k = 0; + k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type; + ++k) { + if (0 < + hwloc_get_nbobjs_by_type(s_hwloc_topology, candidate_root_type[k])) { + root_type = candidate_root_type[k]; + } + } + } + + // Determine which of these 'root' types are available to this process. + // The process may have been bound (e.g., by MPI) to a subset of these root + // types. Determine current location of the master (calling) process. + + hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc(); + + hwloc_get_last_cpu_location(s_hwloc_topology, proc_cpuset_location, + HWLOC_CPUBIND_THREAD); + + const unsigned max_root = + hwloc_get_nbobjs_by_type(s_hwloc_topology, root_type); + + unsigned root_base = max_root; + unsigned root_count = 0; + unsigned core_per_root = 0; + unsigned pu_per_core = 0; + bool symmetric = true; + + for (unsigned i = 0; i < max_root; ++i) { + const hwloc_obj_t root = + hwloc_get_obj_by_type(s_hwloc_topology, root_type, i); + + if (hwloc_bitmap_intersects(s_process_binding, root->cpuset)) { + ++root_count; + + // Remember which root (NUMA) object the master thread is running on. + // This will be logical NUMA rank #0 for this process. 
+ + if (hwloc_bitmap_intersects(proc_cpuset_location, root->cpuset)) { + root_base = i; + } + + // Count available cores: + + const unsigned max_core = hwloc_get_nbobjs_inside_cpuset_by_type( + s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE); + + unsigned core_count = 0; + + for (unsigned j = 0; j < max_core; ++j) { + const hwloc_obj_t core = hwloc_get_obj_inside_cpuset_by_type( + s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE, j); + + // If process' cpuset intersects core's cpuset then process can access + // this core. Must use intersection instead of inclusion because the + // Intel-Phi MPI may bind the process to only one of the core's + // hyperthreads. + // + // Assumption: if the process can access any hyperthread of the core + // then it has ownership of the entire core. + // This assumes that it would be performance-detrimental + // to spawn more than one MPI process per core and use nested threading. + + if (hwloc_bitmap_intersects(s_process_binding, core->cpuset)) { + ++core_count; + + const unsigned pu_count = hwloc_get_nbobjs_inside_cpuset_by_type( + s_hwloc_topology, core->cpuset, HWLOC_OBJ_PU); + + if (pu_per_core == 0) pu_per_core = pu_count; + + // Enforce symmetry by taking the minimum: + + pu_per_core = std::min(pu_per_core, pu_count); + + if (pu_count != pu_per_core) symmetric = false; + } + } + + if (0 == core_per_root) core_per_root = core_count; + + // Enforce symmetry by taking the minimum: + + core_per_root = std::min(core_per_root, core_count); + + if (core_count != core_per_root) symmetric = false; + } + } + + s_core_topology.first = root_count; + s_core_topology.second = core_per_root; + s_core_capacity = pu_per_core; + + // Fill the 's_core' array for fast mapping from a core coordinate to the + // hwloc cpuset object required for thread location querying and binding. 
+ + for (unsigned i = 0; i < max_root; ++i) { + const unsigned root_rank = (i + root_base) % max_root; + + const hwloc_obj_t root = + hwloc_get_obj_by_type(s_hwloc_topology, root_type, root_rank); + + if (hwloc_bitmap_intersects(s_process_binding, root->cpuset)) { + const unsigned max_core = hwloc_get_nbobjs_inside_cpuset_by_type( + s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE); + + unsigned core_count = 0; + + for (unsigned j = 0; j < max_core && core_count < core_per_root; ++j) { + const hwloc_obj_t core = hwloc_get_obj_inside_cpuset_by_type( + s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE, j); + + if (hwloc_bitmap_intersects(s_process_binding, core->cpuset)) { + s_core[core_count + core_per_root * i] = core->cpuset; + + ++core_count; + } + } + } + } + + hwloc_bitmap_free(proc_cpuset_location); + + if (Kokkos::show_warnings() && !symmetric) { + std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a " + "non-symmetric core topology." + << std::endl; + } +} + +} // namespace + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +bool available() { return true; } + +unsigned get_available_numa_count() { + sentinel(); + return s_core_topology.first; +} + +unsigned get_available_cores_per_numa() { + sentinel(); + return s_core_topology.second; +} + +unsigned get_available_threads_per_core() { + sentinel(); + return s_core_capacity; +} + +bool can_bind_threads() { + sentinel(); + return s_can_bind_threads; +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +unsigned bind_this_thread(const unsigned coordinate_count, + std::pair<unsigned, unsigned> coordinate[]) { + unsigned i = 0; + + try { + const std::pair<unsigned, unsigned> current = get_this_thread_coordinate(); + + // Match one of the requests: + for (i = 0; i < 
coordinate_count && current != coordinate[i]; ++i) + ; + + if (coordinate_count == i) { + // Match the first request (typically NUMA): + for (i = 0; i < coordinate_count && current.first != coordinate[i].first; + ++i) + ; + } + + if (coordinate_count == i) { + // Match any unclaimed request: + for (i = 0; i < coordinate_count && ~0u == coordinate[i].first; ++i) + ; + } + + if (coordinate_count == i || !bind_this_thread(coordinate[i])) { + // Failed to bind: + i = ~0u; + } + + if (i < coordinate_count) { +#if DEBUG_PRINT + if (current != coordinate[i]) { + std::cout << " bind_this_thread: rebinding from (" << current.first + << "," << current.second << ") to (" << coordinate[i].first + << "," << coordinate[i].second << ")" << std::endl; + } +#endif + + coordinate[i].first = ~0u; + coordinate[i].second = ~0u; + } + } catch (...) { + i = ~0u; + } + + return i; +} + +bool bind_this_thread(const std::pair<unsigned, unsigned> coord) { + if (!sentinel()) return false; + +#if DEBUG_PRINT + + std::cout << "Kokkos::bind_this_thread() at "; + + hwloc_get_last_cpu_location(s_hwloc_topology, s_hwloc_location, + HWLOC_CPUBIND_THREAD); + + print_bitmap(std::cout, s_hwloc_location); + + std::cout << " to "; + + print_bitmap(std::cout, + s_core[coord.second + coord.first * s_core_topology.second]); + + std::cout << std::endl; + +#endif + + // As safe and fast as possible. + // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'. 
+ return coord.first < s_core_topology.first && + coord.second < s_core_topology.second && + 0 == hwloc_set_cpubind( + s_hwloc_topology, + s_core[coord.second + coord.first * s_core_topology.second], + HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT); +} + +bool unbind_this_thread() { + if (!sentinel()) return false; + +#define HWLOC_DEBUG_PRINT 0 + +#if HWLOC_DEBUG_PRINT + + std::cout << "Kokkos::unbind_this_thread() from "; + + hwloc_get_cpubind(s_hwloc_topology, s_hwloc_location, HWLOC_CPUBIND_THREAD); + + print_bitmap(std::cout, s_hwloc_location); + +#endif + + const bool result = + s_hwloc_topology && + 0 == hwloc_set_cpubind(s_hwloc_topology, s_process_binding, + HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT); + +#if HWLOC_DEBUG_PRINT + + std::cout << " to "; + + hwloc_get_cpubind(s_hwloc_topology, s_hwloc_location, HWLOC_CPUBIND_THREAD); + + print_bitmap(std::cout, s_hwloc_location); + + std::cout << std::endl; + +#endif + + return result; + +#undef HWLOC_DEBUG_PRINT +} + +//---------------------------------------------------------------------------- + +std::pair<unsigned, unsigned> get_this_thread_coordinate() { + std::pair<unsigned, unsigned> coord(0u, 0u); + + if (!sentinel()) return coord; + + const unsigned n = s_core_topology.first * s_core_topology.second; + + // Using the pre-allocated 's_hwloc_location' to avoid memory + // allocation by this thread. This call is NOT thread-safe. 
+ hwloc_get_last_cpu_location(s_hwloc_topology, s_hwloc_location, + HWLOC_CPUBIND_THREAD); + + unsigned i = 0; + + while (i < n && !hwloc_bitmap_intersects(s_hwloc_location, s_core[i])) ++i; + + if (i < n) { + coord.first = i / s_core_topology.second; + coord.second = i % s_core_topology.second; + } + + return coord; +} + +//---------------------------------------------------------------------------- + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#else /* ! defined( KOKKOS_ENABLE_HWLOC ) */ + +namespace Kokkos { +namespace hwloc { + +bool available() { return false; } +bool can_bind_threads() { return false; } + +unsigned get_available_numa_count() { return 1; } +unsigned get_available_cores_per_numa() { return 1; } +unsigned get_available_threads_per_core() { return 1; } + +unsigned bind_this_thread(const unsigned, std::pair<unsigned, unsigned>[]) { + return ~0; +} + +bool bind_this_thread(const std::pair<unsigned, unsigned>) { return false; } + +bool unbind_this_thread() { return true; } + +std::pair<unsigned, unsigned> get_this_thread_coordinate() { + return std::pair<unsigned, unsigned>(0, 0); +} + +} // namespace hwloc +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1913e508236a6ead07d3a4aaf684eef412f978ba --- /dev/null +++ b/packages/kokkos/core/src/setup/Kokkos_Setup_Cuda.hpp @@ -0,0 +1,138 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_SETUP_HPP_ +#define KOKKOS_CUDA_SETUP_HPP_ + +#if !defined(KOKKOS_ENABLE_CUDA) +#error \ + "KOKKOS_ENABLE_CUDA was not defined, but Kokkos_Setup_Cuda.hpp was included anyway." +#endif + +#if defined(KOKKOS_ENABLE_CUDA) && !defined(__CUDACC__) +#error \ + "KOKKOS_ENABLE_CUDA defined but the compiler is not defining the __CUDACC__ macro as expected" +// Some tooling environments will still function better if we do this here. +#define __CUDACC__ +#endif /* defined(KOKKOS_ENABLE_CUDA) && !defined(__CUDACC__) */ + +// Compiling with a CUDA compiler. +// +// Include <cuda.h> to pick up the CUDA_VERSION macro defined as: +// CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 ) +// +// When generating device code the __CUDA_ARCH__ macro is defined as: +// __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 ) + +#include <cuda_runtime.h> +#include <cuda.h> + +#if defined(_WIN32) +#define KOKKOS_IMPL_WINDOWS_CUDA +#endif + +#if !defined(CUDA_VERSION) +#error "#include <cuda.h> did not define CUDA_VERSION." +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 300) +// Compiling with CUDA compiler for device code. +#error "Cuda device capability >= 3.0 is required." 
+#endif + +#ifdef KOKKOS_ENABLE_CUDA_LAMBDA +#define KOKKOS_LAMBDA [=] __host__ __device__ + +#if defined(KOKKOS_ENABLE_CXX17) || defined(KOKKOS_ENABLE_CXX20) +#define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__ +#endif + +#if defined(__NVCC__) +#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER +#endif +#else // !defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA +#endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA) + +#if (10000 > CUDA_VERSION) +#define KOKKOS_ENABLE_PRE_CUDA_10_DEPRECATION_API +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) +// PTX atomics with memory order semantics are only available on volta and later +#if !defined(KOKKOS_DISABLE_CUDA_ASM) +#if !defined(KOKKOS_ENABLE_CUDA_ASM) +#define KOKKOS_ENABLE_CUDA_ASM +#if !defined(KOKKOS_DISABLE_CUDA_ASM_ATOMICS) && \ + defined(KOKKOS_ENABLE_GNU_ATOMICS) +#define KOKKOS_ENABLE_CUDA_ASM_ATOMICS +#endif +#endif +#endif +#endif + +#define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ +#define KOKKOS_IMPL_FORCEINLINE __forceinline__ +#define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline +#define KOKKOS_IMPL_FUNCTION __device__ __host__ +#define KOKKOS_IMPL_HOST_FUNCTION __host__ +#define KOKKOS_IMPL_DEVICE_FUNCTION __device__ +#if defined(KOKKOS_COMPILER_NVCC) +#define KOKKOS_INLINE_FUNCTION_DELETED inline +#else +#define KOKKOS_INLINE_FUNCTION_DELETED __device__ __host__ inline +#endif +#if (CUDA_VERSION < 10000) +#define KOKKOS_DEFAULTED_FUNCTION __host__ __device__ inline +#else +#define KOKKOS_DEFAULTED_FUNCTION inline +#endif +#define KOKKOS_IMPL_HOST_FUNCTION __host__ +#define KOKKOS_IMPL_DEVICE_FUNCTION __device__ + +#if (CUDA_VERSION >= 10000) +#define KOKKOS_CUDA_ENABLE_GRAPHS +#endif + +#endif /* KOKKOS_CUDA_SETUP_HPP_ */ diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..f1df2f87bb0a7911949442622e27cadaef466c1d --- /dev/null +++ b/packages/kokkos/core/src/setup/Kokkos_Setup_HIP.hpp @@ -0,0 +1,71 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SETUP_HIP_HPP_ +#define KOKKOS_SETUP_HIP_HPP_ + +#if defined(KOKKOS_ENABLE_HIP) + +#define KOKKOS_IMPL_HIP_CLANG_WORKAROUND + +#define HIP_ENABLE_PRINTF +#include <hip/hip_runtime.h> +#include <hip/hip_runtime_api.h> + +#define KOKKOS_LAMBDA [=] __host__ __device__ +#if defined(KOKKOS_ENABLE_CXX17) || defined(KOKKOS_ENABLE_CXX20) +#define KOKKOS_CLASS_LAMBDA [ =, *this ] __host__ __device__ +#endif + +#define KOKKOS_IMPL_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ +#define KOKKOS_IMPL_INLINE_FUNCTION __device__ __host__ inline +#define KOKKOS_DEFAULTED_FUNCTION __device__ __host__ inline +#define KOKKOS_INLINE_FUNCTION_DELETED __device__ __host__ inline +#define KOKKOS_IMPL_FUNCTION __device__ __host__ +#define KOKKOS_IMPL_HOST_FUNCTION __host__ +#define KOKKOS_IMPL_DEVICE_FUNCTION __device__ + +#endif // #if defined( KOKKOS_ENABLE_HIP ) + +#endif diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a5f5406746befc984f17f815e04bac63f0fadff4 --- /dev/null +++ b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -0,0 +1,73 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SETUP_SYCL_HPP_ +#define KOKKOS_SETUP_SYCL_HPP_ + +#include <CL/sycl.hpp> + +#ifdef __SYCL_DEVICE_ONLY__ +#ifdef KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF +namespace Kokkos { +namespace ImplSYCL { +template <typename... Args> +void sink(Args&&... args) { + (void)(sizeof...(args)); +} +} // namespace ImplSYCL +} // namespace Kokkos +#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) \ + do { \ + Kokkos::ImplSYCL::sink(__VA_ARGS__); \ + } while (0) +#else +#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...) 
\ + do { \ + static const __attribute__((opencl_constant)) char fmt[] = (format); \ + sycl::ONEAPI::experimental::printf(fmt, ##__VA_ARGS__); \ + } while (0) +#endif +#endif + +#endif diff --git a/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4467b2e03c486d07d80c3fee66e6c3b50c42256e --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp @@ -0,0 +1,95 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_EXECUTIONSPACETRAIT_HPP +#define KOKKOS_KOKKOS_EXECUTIONSPACETRAIT_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Concepts.hpp> // is_execution_space +#include <traits/Kokkos_PolicyTraitAdaptor.hpp> +#include <traits/Kokkos_Traits_fwd.hpp> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="trait specification"> {{{1 + +struct ExecutionSpaceTrait : TraitSpecificationBase<ExecutionSpaceTrait> { + struct base_traits { + static constexpr auto execution_space_is_defaulted = true; + + using execution_space = Kokkos::DefaultExecutionSpace; + }; + template <class T> + using trait_matches_specification = is_execution_space<T>; +}; + +// </editor-fold> end trait specification }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1 + +template <class ExecutionSpace, class... 
Traits> +struct AnalyzeExecPolicy< + std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>, + ExecutionSpace, Traits...> : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + + static_assert(base_t::execution_space_is_defaulted, + "Kokkos Error: More than one execution space given"); + + static constexpr bool execution_space_is_defaulted = false; + + using execution_space = ExecutionSpace; +}; + +// </editor-fold> end AnalyzeExecPolicy specializations }}}1 +//============================================================================== +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_EXECUTIONSPACETRAIT_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..eb649dc0887a2aab8c88feae8156676b70a7cdf7 --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp @@ -0,0 +1,87 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_GRAPHKERNELTRAIT_HPP +#define KOKKOS_KOKKOS_GRAPHKERNELTRAIT_HPP + +#include <Kokkos_Macros.hpp> +#include <traits/Kokkos_PolicyTraitAdaptor.hpp> +#include <impl/Kokkos_GraphImpl_fwd.hpp> // IsGraphKernelTag +#include <traits/Kokkos_Traits_fwd.hpp> +#include <impl/Kokkos_Utilities.hpp> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="trait specification"> {{{1 + +struct GraphKernelTrait : TraitSpecificationBase<GraphKernelTrait> { + struct base_traits { + using is_graph_kernel = std::false_type; + }; + template <class T> + using trait_matches_specification = std::is_same<T, IsGraphKernelTag>; +}; + +// </editor-fold> end trait specification }}}1 +//============================================================================== + 
+//============================================================================== +// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1 + +template <class... Traits> +struct AnalyzeExecPolicy<void, Impl::IsGraphKernelTag, Traits...> + : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + using is_graph_kernel = std::true_type; +}; + +// </editor-fold> end AnalyzeExecPolicy specializations }}}1 +//============================================================================== +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_GRAPHKERNELTRAIT_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e15adc17116cb66481f90acc0b9ba5a83ec1ab52 --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp @@ -0,0 +1,107 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_INDEXTYPETRAIT_HPP +#define KOKKOS_KOKKOS_INDEXTYPETRAIT_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Concepts.hpp> // IndexType, is_index_type +#include <traits/Kokkos_PolicyTraitAdaptor.hpp> +#include <traits/Kokkos_Traits_fwd.hpp> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="trait specification"> {{{1 + +struct IndexTypeTrait : TraitSpecificationBase<IndexTypeTrait> { + struct base_traits { + static constexpr bool index_type_is_defaulted = true; + using index_type = dependent_policy_trait_default; + }; + template <class T> + using trait_matches_specification = + std::integral_constant<bool, std::is_integral<T>::value || + is_index_type<T>::value>; +}; + +// </editor-fold> end trait specification }}}1 
+//============================================================================== + +//============================================================================== +// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1 + +// Index type given as IndexType template +template <class IntegralIndexType, class... Traits> +struct AnalyzeExecPolicy<void, Kokkos::IndexType<IntegralIndexType>, Traits...> + : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + static_assert(base_t::index_type_is_defaulted, + "Kokkos Error: More than one index type given"); + static constexpr bool index_type_is_defaulted = false; + using index_type = Kokkos::IndexType<IntegralIndexType>; +}; + +// IndexType given as an integral type directly +template <class IntegralIndexType, class... Traits> +struct AnalyzeExecPolicy< + std::enable_if_t<std::is_integral<IntegralIndexType>::value>, + IntegralIndexType, Traits...> : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + static_assert(base_t::index_type_is_defaulted, + "Kokkos Error: More than one index type given"); + static constexpr bool index_type_is_defaulted = false; + using index_type = Kokkos::IndexType<IntegralIndexType>; +}; + +// </editor-fold> end AnalyzeExecPolicy specializations }}}1 +//============================================================================== + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_INDEXTYPETRAIT_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..30e07039a405d61f2c78217284f9036a0a533f06 --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP +#define KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP + +#include <Kokkos_Concepts.hpp> // is_iteration_pattern +#include <type_traits> // is_void + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="trait specification"> {{{1 + +struct IterationPatternTrait : TraitSpecificationBase<IterationPatternTrait> { + struct base_traits { + using iteration_pattern = void; // TODO set default iteration pattern + }; + template <class T> + using trait_matches_specification = is_iteration_pattern<T>; +}; + +// </editor-fold> end trait specification }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1 + +template <class IterationPattern, class... 
Traits> +struct AnalyzeExecPolicy< + std::enable_if_t<is_iteration_pattern<IterationPattern>::value>, + IterationPattern, Traits...> : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + static_assert(std::is_void<typename base_t::iteration_pattern>::value, + "Kokkos Error: More than one iteration pattern given"); + using iteration_pattern = IterationPattern; +}; + +// </editor-fold> end AnalyzeExecPolicy specializations }}}1 +//============================================================================== + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..73ae8e27e2eca54412b4cbab464b1760c93d7aed --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp @@ -0,0 +1,91 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_LAUNCHBOUNDSTRAIT_HPP +#define KOKKOS_KOKKOS_LAUNCHBOUNDSTRAIT_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Concepts.hpp> // LaunchBounds +#include <traits/Kokkos_PolicyTraitAdaptor.hpp> +#include <traits/Kokkos_Traits_fwd.hpp> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="trait specification"> {{{1 + +struct LaunchBoundsTrait : TraitSpecificationBase<LaunchBoundsTrait> { + struct base_traits { + static constexpr bool launch_bounds_is_defaulted = true; + + using launch_bounds = LaunchBounds<>; + }; + template <class T> + using trait_matches_specification = is_launch_bounds<T>; +}; + +// </editor-fold> end trait specification }}}1 +//============================================================================== + 
+//============================================================================== +// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1 + +template <unsigned int MaxT, unsigned int MinB, class... Traits> +struct AnalyzeExecPolicy<void, Kokkos::LaunchBounds<MaxT, MinB>, Traits...> + : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + static_assert(base_t::launch_bounds_is_defaulted, + "Kokkos Error: More than one launch_bounds given"); + static constexpr bool launch_bounds_is_defaulted = false; + using launch_bounds = Kokkos::LaunchBounds<MaxT, MinB>; +}; + +// </editor-fold> end AnalyzeExecPolicy specializations }}}1 +//============================================================================== +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_LAUNCHBOUNDSTRAIT_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3deb4a94d54ddeee0a6a0712f107d61674818668 --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp @@ -0,0 +1,208 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_OCCUPANCYCONTROLTRAIT_HPP +#define KOKKOS_KOKKOS_OCCUPANCYCONTROLTRAIT_HPP + +#include <impl/Kokkos_Error.hpp> // KOKKOS_EXPECTS macro + +#include <traits/Kokkos_PolicyTraitAdaptor.hpp> + +#include <traits/Kokkos_Traits_fwd.hpp> + +namespace Kokkos { + +namespace Experimental { + +//============================================================================== +// <editor-fold desc="Occupancy control user interface"> {{{1 + +struct MaximizeOccupancy; + +struct DesiredOccupancy { + int m_occ = 100; + explicit constexpr DesiredOccupancy(int occ) : m_occ(occ) { + KOKKOS_EXPECTS(0 <= occ && occ <= 100); + } + explicit constexpr operator int() const { return m_occ; } + constexpr int value() const { return m_occ; } + DesiredOccupancy() = default; + explicit DesiredOccupancy(MaximizeOccupancy const&) : DesiredOccupancy() {} +}; + +struct MaximizeOccupancy { + explicit MaximizeOccupancy() = default; +}; + +// </editor-fold> end Occupancy control user interface }}}1 +//============================================================================== + +} // end namespace Experimental + +namespace Impl { + +//============================================================================== +// <editor-fold desc="Occupancy control trait specification"> {{{1 + +struct OccupancyControlTrait : TraitSpecificationBase<OccupancyControlTrait> { + struct base_traits { + using occupancy_control = Kokkos::Experimental::MaximizeOccupancy; + static constexpr bool experimental_contains_desired_occupancy = false; + // Default access occupancy_control, for when it is the (stateless) default + static constexpr occupancy_control impl_get_occupancy_control() { + return occupancy_control{}; + } + }; + template <class T> + using trait_matches_specification = std::integral_constant< + bool, + std::is_same<T, Kokkos::Experimental::DesiredOccupancy>::value || 
+ std::is_same<T, Kokkos::Experimental::MaximizeOccupancy>::value>; +}; + +// </editor-fold> end Occupancy control trait specification }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1 + +// The DesiredOccupancy case has runtime storage, so we need to handle copies +// and assignments +template <class... Traits> +struct AnalyzeExecPolicy<void, Kokkos::Experimental::DesiredOccupancy, + Traits...> : AnalyzeExecPolicy<void, Traits...> { + public: + using base_t = AnalyzeExecPolicy<void, Traits...>; + using occupancy_control = Kokkos::Experimental::DesiredOccupancy; + static constexpr bool experimental_contains_desired_occupancy = true; + + template <class OccControl> + using with_occupancy_control = AnalyzeExecPolicy<void, OccControl, Traits...>; + + // Treat this as private, but make it public so that MSVC will still treat + // this as a standard layout class and make it the right size: storage for a + // stateful desired occupancy + // private: + occupancy_control m_desired_occupancy; + + AnalyzeExecPolicy() = default; + // Converting constructor + // Just rely on the convertibility of occupancy_control to transfer the data + template <class Other> + AnalyzeExecPolicy(ExecPolicyTraitsWithDefaults<Other> const& other) + : base_t(other), + m_desired_occupancy(other.impl_get_occupancy_control()) {} + + // Converting assignment operator + // Just rely on the convertibility of occupancy_control to transfer the data + template <class Other> + AnalyzeExecPolicy& operator=( + ExecPolicyTraitsWithDefaults<Other> const& other) { + *static_cast<base_t*>(this) = other; + this->impl_set_desired_occupancy( + occupancy_control{other.impl_get_occupancy_control()}); + return *this; + } + + // Access to occupancy control instance, usable in generic context + constexpr occupancy_control 
impl_get_occupancy_control() const { + return m_desired_occupancy; + } + + // Access to desired occupancy (getter and setter) + Kokkos::Experimental::DesiredOccupancy impl_get_desired_occupancy() const { + return m_desired_occupancy; + } + + void impl_set_desired_occupancy(occupancy_control desired_occupancy) { + m_desired_occupancy = desired_occupancy; + } +}; + +template <class... Traits> +struct AnalyzeExecPolicy<void, Kokkos::Experimental::MaximizeOccupancy, + Traits...> : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + using occupancy_control = Kokkos::Experimental::MaximizeOccupancy; + static constexpr bool experimental_contains_desired_occupancy = false; +}; + +// </editor-fold> end AnalyzeExecPolicy specializations }}}1 +//============================================================================== + +} // end namespace Impl + +namespace Experimental { + +//============================================================================== +// <editor-fold desc="User interface"> {{{1 + +template <typename Policy> +auto prefer(Policy const& p, DesiredOccupancy occ) { + using new_policy_t = + Kokkos::Impl::OccupancyControlTrait::policy_with_trait<Policy, + DesiredOccupancy>; + new_policy_t pwo{p}; + pwo.impl_set_desired_occupancy(occ); + return pwo; +} + +template <typename Policy> +constexpr auto prefer(Policy const& p, MaximizeOccupancy) { + static_assert(Kokkos::is_execution_policy<Policy>::value, ""); + using new_policy_t = + Kokkos::Impl::OccupancyControlTrait::policy_with_trait<Policy, + MaximizeOccupancy>; + return new_policy_t{p}; +} + +// </editor-fold> end User interface }}}1 +//============================================================================== + +} // end namespace Experimental + +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_OCCUPANCYCONTROLTRAIT_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp 
b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b087dac85559bd6dc67c983bdaad1a6675cfde9b --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp @@ -0,0 +1,156 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <impl/Kokkos_Utilities.hpp> // type_list + +#include <traits/Kokkos_Traits_fwd.hpp> + +#ifndef KOKKOS_KOKKOS_POLICYTRAITADAPTOR_HPP +#define KOKKOS_KOKKOS_POLICYTRAITADAPTOR_HPP + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="Adapter for replacing/adding a trait"> {{{1 + +//------------------------------------------------------------------------------ + +// General strategy: given a TraitSpecification, go through the entries in the +// parameter pack of the policy template and find the first one that returns +// `true` for the nested `trait_matches_specification` variable template. If +// that nested variable template is not found these overloads should be safely +// ignored, and the trait can specialize PolicyTraitAdapterImpl to get the +// desired behavior. 
+ +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// <editor-fold desc="PolicyTraitMatcher"> {{{2 + +// To handle the WorkTag case, we need more than just a predicate; we need +// something that we can default to in the unspecialized case, just like we +// do for AnalyzeExecPolicy +template <class TraitSpec, class Trait, class Enable = void> +struct PolicyTraitMatcher; + +template <class TraitSpec, class Trait> +struct PolicyTraitMatcher< + TraitSpec, Trait, + std::enable_if_t< + TraitSpec::template trait_matches_specification<Trait>::value>> + : std::true_type {}; + +// </editor-fold> end PolicyTraitMatcher }}}2 +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// <editor-fold desc="PolicyTraitAdaptorImpl specializations"> {{{2 + +// Matching version, replace the trait +template <class TraitSpec, template <class...> class PolicyTemplate, + class... ProcessedTraits, class MatchingTrait, + class... ToProcessTraits, class NewTrait> +struct PolicyTraitAdaptorImpl< + TraitSpec, PolicyTemplate, type_list<ProcessedTraits...>, + type_list<MatchingTrait, ToProcessTraits...>, NewTrait, + std::enable_if_t<PolicyTraitMatcher<TraitSpec, MatchingTrait>::value>> { + static_assert(PolicyTraitMatcher<TraitSpec, NewTrait>::value, ""); + using type = PolicyTemplate<ProcessedTraits..., NewTrait, ToProcessTraits...>; +}; + +// Non-matching version, check the next option +template <class TraitSpec, template <class...> class PolicyTemplate, + class... ProcessedTraits, class NonMatchingTrait, + class... 
ToProcessTraits, class NewTrait> +struct PolicyTraitAdaptorImpl< + TraitSpec, PolicyTemplate, type_list<ProcessedTraits...>, + type_list<NonMatchingTrait, ToProcessTraits...>, NewTrait, + std::enable_if_t<!PolicyTraitMatcher<TraitSpec, NonMatchingTrait>::value>> { + using type = typename PolicyTraitAdaptorImpl< + TraitSpec, PolicyTemplate, + type_list<ProcessedTraits..., NonMatchingTrait>, + type_list<ToProcessTraits...>, NewTrait>::type; +}; + +// Base case: no matches found; just add the trait to the end of the list +template <class TraitSpec, template <class...> class PolicyTemplate, + class... ProcessedTraits, class NewTrait> +struct PolicyTraitAdaptorImpl<TraitSpec, PolicyTemplate, + type_list<ProcessedTraits...>, type_list<>, + NewTrait> { + static_assert(PolicyTraitMatcher<TraitSpec, NewTrait>::value, ""); + using type = PolicyTemplate<ProcessedTraits..., NewTrait>; +}; + +// </editor-fold> end PolicyTraitAdaptorImpl specializations }}}2 +//------------------------------------------------------------------------------ + +template <class TraitSpec, template <class...> class PolicyTemplate, + class... 
Traits, class NewTrait> +struct PolicyTraitAdaptor<TraitSpec, PolicyTemplate<Traits...>, NewTrait> + : PolicyTraitAdaptorImpl<TraitSpec, PolicyTemplate, type_list<>, + type_list<Traits...>, NewTrait> {}; + +// </editor-fold> end Adapter for replacing/adding a trait }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="CRTP Base class for trait specifications"> {{{1 + +template <class TraitSpec> +struct TraitSpecificationBase { + using trait_specification = TraitSpec; + template <class Policy, class Trait> + using policy_with_trait = + typename PolicyTraitAdaptor<TraitSpec, Policy, Trait>::type; +}; + +// </editor-fold> end CRTP Base class for trait specifications }}}1 +//============================================================================== + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_POLICYTRAITADAPTOR_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..74bab6fce2a632269a804971af3e50348e34c8b2 --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp @@ -0,0 +1,112 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_SCHEDULETRAIT_HPP +#define KOKKOS_KOKKOS_SCHEDULETRAIT_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Concepts.hpp> // is_schedule_type, Schedule +#include <traits/Kokkos_PolicyTraitAdaptor.hpp> +#include <traits/Kokkos_Traits_fwd.hpp> + +namespace Kokkos { + +namespace Impl { + +//============================================================================== +// <editor-fold desc="trait specification"> {{{1 + +struct ScheduleTrait : TraitSpecificationBase<ScheduleTrait> { + struct base_traits { + static constexpr auto schedule_type_is_defaulted = true; + + using schedule_type = Schedule<Static>; + }; + template <class T> + using trait_matches_specification = is_schedule_type<T>; +}; + +// </editor-fold> end trait specification }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1 + +template <class ScheduleType, class... 
Traits> +struct AnalyzeExecPolicy<void, Kokkos::Schedule<ScheduleType>, Traits...> + : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + static_assert(base_t::schedule_type_is_defaulted, + "Kokkos Error: More than one schedule type given"); + static constexpr bool schedule_type_is_defaulted = false; + using schedule_type = Kokkos::Schedule<ScheduleType>; +}; + +// </editor-fold> end AnalyzeExecPolicy specializations }}}1 +//============================================================================== + +} // end namespace Impl + +namespace Experimental { + +//============================================================================== +// <editor-fold desc="User interface"> {{{1 + +template <class Policy, class ScheduleType> +constexpr auto require(Policy const& p, Kokkos::Schedule<ScheduleType>) { + static_assert(Kokkos::is_execution_policy<Policy>::value, ""); + using new_policy_t = Kokkos::Impl::ScheduleTrait::policy_with_trait< + Policy, Kokkos::Schedule<ScheduleType>>; + return new_policy_t{p}; +} + +// </editor-fold> end User interface }}}1 +//============================================================================== + +} // end namespace Experimental + +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_SCHEDULETRAIT_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp b/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b8b9a0ca2d889b08116528803d0c1b096060ecad --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp @@ -0,0 +1,73 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_TRAITS_FWD_HPP +#define KOKKOS_KOKKOS_TRAITS_FWD_HPP + +namespace Kokkos { +namespace Impl { + +template <class Enable, class... 
TraitsList> +struct AnalyzeExecPolicy; + +template <class AnalysisResults> +struct ExecPolicyTraitsWithDefaults; + +template <class TraitSpec, template <class...> class PolicyTemplate, + class AlreadyProcessedList, class ToProcessList, class NewTrait, + class Enable = void> +struct PolicyTraitAdaptorImpl; + +template <class TraitSpec, class Policy, class NewTrait> +struct PolicyTraitAdaptor; + +// A tag class for dependent defaults that must be handled by the +// ExecPolicyTraitsWithDefaults wrapper, since their defaults depend on other +// traits +struct dependent_policy_trait_default; + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_TRAITS_FWD_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2656316fb934333655d0370f4dc3d40eea7bbb86 --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp @@ -0,0 +1,114 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_WORKITEMPROPERTYTRAIT_HPP +#define KOKKOS_KOKKOS_WORKITEMPROPERTYTRAIT_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Concepts.hpp> // WorkItemProperty +#include <traits/Kokkos_PolicyTraitAdaptor.hpp> +#include <traits/Kokkos_Traits_fwd.hpp> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="trait specification"> {{{1 + +struct WorkItemPropertyTrait : TraitSpecificationBase<WorkItemPropertyTrait> { + struct base_traits { + using work_item_property = Kokkos::Experimental::WorkItemProperty::None_t; + }; + template <class T> + using trait_matches_specification = + Kokkos::Experimental::is_work_item_property<T>; +}; + +// </editor-fold> end trait specification }}}1 +//============================================================================== + 
+//============================================================================== +// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1 + +template <class Property, class... Traits> +struct AnalyzeExecPolicy< + std::enable_if_t< + Kokkos::Experimental::is_work_item_property<Property>::value>, + Property, Traits...> : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + static_assert( + std::is_same<typename base_t::work_item_property, + Kokkos::Experimental::WorkItemProperty::None_t>::value, + "Kokkos Error: More than one work item property given"); + using work_item_property = Property; +}; + +// </editor-fold> end AnalyzeExecPolicy specializations }}}1 +//============================================================================== + +} // end namespace Impl + +namespace Experimental { + +//============================================================================== +// <editor-fold desc="User interface"> {{{1 + +template <class Policy, unsigned long Property> +constexpr auto require(const Policy p, + WorkItemProperty::ImplWorkItemProperty<Property>) { + static_assert(Kokkos::is_execution_policy<Policy>::value, ""); + using new_policy_t = Kokkos::Impl::WorkItemPropertyTrait::policy_with_trait< + Policy, WorkItemProperty::ImplWorkItemProperty<Property>>; + return new_policy_t{p}; +} + +// </editor-fold> end User interface }}}1 +//============================================================================== + +} // namespace Experimental + +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_WORKITEMPROPERTYTRAIT_HPP diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp new file mode 100644 index 0000000000000000000000000000000000000000..877005756a703b067c07c6f57c3fc4212f7484ca --- /dev/null +++ b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp @@ -0,0 +1,124 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_KOKKOS_WORKTAGTRAIT_HPP +#define KOKKOS_KOKKOS_WORKTAGTRAIT_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Concepts.hpp> // is_execution_space +#include <traits/Kokkos_PolicyTraitAdaptor.hpp> +#include <traits/Kokkos_Traits_fwd.hpp> + +namespace Kokkos { +namespace Impl { + +//============================================================================== +// <editor-fold desc="trait specification"> {{{1 + +struct WorkTagTrait : TraitSpecificationBase<WorkTagTrait> { + struct base_traits { + using work_tag = void; + }; +}; + +// </editor-fold> end trait specification }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1 + +// Since we don't have subsumption in pre-C++20, we need to have the work tag +// "trait" handling code be unspecialized, so we handle it instead in a class +// with a different name. +template <class... Traits> +struct AnalyzeExecPolicyHandleWorkTag : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; +}; + +template <class WorkTag, class... Traits> +struct AnalyzeExecPolicyHandleWorkTag<WorkTag, Traits...> + : AnalyzeExecPolicy<void, Traits...> { + using base_t = AnalyzeExecPolicy<void, Traits...>; + using base_t::base_t; + static_assert(std::is_void<typename base_t::work_tag>::value, + "Kokkos Error: More than one work tag given"); + using work_tag = WorkTag; +}; + +// This only works if this is not a partial specialization, so we have to +// do the partial specialization elsewhere +template <class Enable, class... 
Traits> +struct AnalyzeExecPolicy : AnalyzeExecPolicyHandleWorkTag<Traits...> { + using base_t = AnalyzeExecPolicyHandleWorkTag<Traits...>; + using base_t::base_t; +}; + +// </editor-fold> end AnalyzeExecPolicy specializations }}}1 +//============================================================================== + +//============================================================================== +// <editor-fold desc="PolicyTraitMatcher specializations"> {{{1 + +// In order to match the work tag trait the work tag "matcher" needs to be +// unspecialized and the logic needs to be handled in a differently-named class, +// just like above. +template <class TraitSpec, class Trait> +struct PolicyTraitMatcherHandleWorkTag : std::false_type {}; + +template <class Trait> +struct PolicyTraitMatcherHandleWorkTag<WorkTagTrait, Trait> + : std::integral_constant<bool, !std::is_void<Trait>::value> {}; + +template <class TraitSpec, class Trait, class Enable> +struct PolicyTraitMatcher /* unspecialized! 
*/ + : PolicyTraitMatcherHandleWorkTag<TraitSpec, Trait> {}; + +// </editor-fold> end PolicyTraitMatcher specializations }}}1 +//============================================================================== + +} // end namespace Impl +} // end namespace Kokkos + +#endif // KOKKOS_KOKKOS_WORKTAGTRAIT_HPP diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5826208851090933ee296988287a6a633eb2c476 --- /dev/null +++ b/packages/kokkos/core/unit_test/CMakeLists.txt @@ -0,0 +1,866 @@ +# +# Add test-only library for gtest to be reused by all the subpackages +# + + +SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest) + +#need here for tribits +KOKKOS_INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR}) +KOKKOS_ADD_TEST_LIBRARY( + kokkos_gtest + HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h + SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc +) + +# avoid deprecation warnings from MSVC +TARGET_COMPILE_DEFINITIONS(kokkos_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0) + +TARGET_INCLUDE_DIRECTORIES(kokkos_gtest PUBLIC ${GTEST_SOURCE_DIR}) +IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + TARGET_COMPILE_FEATURES(kokkos_gtest PUBLIC cxx_std_14) +ENDIF() + +# Suppress clang-tidy diagnostics on code that we do not have control over +IF(CMAKE_CXX_CLANG_TIDY) + SET_TARGET_PROPERTIES(kokkos_gtest PROPERTIES CXX_CLANG_TIDY "") +ENDIF() + +# +# Define Incremental Testing Feature Levels +# Define Device name mappings (i.e. 
what comes after Kokkos:: for the ExecSpace) +# + +SET(KOKKOS_CUDA_FEATURE_LEVEL 999) +SET(KOKKOS_CUDA_NAME Cuda) +SET(KOKKOS_HIP_FEATURE_LEVEL 999) +SET(KOKKOS_HIP_NAME Experimental::HIP) +SET(KOKKOS_HPX_FEATURE_LEVEL 999) +SET(KOKKOS_HPX_NAME Experimental::HPX) +SET(KOKKOS_OPENMP_FEATURE_LEVEL 999) +SET(KOKKOS_OPENMP_NAME OpenMP) + +# FIXME_OPENMPTARGET - The NVIDIA HPC compiler nvc++ only compiles the first 8 incremental tests for the OpenMPTarget backend. +IF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 8) +ELSE() + SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 13) +ENDIF() + +SET(KOKKOS_OPENMPTARGET_NAME Experimental::OpenMPTarget) +SET(KOKKOS_SERIAL_FEATURE_LEVEL 999) +SET(KOKKOS_SERIAL_NAME Serial) +SET(KOKKOS_SYCL_FEATURE_LEVEL 999) +SET(KOKKOS_SYCL_NAME Experimental::SYCL) +SET(KOKKOS_THREADS_FEATURE_LEVEL 999) +SET(KOKKOS_THREADS_NAME Threads) + + +# +# Define the tests +# + +#I will leave these alone for now because I don't need transitive dependencies on tests +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) + +foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL) + # Because there is always an exception to the rule + if(Tag STREQUAL "Threads") + set(DEVICE "PTHREAD") + else() + string(TOUPPER ${Tag} DEVICE) + endif() + string(TOLOWER ${Tag} dir) + + if(Kokkos_ENABLE_${DEVICE}) + set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) + file(MAKE_DIRECTORY ${dir}) + # Needed to split this for Windows NVCC, since it ends up putting everything on the + # command line in an intermediate compilation step even if CMake generated a response + # file. That then exceeded the shell command line max length. 
+ set(${Tag}_SOURCES1A) + foreach(Name + AtomicOperations_int + AtomicOperations_unsignedint + AtomicOperations_longint + AtomicOperations_unsignedlongint + AtomicOperations_longlongint + AtomicOperations_double + AtomicOperations_float + AtomicOperations_complexdouble + AtomicOperations_complexfloat + AtomicViews + Atomics + BlockSizeDeduction + Concepts + Complex + Crs + DeepCopyAlignment + FunctorAnalysis + Init + LocalDeepCopy + MathematicalFunctions + MDRange_a + MDRange_b + MDRange_c + HostSharedPtr + HostSharedPtrAccessOnDevice + ) + set(file ${dir}/Test${Tag}_${Name}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. + file(WRITE ${dir}/dummy.cpp + "#include <Test${Tag}_Category.hpp>\n" + "#include <Test${Name}.hpp>\n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ${Tag}_SOURCES1A ${file}) + endforeach() + + set(${Tag}_SOURCES1B) + foreach(Name + MDRange_d + MDRange_e + MDRange_f + NumericTraits + Other + RangePolicy + RangePolicyRequire + Reductions + Reducers_a + Reducers_b + Reducers_c + Reducers_d + Reductions_DeviceView + Scan + SharedAlloc + ViewMapping_a + ) + set(file ${dir}/Test${Tag}_${Name}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
+ file(WRITE ${dir}/dummy.cpp + "#include <Test${Tag}_Category.hpp>\n" + "#include <Test${Name}.hpp>\n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ${Tag}_SOURCES1B ${file}) + endforeach() + + SET(${Tag}_SOURCES2A) + foreach(Name + TeamBasic + TeamReductionScan + TeamScan + TeamScratch + TeamTeamSize + TeamVectorRange + UniqueToken + ViewAPI_a + ViewAPI_b + ViewAPI_c + ViewAPI_d + ViewAPI_e + ViewCopy_a + ViewCopy_b + ViewLayoutStrideAssignment + ViewMapping_b + ViewMapping_subview + ViewOfClass + ViewResize + View_64bit + WorkGraph + ) + set(file ${dir}/Test${Tag}_${Name}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. + file(WRITE ${dir}/dummy.cpp + "#include <Test${Tag}_Category.hpp>\n" + "#include <Test${Name}.hpp>\n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ${Tag}_SOURCES2A ${file}) + endforeach() + + set(TagHostAccessible ${Tag}) + if (Tag STREQUAL "Cuda") + set(TagHostAccessible CudaUVM) + elseif(Tag STREQUAL "HIP") + set(TagHostAccessible HIPHostPinned) + elseif(Tag STREQUAL "SYCL") + set(TagHostAccessible SYCLSharedUSMSpace) + endif() + + set(${Tag}_SOURCES2B) + foreach(Name + SubView_a + SubView_b + SubView_c01 + SubView_c02 + SubView_c03 + SubView_c04 + SubView_c05 + ) + set(file ${dir}/Test${Tag}_${Name}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
+ file(WRITE ${dir}/dummy.cpp + "#include <Test${TagHostAccessible}_Category.hpp>\n" + "#include <Test${Name}.hpp>\n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ${Tag}_SOURCES2B ${file}) + endforeach() + + set(${Tag}_SOURCES2C) + foreach(Name + SubView_c06 + SubView_c07 + SubView_c08 + SubView_c09 + ) + set(file ${dir}/Test${Tag}_${Name}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. + file(WRITE ${dir}/dummy.cpp + "#include <Test${TagHostAccessible}_Category.hpp>\n" + "#include <Test${Name}.hpp>\n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ${Tag}_SOURCES2C ${file}) + endforeach() + + set(${Tag}_SOURCES2D) + foreach(Name + SubView_c10 + SubView_c11 + SubView_c12 + SubView_c13 + SubView_c14 + ) + set(file ${dir}/Test${Tag}_${Name}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
+ file(WRITE ${dir}/dummy.cpp + "#include <Test${TagHostAccessible}_Category.hpp>\n" + "#include <Test${Name}.hpp>\n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ${Tag}_SOURCES2D ${file}) + endforeach() + + SET(${Tag}_SOURCES1 ${${Tag}_SOURCES1A} ${${Tag}_SOURCES1B}) + SET(${Tag}_SOURCES2 ${${Tag}_SOURCES2A} ${${Tag}_SOURCES2B} ${${Tag}_SOURCES2C} ${${Tag}_SOURCES2D}) + SET(${Tag}_SOURCES ${${Tag}_SOURCES1} ${${Tag}_SOURCES2}) + endif() +endforeach() + +if(Kokkos_ENABLE_OPENMPTARGET) + list(REMOVE_ITEM OpenMPTarget_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_complexfloat.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_complexdouble.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Crs.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewCopy_a.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewCopy_b.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewMapping_subview.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewOfClass.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp + ) +endif() + +# FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++ +IF(KOKKOS_ENABLE_OPENMPTARGET + AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) + list(REMOVE_ITEM OpenMPTarget_SOURCES + 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtr.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtrAccessOnDevice.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScratch.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestScan.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestTeamScan.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TestTeamReductionScan.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Atomics.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_float.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_int.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_longint.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_longlongint.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_double.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedint.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_unsignedlongint.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicViews.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_BlockSizeDeduction.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_a.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_b.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_c.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp + 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Scan.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_NumericTraits.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_DeepCopyAlignment.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MathematicalFunctions.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_b.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c04.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c05.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c06.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c07.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c08.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c09.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c10.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c11.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c12.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c13.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_a.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_b.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_c.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MDRange_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_a.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_b.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_c.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_d.cpp + 
${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_f.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewResize.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicyRequire.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_RangePolicy.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp + ) +endif() + +if(Kokkos_ENABLE_SERIAL) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_Serial1 + SOURCES + UnitTestMainInit.cpp + ${Serial_SOURCES1} + serial/TestSerial_Task.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_Serial2 + SOURCES + UnitTestMainInit.cpp + ${Serial_SOURCES2} + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SerialGraph + SOURCES + UnitTestMainInit.cpp + serial/TestSerial_Graph.cpp + ) +endif() + +if(Kokkos_ENABLE_PTHREAD) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_Threads + SOURCES ${Threads_SOURCES} + UnitTestMainInit.cpp + ) +endif() + +if(Kokkos_ENABLE_OPENMP) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_OpenMP + SOURCES + UnitTestMainInit.cpp + ${OpenMP_SOURCES} + openmp/TestOpenMP_PartitionMaster.cpp + openmp/TestOpenMP_Task.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_OpenMPInterOp + SOURCES + UnitTestMain.cpp + openmp/TestOpenMP_InterOp.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_OpenMPGraph + SOURCES + UnitTestMainInit.cpp + openmp/TestOpenMP_Graph.cpp + ) +endif() + +if(Kokkos_ENABLE_HPX) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HPX + SOURCES + UnitTestMainInit.cpp + ${HPX_SOURCES} + hpx/TestHPX_Task.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HPXInterOp + SOURCES + UnitTestMain.cpp + hpx/TestHPX_InterOp.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HPX_IndependentInstances + SOURCES + UnitTestMain.cpp + hpx/TestHPX_IndependentInstances.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HPX_IndependentInstancesDelayedExecution + SOURCES + UnitTestMain.cpp + 
hpx/TestHPX_IndependentInstancesDelayedExecution.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HPX_IndependentInstancesInstanceIds + SOURCES + UnitTestMain.cpp + hpx/TestHPX_IndependentInstancesInstanceIds.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HPX_IndependentInstancesRefCounting + SOURCES + UnitTestMain.cpp + hpx/TestHPX_IndependentInstancesRefCounting.cpp + ) +endif() + +if(Kokkos_ENABLE_OPENMPTARGET) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_OpenMPTarget + SOURCES + UnitTestMainInit.cpp + ${OpenMPTarget_SOURCES} + ) +endif() + +if(Kokkos_ENABLE_CUDA) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_Cuda1 + SOURCES + UnitTestMainInit.cpp + ${Cuda_SOURCES1} + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_Cuda2 + SOURCES + UnitTestMainInit.cpp + ${Cuda_SOURCES2} + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_Cuda3 + SOURCES + UnitTestMainInit.cpp + cuda/TestCuda_Task.cpp + cuda/TestCuda_TeamScratchStreams.cpp + cuda/TestCudaHostPinned_SharedAlloc.cpp + cuda/TestCudaHostPinned_ViewAPI_a.cpp + cuda/TestCudaHostPinned_ViewAPI_b.cpp + cuda/TestCudaHostPinned_ViewAPI_c.cpp + cuda/TestCudaHostPinned_ViewAPI_d.cpp + cuda/TestCudaHostPinned_ViewAPI_e.cpp + cuda/TestCudaHostPinned_ViewCopy_a.cpp + cuda/TestCudaHostPinned_ViewCopy_b.cpp + cuda/TestCudaHostPinned_ViewMapping_a.cpp + cuda/TestCudaHostPinned_ViewMapping_b.cpp + cuda/TestCudaHostPinned_ViewMapping_subview.cpp + cuda/TestCudaUVM_SharedAlloc.cpp + cuda/TestCudaUVM_ViewAPI_a.cpp + cuda/TestCudaUVM_ViewAPI_b.cpp + cuda/TestCudaUVM_ViewAPI_c.cpp + cuda/TestCudaUVM_ViewAPI_d.cpp + cuda/TestCudaUVM_ViewAPI_e.cpp + cuda/TestCudaUVM_ViewCopy_a.cpp + cuda/TestCudaUVM_ViewCopy_b.cpp + cuda/TestCudaUVM_ViewMapping_a.cpp + cuda/TestCudaUVM_ViewMapping_b.cpp + cuda/TestCudaUVM_ViewMapping_subview.cpp + cuda/TestCuda_Spaces.cpp + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_CudaTimingBased + SOURCES + UnitTestMainInit.cpp + cuda/TestCuda_DebugSerialExecution.cpp + 
cuda/TestCuda_DebugPinUVMSpace.cpp + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_CudaInterOpInit + SOURCES + UnitTestMain.cpp + cuda/TestCuda_InterOp_Init.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_CudaInterOpStreams + SOURCES + UnitTestMain.cpp + cuda/TestCuda_InterOp_Streams.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_CudaGraph + SOURCES + UnitTestMainInit.cpp + cuda/TestCuda_Graph.cpp + ) +endif() + +if(Kokkos_ENABLE_HIP) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HIP + SOURCES + UnitTestMainInit.cpp + ${HIP_SOURCES} + hip/TestHIP_ScanUnit.cpp + hip/TestHIP_TeamScratchStreams.cpp + hip/TestHIPHostPinned_ViewAPI_a.cpp + hip/TestHIPHostPinned_ViewAPI_b.cpp + hip/TestHIPHostPinned_ViewAPI_c.cpp + hip/TestHIPHostPinned_ViewAPI_d.cpp + hip/TestHIPHostPinned_ViewAPI_e.cpp + hip/TestHIPHostPinned_ViewCopy_a.cpp + hip/TestHIPHostPinned_ViewCopy_b.cpp + hip/TestHIPHostPinned_ViewMapping_a.cpp + hip/TestHIPHostPinned_ViewMapping_b.cpp + hip/TestHIPHostPinned_ViewMapping_subview.cpp + hip/TestHIP_AsyncLauncher.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HIPInterOpInit + SOURCES + UnitTestMain.cpp + hip/TestHIP_InterOp_Init.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HIPInterOpStreams + SOURCES + UnitTestMain.cpp + hip/TestHIP_InterOp_Streams.cpp + ) +endif() + +if(Kokkos_ENABLE_SYCL) + list(REMOVE_ITEM SYCL_SOURCES1A + # FIXME_SYCL atomic_fetch_oper for large types to be implemented + ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_AtomicOperations_complexdouble.cpp + ) + + list(REMOVE_ITEM SYCL_SOURCES2A + ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_WorkGraph.cpp + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SYCL1A + SOURCES + UnitTestMainInit.cpp + ${SYCL_SOURCES1A} + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SYCL1B + SOURCES + UnitTestMainInit.cpp + ${SYCL_SOURCES1B} + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SYCL2A + SOURCES + UnitTestMainInit.cpp + ${SYCL_SOURCES2A} + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + 
UnitTest_SYCL2B + SOURCES + UnitTestMainInit.cpp + ${SYCL_SOURCES2B} + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SYCL2C + SOURCES + UnitTestMainInit.cpp + ${SYCL_SOURCES2C} + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SYCL2D + SOURCES + UnitTestMainInit.cpp + ${SYCL_SOURCES2D} + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SYCLInterOpInit + SOURCES + UnitTestMain.cpp + sycl/TestSYCL_InterOp_Init.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SYCLInterOpInit_Context + SOURCES + UnitTestMainInit.cpp + sycl/TestSYCL_InterOp_Init_Context.cpp + ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_SYCLInterOpStreams + SOURCES + UnitTestMain.cpp + sycl/TestSYCL_InterOp_Streams.cpp + ) +endif() + +# FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++ +if (KOKKOS_ENABLE_OPENMPTARGET + AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) + SET(DEFAULT_DEVICE_SOURCES + UnitTestMainInit.cpp + default/TestDefaultDeviceType.cpp + ) +else() + SET(DEFAULT_DEVICE_SOURCES + UnitTestMainInit.cpp + default/TestDefaultDeviceType.cpp + default/TestDefaultDeviceType_a1.cpp + default/TestDefaultDeviceType_b1.cpp + default/TestDefaultDeviceType_c1.cpp + default/TestDefaultDeviceType_a2.cpp + default/TestDefaultDeviceType_b2.cpp + default/TestDefaultDeviceType_c2.cpp + default/TestDefaultDeviceType_a3.cpp + default/TestDefaultDeviceType_b3.cpp + default/TestDefaultDeviceType_c3.cpp + default/TestDefaultDeviceType_d.cpp + default/TestDefaultDeviceTypeResize.cpp + ) +endif() + +KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_Default + SOURCES ${DEFAULT_DEVICE_SOURCES} +) + +KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_PushFinalizeHook + SOURCES + UnitTest_PushFinalizeHook.cpp +) + +# This test is intended for development and debugging by putting code +# into TestDefaultDeviceDevelop.cpp. By default its empty. 
+KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_Develop + SOURCES + UnitTestMainInit.cpp + default/TestDefaultDeviceDevelop.cpp +) + +# This test is special, because it passes exactly when it prints the +# message "PASSED: I am the custom std::terminate handler.", AND calls +# std::terminate. This means that we can't use +# KOKKOS_ADD_EXECUTABLE_AND_TEST. See GitHub issue #2147. + +KOKKOS_ADD_TEST_EXECUTABLE( push_finalize_hook_terminate + SOURCES UnitTest_PushFinalizeHook_terminate.cpp +) + +KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate + TEST_0 + EXEC push_finalize_hook_terminate + NUM_MPI_PROCS 1 + PASS_REGULAR_EXPRESSION + "PASSED: I am the custom std::terminate handler." + ALWAYS_FAIL_ON_ZERO_RETURN +) + + if(KOKKOS_ENABLE_TUNING) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_TuningBasics + SOURCES + tools/TestTuning.cpp + ) + endif() + if(NOT Kokkos_ENABLE_OPENMPTARGET) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_LogicalSpaces + SOURCES + tools/TestLogicalSpaces.cpp + ) + endif() + if(KOKKOS_ENABLE_LIBDL) + + KOKKOS_ADD_TEST_LIBRARY( + kokkosprinter-tool SHARED + SOURCES tools/printing-tool.cpp + ) + + if((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + TARGET_COMPILE_FEATURES(kokkosprinter-tool PUBLIC cxx_std_14) + endif() + + KOKKOS_ADD_TEST_EXECUTABLE( + ProfilingAllCalls + tools/TestAllCalls.cpp + ) + + set(ADDRESS_REGEX "0x[0-9a-f]*") + set(MEMSPACE_REGEX "[HC][ou][sd][ta][a-zA-Z]*") + set(SIZE_REGEX "[0-9]*") + set(SKIP_SCRATCH_INITIALIZATION_REGEX ".*") + + # check help works via environment variable + KOKKOS_ADD_TEST( + SKIP_TRIBITS + NAME ProfilingTestLibraryLoadHelp + EXE ProfilingAllCalls + TOOL kokkosprinter-tool + ARGS --kokkos-tools-help + PASS_REGULAR_EXPRESSION + "kokkosp_init_library::kokkosp_print_help:KokkosCore_ProfilingAllCalls::kokkosp_finalize_library::") + + # check help works via direct library specification + KOKKOS_ADD_TEST( + SKIP_TRIBITS + NAME 
ProfilingTestLibraryCmdLineHelp + EXE ProfilingAllCalls + ARGS --kokkos-tools-help + --kokkos-tools-library=$<TARGET_FILE:kokkosprinter-tool> + PASS_REGULAR_EXPRESSION + "kokkosp_init_library::kokkosp_print_help:KokkosCore_ProfilingAllCalls::kokkosp_finalize_library::") + + KOKKOS_ADD_TEST( + SKIP_TRIBITS + NAME ProfilingTestLibraryLoad + EXE ProfilingAllCalls + TOOL kokkosprinter-tool + ARGS --kokkos-tools-args="-c test delimit" + PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" + ) + + # Above will test that leading/trailing quotes are stripped bc ctest cmd args is: + # "--kokkos-tools-args="-c test delimit"" + # The bracket argument syntax: [=[ and ]=] used below 
ensures it is treated as + # a single argument: + # "--kokkos-tools-args=-c test delimit" + # + # https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument + # + KOKKOS_ADD_TEST( + SKIP_TRIBITS + NAME ProfilingTestLibraryCmdLine + EXE ProfilingAllCalls + ARGS [=[--kokkos-tools-args=-c test delimit]=] + --kokkos-tools-library=$<TARGET_FILE:kokkosprinter-tool> + PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::" + ) + endif() #KOKKOS_ENABLE_LIBDL +if(NOT KOKKOS_HAS_TRILINOS) +KOKKOS_ADD_TEST_EXECUTABLE( + StackTraceTestExec + SOURCES + TestStackTrace.cpp + TestStackTrace_f0.cpp + TestStackTrace_f1.cpp + TestStackTrace_f2.cpp + 
TestStackTrace_f3.cpp + TestStackTrace_f4.cpp +) +# We need -rdynamic on GNU platforms for the stacktrace functionality +# to work correctly with shared libraries +KOKKOS_SET_EXE_PROPERTY(StackTraceTestExec ENABLE_EXPORTS ON) + +KOKKOS_ADD_TEST( NAME UnitTest_StackTraceTest + EXE StackTraceTestExec + FAIL_REGULAR_EXPRESSION "FAILED" + ) +endif() + +foreach(INITTESTS_NUM RANGE 1 18) +KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_DefaultInit_${INITTESTS_NUM} + SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp +) +endforeach(INITTESTS_NUM) + +if (KOKKOS_ENABLE_HWLOC) +KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_HWLOC + SOURCES UnitTestMain.cpp TestHWLOC.cpp +) +endif() + +FUNCTION (KOKKOS_ADD_INCREMENTAL_TEST DEVICE) + KOKKOS_OPTION( ${DEVICE}_EXCLUDE_TESTS "" STRING "Incremental test exclude list" ) + # Add unit test main + SET(${DEVICE}_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/UnitTestMainInit.cpp) + + # Iterate over incremental tests in directory + + APPEND_GLOB(INCREMENTAL_FILE_LIST ${CMAKE_CURRENT_SOURCE_DIR}/incremental/*.hpp) + + SET(DEVICE_NAME ${KOKKOS_${DEVICE}_NAME}) + FOREACH (CURRENT_FILE_PATH ${INCREMENTAL_FILE_LIST}) + GET_FILENAME_COMPONENT( CURRENT_FILE_NAME ${CURRENT_FILE_PATH} NAME ) + STRING (REPLACE ".hpp" "" CURRENT_TEST_NAME ${CURRENT_FILE_NAME}) + IF (NOT CURRENT_TEST_NAME IN_LIST Kokkos_${DEVICE}_EXCLUDE_TESTS) + SET (CURRENT_TEST_OUTPUT_FILENAME ${CURRENT_TEST_NAME}_${DEVICE}) + FILE( STRINGS ${CURRENT_FILE_PATH} CURRENT_REQUIRED_FEATURE_LINE REGEX "Kokkos_Feature_Level_Required" ) + # From each test get level implementation required + STRING( REGEX REPLACE ".*Kokkos_Feature_Level_Required:" "" CURRENT_REQUIRED_FEATURE_LEVEL ${CURRENT_REQUIRED_FEATURE_LINE} ) + # Cross-reference list of dependencies with selected feature list > matching feature test files are added to test applications + IF (KOKKOS_${DEVICE}_FEATURE_LEVEL GREATER_EQUAL CURRENT_REQUIRED_FEATURE_LEVEL) + CONFIGURE_FILE (IncrementalTest.cpp.in 
${CMAKE_BINARY_DIR}/core/unit_test/generated/${CURRENT_TEST_OUTPUT_FILENAME}.cpp ) + SET(${DEVICE}_SOURCES ${${DEVICE}_SOURCES}; ${CMAKE_BINARY_DIR}/core/unit_test/generated/${CURRENT_TEST_OUTPUT_FILENAME}.cpp) + ENDIF() + ENDIF() + ENDFOREACH() + + STRING(TOUPPER ${DEVICE} UC_DEVICE) + + KOKKOS_OPTION ( + ENABLE_${UC_DEVICE} ON BOOL "ENABLE ${UC_DEVICE}" + ) + + KOKKOS_ADD_EXECUTABLE_AND_TEST( + IncrementalTest_${DEVICE} + SOURCES ${${DEVICE}_SOURCES} + ) + + SET(EXE_NAME ${PACKAGE_NAME}_IncrementalTest_${DEVICE}) + # Check that the target was actually created because in a TribITS build + # where only tests marked as PERFORMANCE enabled it would not be. + IF(TARGET ${EXE_NAME}) + TARGET_INCLUDE_DIRECTORIES(${EXE_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/incremental ) + ENDIF() + +ENDFUNCTION() + +FOREACH (DEVICE ${KOKKOS_ENABLED_DEVICES}) + KOKKOS_ADD_INCREMENTAL_TEST(${DEVICE}) +ENDFOREACH() + +KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_CTestDevice + SOURCES UnitTestMain.cpp TestCTestDevice.cpp +) + +KOKKOS_ADD_EXECUTABLE_AND_TEST( + UnitTest_CMakePassCmdLineArgs + SOURCES UnitTest_CMakePassCmdLineArgs.cpp + ARGS "one 2 THREE" +) + +if (KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS AND NOT KOKKOS_HAS_TRILINOS) + add_subdirectory(headers_self_contained) +endif() diff --git a/packages/kokkos/core/unit_test/IncrementalTest.cpp.in b/packages/kokkos/core/unit_test/IncrementalTest.cpp.in new file mode 100644 index 0000000000000000000000000000000000000000..e4358efe9dd8683e025b5a2be67a4e9bd83e7552 --- /dev/null +++ b/packages/kokkos/core/unit_test/IncrementalTest.cpp.in @@ -0,0 +1,58 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#ifndef KOKKOS_TEST_@BACK_END_NAME@_@CURRENT_TEST_NAME@ +#define KOKKOS_TEST_@BACK_END_NAME@_@CURRENT_TEST_NAME@ + +#include <gtest/gtest.h> +#include <Kokkos_Macros.hpp> + +#define TEST_CATEGORY @DEVICE@ +#define TEST_EXECSPACE Kokkos::@DEVICE_NAME@ + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#include <@CURRENT_FILE_NAME@> +#endif + +#endif + + diff --git a/packages/kokkos/core/unit_test/Makefile b/packages/kokkos/core/unit_test/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..390fc79a4755e46cbd61b28ee54d44814fa501d9 --- /dev/null +++ b/packages/kokkos/core/unit_test/Makefile @@ -0,0 +1,517 @@ +KOKKOS_PATH = ../.. + +GTEST_PATH = ../../tpls/gtest + +vpath %.cpp ${KOKKOS_PATH}/core/unit_test +vpath %.cpp ${KOKKOS_PATH}/core/unit_test/default +vpath %.cpp ${KOKKOS_PATH}/core/unit_test/serial +vpath %.cpp ${KOKKOS_PATH}/core/unit_test/threads +vpath %.cpp ${KOKKOS_PATH}/core/unit_test/openmp +vpath %.cpp ${KOKKOS_PATH}/core/unit_test/openmptarget +vpath %.cpp ${KOKKOS_PATH}/core/unit_test/hip +vpath %.cpp ${KOKKOS_PATH}/core/unit_test/hpx +vpath %.cpp ${KOKKOS_PATH}/core/unit_test/cuda + + +TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp) +TEST_HEADERS += $(wildcard $(KOKKOS_PATH)/core/unit_test/*/*.hpp) + +default: build_all + echo "End Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) + CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper +else + CXX = g++ +endif + +CXXFLAGS = -O3 +LINK ?= $(CXX) +LDFLAGS ?= +override LDFLAGS += -lpthread + +include $(KOKKOS_PATH)/Makefile.kokkos + +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/unit_test -I${KOKKOS_PATH}/core/unit_test/category_files + +TEST_TARGETS = +TARGETS = + +KOKKOS_INTERNAL_HAS_OPTIMIZATIONS := $(call kokkos_has_string,$(KOKKOS_CXXFLAGS),O3) +ifneq ($(KOKKOS_INTERNAL_HAS_OPTIMIZATIONS), 1) + KOKKOS_INTERNAL_HAS_OPTIMIZATIONS := $(call 
kokkos_has_string,$(KOKKOS_CXXFLAGS),O2) + ifneq ($(KOKKOS_INTERNAL_HAS_OPTIMIZATIONS), 1) + KOKKOS_INTERNAL_HAS_OPTIMIZATIONS := $(call kokkos_has_string,$(CXXFLAGS),O3) + ifneq ($(KOKKOS_INTERNAL_HAS_OPTIMIZATIONS), 1) + KOKKOS_INTERNAL_HAS_OPTIMIZATIONS := $(call kokkos_has_string,$(CXXFLAGS),O2) + endif + endif +endif +KOKKOS_INTERNAL_USE_RDYNAMIC := $(call kokkos_has_string,$(KOKKOS_CXXFLAGS),rdynamic) +ifneq ($(KOKKOS_INTERNAL_USE_RDYNAMIC), 1) + KOKKOS_INTERNAL_USE_RDYNAMIC := $(call kokkos_has_string,$(CXXFLAGS),rdynamic) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_RDYNAMIC),1) + ifneq ($(KOKKOS_INTERNAL_HAS_OPTIMIZATIONS),1) + STACK_TRACE_TERMINATE_FILTER :=_dynamic + else + STACK_TRACE_TERMINATE_FILTER := + endif +else + STACK_TRACE_TERMINATE_FILTER := +endif + +TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longint AtomicOperations_unsignedlongint AtomicOperations_longlongint AtomicOperations_double AtomicOperations_float AtomicOperations_complexdouble AtomicOperations_complexfloat AtomicViews Atomics BlockSizeDeduction Concepts Complex Crs DeepCopyAlignment FunctorAnalysis Init LocalDeepCopy MDRange_a MDRange_b MDRange_c MDRange_d MDRange_e MDRange_f Other RangePolicy RangePolicyRequire Reductions Reducers_a Reducers_b Reducers_c Reducers_d Reductions_DeviceView Scan SharedAlloc TeamBasic TeamReductionScan TeamScratch TeamTeamSize TeamVectorRange UniqueToken ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewLayoutStrideAssignment ViewMapping_a ViewMapping_b ViewMapping_subview ViewOfClass WorkGraph View_64bit ViewResize + +tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ + tmp2 := $(foreach test, $(TESTS), \ + $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ + $(shell echo "\#include <Test"$(device)"_Category.hpp>" > Test$(device)_$(test).cpp); \ + $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(device)_$(test).cpp); \ + ) \ + ) \ +) + +SUBVIEW_TESTS = 
SubView_a SubView_b SubView_c01 SubView_c02 SubView_c03 SubView_c04 SubView_c05 SubView_c06 SubView_c07 SubView_c08 SubView_c09 SubView_c10 SubView_c11 SubView_c12 SubView_c13 + +KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST)) + +tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ + tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ + $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),, \ + $(shell echo "\#include <Test"$(device)"_Category.hpp>" > Test$(device)_$(test).cpp); \ + $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(device)_$(test).cpp); \ + ) \ + )\ +) + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ + $(if $(filter TestCuda_$(test).cpp, $(shell ls TestCuda_$(test).cpp 2>/dev/null)),,\ + $(shell echo "\#include <TestCudaUVM_Category.hpp>" > TestCuda_$(test).cpp); \ + $(shell echo "\#include <Test"$(test)".hpp>" >> TestCuda_$(test).cpp); \ + )\ + ) + + OBJ_CUDA = UnitTestMainInit.o gtest-all.o + OBJ_CUDA += TestCuda_Init.o + OBJ_CUDA += TestCuda_SharedAlloc.o TestCudaUVM_SharedAlloc.o TestCudaHostPinned_SharedAlloc.o + OBJ_CUDA += TestCuda_RangePolicy.o TestCuda_RangePolicyRequire.o + OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o TestCuda_ViewAPI_e.o TestCuda_ViewCopy_a.o TestCuda_ViewCopy_b.o + OBJ_CUDA += TestCuda_DeepCopyAlignment.o + OBJ_CUDA += TestCuda_ViewMapping_a.o TestCuda_ViewMapping_b.o TestCuda_ViewMapping_subview.o TestCuda_ViewResize.o TestCuda_ViewLayoutStrideAssignment.o + OBJ_CUDA += TestCudaUVM_ViewAPI_a.o TestCudaUVM_ViewAPI_b.o TestCudaUVM_ViewAPI_c.o TestCudaUVM_ViewAPI_d.o TestCudaUVM_ViewAPI_e.o + OBJ_CUDA += TestCudaUVM_ViewCopy_a.o TestCudaUVM_ViewCopy_b.o + OBJ_CUDA += TestCudaUVM_ViewMapping_a.o TestCudaUVM_ViewMapping_b.o TestCudaUVM_ViewMapping_subview.o + OBJ_CUDA += TestCudaHostPinned_ViewAPI_a.o TestCudaHostPinned_ViewAPI_b.o TestCudaHostPinned_ViewAPI_c.o 
TestCudaHostPinned_ViewAPI_d.o TestCudaHostPinned_ViewAPI_e.o + OBJ_CUDA += TestCudaHostPinned_ViewCopy_a.o TestCudaHostPinned_ViewCopy_b.o + OBJ_CUDA += TestCudaHostPinned_ViewMapping_a.o TestCudaHostPinned_ViewMapping_b.o TestCudaHostPinned_ViewMapping_subview.o + OBJ_CUDA += TestCuda_View_64bit.o + OBJ_CUDA += TestCuda_ViewOfClass.o + OBJ_CUDA += TestCuda_SubView_a.o TestCuda_SubView_b.o + OBJ_CUDA += TestCuda_SubView_c01.o TestCuda_SubView_c02.o TestCuda_SubView_c03.o + OBJ_CUDA += TestCuda_SubView_c04.o TestCuda_SubView_c05.o TestCuda_SubView_c06.o + OBJ_CUDA += TestCuda_SubView_c07.o TestCuda_SubView_c08.o TestCuda_SubView_c09.o + OBJ_CUDA += TestCuda_SubView_c10.o TestCuda_SubView_c11.o TestCuda_SubView_c12.o + OBJ_CUDA += TestCuda_SubView_c13.o + OBJ_CUDA += TestCuda_Reductions.o TestCuda_Scan.o + OBJ_CUDA += TestCuda_Reductions_DeviceView.o + OBJ_CUDA += TestCuda_Reducers_a.o TestCuda_Reducers_b.o TestCuda_Reducers_c.o TestCuda_Reducers_d.o + OBJ_CUDA += TestCuda_Complex.o + OBJ_CUDA += TestCuda_AtomicOperations_int.o TestCuda_AtomicOperations_unsignedint.o TestCuda_AtomicOperations_longint.o + OBJ_CUDA += TestCuda_AtomicOperations_unsignedlongint.o TestCuda_AtomicOperations_longlongint.o TestCuda_AtomicOperations_double.o TestCuda_AtomicOperations_float.o + OBJ_CUDA += TestCuda_AtomicOperations_complexfloat.o TestCuda_AtomicOperations_complexdouble.o + OBJ_CUDA += TestCuda_AtomicViews.o TestCuda_Atomics.o + OBJ_CUDA += TestCuda_TeamBasic.o TestCuda_TeamScratch.o + OBJ_CUDA += TestCuda_TeamReductionScan.o TestCuda_TeamTeamSize.o + OBJ_CUDA += TestCuda_TeamVectorRange.o + OBJ_CUDA += TestCuda_Other.o + OBJ_CUDA += TestCuda_MDRange_a.o TestCuda_MDRange_b.o TestCuda_MDRange_c.o TestCuda_MDRange_d.o TestCuda_MDRange_e.o + OBJ_CUDA += TestCuda_Crs.o + OBJ_CUDA += TestCuda_Task.o TestCuda_WorkGraph.o + OBJ_CUDA += TestCuda_Spaces.o + OBJ_CUDA += TestCuda_UniqueToken.o + OBJ_CUDA += TestCuda_LocalDeepCopy.o + OBJ_CUDA += TestCuda_DebugSerialExecution.o + OBJ_CUDA 
+= TestCuda_DebugPinUVMSpace.o + OBJ_CUDA += TestCuda_TeamScratchStreams.o + + TARGETS += KokkosCore_UnitTest_Cuda + TARGETS += KokkosCore_UnitTest_CudaInterOpInit + TARGETS += KokkosCore_UnitTest_CudaInterOpStreams + TEST_TARGETS += test-cuda +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + OBJ_THREADS = UnitTestMainInit.o gtest-all.o + OBJ_THREADS += TestThreads_Init.o + OBJ_THREADS += TestThreads_SharedAlloc.o + OBJ_THREADS += TestThreads_RangePolicy.o TestThreads_RangePolicyRequire.o + OBJ_THREADS += TestThreads_View_64bit.o + OBJ_THREADS += TestThreads_ViewAPI_a.o TestThreads_ViewAPI_b.o TestThreads_ViewAPI_c.o TestThreads_ViewAPI_d.o TestThreads_ViewAPI_e.o + OBJ_THREADS += TestThreads_ViewCopy_a.o TestThreads_ViewCopy_b.o + OBJ_THREADS += TestThreads_DeepCopyAlignment.o + OBJ_THREADS += TestThreads_ViewMapping_a.o TestThreads_ViewMapping_b.o TestThreads_ViewMapping_subview.o TestThreads_ViewResize.o TestThreads_ViewLayoutStrideAssignment.o + OBJ_THREADS += TestThreads_ViewOfClass.o + OBJ_THREADS += TestThreads_SubView_a.o TestThreads_SubView_b.o + OBJ_THREADS += TestThreads_SubView_c01.o TestThreads_SubView_c02.o TestThreads_SubView_c03.o + OBJ_THREADS += TestThreads_SubView_c04.o TestThreads_SubView_c05.o TestThreads_SubView_c06.o + OBJ_THREADS += TestThreads_SubView_c07.o TestThreads_SubView_c08.o TestThreads_SubView_c09.o + OBJ_THREADS += TestThreads_SubView_c10.o TestThreads_SubView_c11.o TestThreads_SubView_c12.o + OBJ_THREADS += TestThreads_Reductions.o TestThreads_Scan.o + OBJ_THREADS += TestThreads_Reductions_DeviceView.o + OBJ_THREADS += TestThreads_Reducers_a.o TestThreads_Reducers_b.o TestThreads_Reducers_c.o TestThreads_Reducers_d.o + OBJ_THREADS += TestThreads_Complex.o + OBJ_THREADS += TestThreads_AtomicOperations_int.o TestThreads_AtomicOperations_unsignedint.o TestThreads_AtomicOperations_longint.o + OBJ_THREADS += TestThreads_AtomicOperations_unsignedlongint.o TestThreads_AtomicOperations_longlongint.o 
TestThreads_AtomicOperations_double.o TestThreads_AtomicOperations_float.o + OBJ_THREADS += TestThreads_AtomicOperations_complexfloat.o TestThreads_AtomicOperations_complexdouble.o + OBJ_THREADS += TestThreads_AtomicViews.o TestThreads_Atomics.o + OBJ_THREADS += TestThreads_TeamBasic.o TestThreads_TeamScratch.o TestThreads_TeamTeamSize.o + OBJ_THREADS += TestThreads_TeamReductionScan.o + OBJ_THREADS += TestThreads_TeamVectorRange.o + OBJ_THREADS += TestThreads_Other.o + OBJ_THREADS += TestThreads_MDRange_a.o TestThreads_MDRange_b.o TestThreads_MDRange_c.o TestThreads_MDRange_d.o TestThreads_MDRange_e.o + OBJ_THREADS += TestThreads_LocalDeepCopy.o + + TARGETS += KokkosCore_UnitTest_Threads + + TEST_TARGETS += test-threads +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + OBJ_OPENMP = UnitTestMainInit.o gtest-all.o + OBJ_OPENMP += TestOpenMP_Init.o + OBJ_OPENMP += TestOpenMP_SharedAlloc.o + OBJ_OPENMP += TestOpenMP_RangePolicy.o TestOpenMP_RangePolicyRequire.o + OBJ_OPENMP += TestOpenMP_View_64bit.o + OBJ_OPENMP += TestOpenMP_ViewAPI_a.o TestOpenMP_ViewAPI_b.o TestOpenMP_ViewAPI_c.o TestOpenMP_ViewAPI_d.o TestOpenMP_ViewAPI_e.o + OBJ_OPENMP += TestOpenMP_DeepCopyAlignment.o TestOpenMP_ViewCopy_a.o TestOpenMP_ViewCopy_b.o + OBJ_OPENMP += TestOpenMP_ViewMapping_a.o TestOpenMP_ViewMapping_b.o TestOpenMP_ViewMapping_subview.o TestOpenMP_ViewResize.o TestOpenMP_ViewLayoutStrideAssignment.o + OBJ_OPENMP += TestOpenMP_ViewOfClass.o + OBJ_OPENMP += TestOpenMP_SubView_a.o TestOpenMP_SubView_b.o + OBJ_OPENMP += TestOpenMP_SubView_c01.o TestOpenMP_SubView_c02.o TestOpenMP_SubView_c03.o + OBJ_OPENMP += TestOpenMP_SubView_c04.o TestOpenMP_SubView_c05.o TestOpenMP_SubView_c06.o + OBJ_OPENMP += TestOpenMP_SubView_c07.o TestOpenMP_SubView_c08.o TestOpenMP_SubView_c09.o + OBJ_OPENMP += TestOpenMP_SubView_c10.o TestOpenMP_SubView_c11.o TestOpenMP_SubView_c12.o + OBJ_OPENMP += TestOpenMP_SubView_c13.o + OBJ_OPENMP += TestOpenMP_Reductions.o TestOpenMP_Scan.o + OBJ_OPENMP += 
TestOpenMP_Reductions_DeviceView.o + OBJ_OPENMP += TestOpenMP_Reducers_a.o TestOpenMP_Reducers_b.o TestOpenMP_Reducers_c.o TestOpenMP_Reducers_d.o + OBJ_OPENMP += TestOpenMP_Complex.o + OBJ_OPENMP += TestOpenMP_AtomicOperations_int.o TestOpenMP_AtomicOperations_unsignedint.o TestOpenMP_AtomicOperations_longint.o + OBJ_OPENMP += TestOpenMP_AtomicOperations_unsignedlongint.o TestOpenMP_AtomicOperations_longlongint.o TestOpenMP_AtomicOperations_double.o TestOpenMP_AtomicOperations_float.o + OBJ_OPENMP += TestOpenMP_AtomicOperations_complexfloat.o TestOpenMP_AtomicOperations_complexdouble.o + OBJ_OPENMP += TestOpenMP_AtomicViews.o TestOpenMP_Atomics.o + OBJ_OPENMP += TestOpenMP_TeamBasic.o TestOpenMP_TeamScratch.o + OBJ_OPENMP += TestOpenMP_TeamReductionScan.o TestOpenMP_TeamTeamSize.o + OBJ_OPENMP += TestOpenMP_TeamVectorRange.o + OBJ_OPENMP += TestOpenMP_Other.o + OBJ_OPENMP += TestOpenMP_MDRange_a.o TestOpenMP_MDRange_b.o TestOpenMP_MDRange_c.o TestOpenMP_MDRange_d.o TestOpenMP_MDRange_e.o + OBJ_OPENMP += TestOpenMP_Crs.o + OBJ_OPENMP += TestOpenMP_Task.o TestOpenMP_WorkGraph.o + OBJ_OPENMP += TestOpenMP_UniqueToken.o + OBJ_OPENMP += TestOpenMP_LocalDeepCopy.o + + TARGETS += KokkosCore_UnitTest_OpenMP + TARGETS += KokkosCore_UnitTest_OpenMPInterOp + + TEST_TARGETS += test-openmp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + OBJ_OPENMPTARGET = UnitTestMainInit.o gtest-all.o + OBJ_OPENMPTARGET += TestOpenMPTarget_Init.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_SharedAlloc.o + OBJ_OPENMPTARGET += TestOpenMPTarget_RangePolicy.o + OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_a.o TestOpenMPTarget_ViewAPI_b.o TestOpenMPTarget_ViewAPI_c.o TestOpenMPTarget_ViewAPI_d.o #Some commented out code + #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_e.o TestOpenMPTarget_ViewCopy_a.o TestOpenMPTarget_ViewCopy_b.o + OBJ_OPENMPTARGET += TestOpenMPTarget_DeepCopyAlignment.o + OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_a.o + OBJ_OPENMPTARGET += 
TestOpenMPTarget_ViewMapping_b.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_subview.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_ViewOfClass.o + OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_a.o TestOpenMPTarget_SubView_b.o + #The following subview tests need something like UVM: + #OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c01.o TestOpenMPTarget_SubView_c02.o TestOpenMPTarget_SubView_c03.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c04.o TestOpenMPTarget_SubView_c05.o TestOpenMPTarget_SubView_c06.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c07.o TestOpenMPTarget_SubView_c08.o TestOpenMPTarget_SubView_c09.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c10.o TestOpenMPTarget_SubView_c11.o TestOpenMPTarget_SubView_c12.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_Reductions.o # Need custom reductions + OBJ_OPENMPTARGET += TestOpenMPTarget_Reducers_a.o TestOpenMPTarget_Reducers_b.o TestOpenMPTarget_Reducers_c.o TestOpenMPTarget_Reducers_d.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_Scan.o + OBJ_OPENMPTARGET += TestOpenMPTarget_Complex.o + OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_int.o TestOpenMPTarget_AtomicOperations_unsignedint.o TestOpenMPTarget_AtomicOperations_longint.o + OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_unsignedlongint.o TestOpenMPTarget_AtomicOperations_longlongint.o TestOpenMPTarget_AtomicOperations_double.o TestOpenMPTarget_AtomicOperations_float.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_complexfloat.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations_complexdouble.o + OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicViews.o + OBJ_OPENMPTARGET += TestOpenMPTarget_Atomics.o # Commented Out Arbitrary Type Atomics + #OBJ_OPENMPTARGET += TestOpenMPTarget_TeamBasic.o # There is still a static function in this + #OBJ_OPENMPTARGET += TestOpenMPTarget_TeamScratch.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_TeamScan.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_TeamReductionScan.o + #OBJ_OPENMPTARGET += 
TestOpenMPTarget_Other.o + #OBJ_OPENMPTARGET += TestOpenMPTarget_MDRange_a.o TestOpenMPTarget_MDRange_b.o TestOpenMPTarget_MDRange_c.o TestOpenMPTarget_MDRange_d.o TestOpenMPTarget_MDRange_d.e + #OBJ_OPENMPTARGET += TestOpenMPTarget_Task.o + + TARGETS += KokkosCore_UnitTest_OpenMPTarget + + TEST_TARGETS += test-openmptarget +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + OBJ_HIP = UnitTestMainInit.o gtest-all.o + OBJ_HIP += TestHIP_Init.o + OBJ_HIP += TestHIP_Reducers_a.o TestHIP_Reducers_b.o TestHIP_Reducers_c.o TestHIP_Reducers_d.o + OBJ_HIP += TestHIP_Reductions.o + OBJ_HIP += TestHIP_MDRange_a.o TestHIP_MDRange_b.o TestHIP_MDRange_c.o TestHIP_MDRange_d.o TestHIP_MDRange_e.o + OBJ_HIP += TestHIP_Spaces.o + OBJ_HIP += TestHIPHostPinned_ViewAPI_a.o TestHIPHostPinned_ViewAPI_b.o TestHIPHostPinned_ViewAPI_c.o TestHIPHostPinned_ViewAPI_d.o TestHIPHostPinned_ViewAPI_e.o + OBJ_HIP += TestHIPHostPinned_ViewCopy_a.o TestHIPHostPinned_ViewCopy_b.o + OBJ_HIP += TestHIPHostPinned_ViewMapping_a.o TestHIPHostPinned_ViewMapping_b.o TestHIPHostPinned_ViewMapping_subview.o + + TARGETS += KokkosCore_UnitTest_HIP + + TEST_TARGETS += test-hip +endif + +ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + OBJ_HPX = UnitTestMainInit.o gtest-all.o + OBJ_HPX += TestHPX_Init.o + OBJ_HPX += TestHPX_SharedAlloc.o + OBJ_HPX += TestHPX_RangePolicy.o TestHPX_RangePolicyRequire.o + OBJ_HPX += TestHPX_View_64bit.o + OBJ_HPX += TestHPX_ViewAPI_a.o TestHPX_ViewAPI_b.o TestHPX_ViewAPI_c.o TestHPX_ViewAPI_d.o TestHPX_ViewAPI_e.o + OBJ_HPX += TestHPX_ViewCopy_a.o TestHPX_ViewCopy_b.o + OBJ_HPX += TestHPX_ViewMapping_a.o TestHPX_ViewMapping_b.o TestHPX_ViewMapping_subview.o TestHPX_ViewResize.o + OBJ_HPX += TestHPX_ViewOfClass.o + OBJ_HPX += TestHPX_SubView_a.o TestHPX_SubView_b.o + OBJ_HPX += TestHPX_SubView_c01.o TestHPX_SubView_c02.o TestHPX_SubView_c03.o + OBJ_HPX += TestHPX_SubView_c04.o TestHPX_SubView_c05.o TestHPX_SubView_c06.o + OBJ_HPX += TestHPX_SubView_c07.o TestHPX_SubView_c08.o 
TestHPX_SubView_c09.o + OBJ_HPX += TestHPX_SubView_c10.o TestHPX_SubView_c11.o TestHPX_SubView_c12.o + OBJ_HPX += TestHPX_SubView_c13.o + OBJ_HPX += TestHPX_Reductions.o + OBJ_HPX += TestHPX_Scan.o + OBJ_HPX += TestHPX_Reducers_a.o TestHPX_Reducers_b.o TestHPX_Reducers_c.o TestHPX_Reducers_d.o + OBJ_HPX += TestHPX_Complex.o + OBJ_HPX += TestHPX_AtomicOperations_int.o TestHPX_AtomicOperations_unsignedint.o TestHPX_AtomicOperations_longint.o + OBJ_HPX += TestHPX_AtomicOperations_unsignedlongint.o TestHPX_AtomicOperations_longlongint.o TestHPX_AtomicOperations_double.o TestHPX_AtomicOperations_float.o + OBJ_HPX += TestHPX_AtomicViews.o TestHPX_Atomics.o + OBJ_HPX += TestHPX_TeamBasic.o + OBJ_HPX += TestHPX_TeamVectorRange.o + OBJ_HPX += TestHPX_TeamScratch.o + OBJ_HPX += TestHPX_TeamReductionScan.o + OBJ_HPX += TestHPX_Other.o + OBJ_HPX += TestHPX_MDRange_a.o TestHPX_MDRange_b.o TestHPX_MDRange_c.o TestHPX_MDRange_d.o TestHPX_MDRange_e.o + OBJ_HPX += TestHPX_Crs.o + OBJ_HPX += TestHPX_Task.o + OBJ_HPX += TestHPX_WorkGraph.o + OBJ_HPX += TestHPX_UniqueToken.o + + TARGETS += KokkosCore_UnitTest_HPX + TARGETS += KokkosCore_UnitTest_HPXInterOp + + TEST_TARGETS += test-hpx +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + OBJ_SERIAL = UnitTestMainInit.o gtest-all.o + OBJ_SERIAL += TestSerial_Init.o + OBJ_SERIAL += TestSerial_SharedAlloc.o + OBJ_SERIAL += TestSerial_RangePolicy.o TestSerial_RangePolicyRequire.o + OBJ_SERIAL += TestSerial_View_64bit.o + OBJ_SERIAL += TestSerial_ViewAPI_a.o TestSerial_ViewAPI_b.o TestSerial_ViewAPI_c.o TestSerial_ViewAPI_d.o TestSerial_ViewAPI_e.o + OBJ_SERIAL += TestSerial_DeepCopyAlignment.o TestSerial_ViewCopy_a.o TestSerial_ViewCopy_b.o + OBJ_SERIAL += TestSerial_ViewMapping_a.o TestSerial_ViewMapping_b.o TestSerial_ViewMapping_subview.o TestSerial_ViewResize.o TestSerial_ViewLayoutStrideAssignment.o + OBJ_SERIAL += TestSerial_ViewOfClass.o + OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o + OBJ_SERIAL += 
TestSerial_SubView_c01.o TestSerial_SubView_c02.o TestSerial_SubView_c03.o + OBJ_SERIAL += TestSerial_SubView_c04.o TestSerial_SubView_c05.o TestSerial_SubView_c06.o + OBJ_SERIAL += TestSerial_SubView_c07.o TestSerial_SubView_c08.o TestSerial_SubView_c09.o + OBJ_SERIAL += TestSerial_SubView_c10.o TestSerial_SubView_c11.o TestSerial_SubView_c12.o + OBJ_SERIAL += TestSerial_SubView_c13.o + OBJ_SERIAL += TestSerial_Reductions.o TestSerial_Scan.o + OBJ_SERIAL += TestSerial_Reductions_DeviceView.o + OBJ_SERIAL += TestSerial_Reducers_a.o TestSerial_Reducers_b.o TestSerial_Reducers_c.o TestSerial_Reducers_d.o + OBJ_SERIAL += TestSerial_Complex.o + OBJ_SERIAL += TestSerial_AtomicOperations_int.o TestSerial_AtomicOperations_unsignedint.o TestSerial_AtomicOperations_longint.o + OBJ_SERIAL += TestSerial_AtomicOperations_unsignedlongint.o TestSerial_AtomicOperations_longlongint.o TestSerial_AtomicOperations_double.o TestSerial_AtomicOperations_float.o + OBJ_SERIAL += TestSerial_AtomicOperations_complexfloat.o TestSerial_AtomicOperations_complexdouble.o + OBJ_SERIAL += TestSerial_AtomicViews.o TestSerial_Atomics.o + OBJ_SERIAL += TestSerial_TeamBasic.o TestSerial_TeamScratch.o + OBJ_SERIAL += TestSerial_TeamVectorRange.o + OBJ_SERIAL += TestSerial_TeamReductionScan.o TestSerial_TeamTeamSize.o + OBJ_SERIAL += TestSerial_Other.o + #HCC_WORKAROUND + ifneq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1) + OBJ_SERIAL += TestSerial_MDRange_a.o TestSerial_MDRange_b.o TestSerial_MDRange_c.o TestSerial_MDRange_d.o TestSerial_MDRange_e.o + endif + OBJ_SERIAL += TestSerial_Crs.o + OBJ_SERIAL += TestSerial_Task.o TestSerial_WorkGraph.o + OBJ_SERIAL += TestSerial_LocalDeepCopy.o + + TARGETS += KokkosCore_UnitTest_Serial + + TEST_TARGETS += test-serial +endif + +OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o +TARGETS += KokkosCore_UnitTest_HWLOC +TEST_TARGETS += test-hwloc + +OBJ_DEFAULT = UnitTestMainInit.o gtest-all.o +ifneq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) +ifneq 
($(KOKKOS_INTERNAL_COMPILER_HCC), 1) + OBJ_DEFAULT += TestDefaultDeviceType.o + OBJ_DEFAULT += TestDefaultDeviceType_a1.o TestDefaultDeviceType_b1.o TestDefaultDeviceType_c1.o + OBJ_DEFAULT += TestDefaultDeviceType_a2.o TestDefaultDeviceType_b2.o TestDefaultDeviceType_c2.o + OBJ_DEFAULT += TestDefaultDeviceType_a3.o TestDefaultDeviceType_b3.o TestDefaultDeviceType_c3.o + OBJ_DEFAULT += TestDefaultDeviceType_d.o +endif +endif + +TARGETS += KokkosCore_UnitTest_Default +TEST_TARGETS += test-default + +TARGETS += KokkosCore_UnitTest_PushFinalizeHook +TEST_TARGETS += test-push-finalize-hook + +TARGETS += KokkosCore_UnitTest_PushFinalizeHook_terminate +TEST_TARGETS += test-push-finalize-hook-terminate + +TARGETS += KokkosCore_UnitTest_StackTraceTestExec +TEST_TARGETS += test-stack-trace +TEST_TARGETS += test-stack-trace-terminate +TEST_TARGETS += test-stack-trace-generic-term + +NUM_INITTESTS = 16 +INITTESTS_NUMBERS := $(shell seq 1 ${NUM_INITTESTS}) +INITTESTS_TARGETS := $(addprefix KokkosCore_UnitTest_DefaultDeviceTypeInit_,${INITTESTS_NUMBERS}) +TARGETS += ${INITTESTS_TARGETS} +INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS}) +TEST_TARGETS += ${INITTESTS_TEST_TARGETS} + +KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Cuda + +KokkosCore_UnitTest_CudaInterOpInit: UnitTestMain.o gtest-all.o TestCuda_InterOp_Init.o $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) UnitTestMain.o gtest-all.o TestCuda_InterOp_Init.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_CudaInterOpInit +KokkosCore_UnitTest_CudaInterOpStreams: UnitTestMain.o gtest-all.o TestCuda_InterOp_Streams.o $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) UnitTestMain.o gtest-all.o TestCuda_InterOp_Streams.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_CudaInterOpStreams + +KokkosCore_UnitTest_Threads: 
$(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Threads + +KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_OpenMP + +KokkosCore_UnitTest_OpenMPInterOp: UnitTestMain.o gtest-all.o TestOpenMP_InterOp.o $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) UnitTestMain.o gtest-all.o TestOpenMP_InterOp.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_OpenMPInterOp + +KokkosCore_UnitTest_OpenMPTarget: $(OBJ_OPENMPTARGET) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMPTARGET) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_OpenMPTarget + +KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Serial + +KokkosCore_UnitTest_HIP: $(OBJ_HIP) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_HIP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_HIP + +KokkosCore_UnitTest_HPX: $(OBJ_HPX) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_HPX) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_HPX + +KokkosCore_UnitTest_HPXInterOp: UnitTestMain.o gtest-all.o TestHPX_InterOp.o $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) UnitTestMain.o gtest-all.o TestHPX_InterOp.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_HPXInterOp + +KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_HWLOC + +KokkosCore_UnitTest_AllocationTracker: $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LIBS) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(LIB) -o 
KokkosCore_UnitTest_AllocationTracker + +KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Default + +KokkosCore_UnitTest_PushFinalizeHook: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_PushFinalizeHook + +KokkosCore_UnitTest_PushFinalizeHook_terminate: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_PushFinalizeHook_terminate + + +${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$* + +KokkosCore_UnitTest_StackTraceTestExec: TestStackTrace.o TestStackTrace_f0.o TestStackTrace_f1.o TestStackTrace_f2.o TestStackTrace_f3.o TestStackTrace_f4.o $(KOKKOS_LINK_DEPENDS) gtest-all.o + $(LINK) $(EXTRA_PATH) TestStackTrace.o TestStackTrace_f0.o TestStackTrace_f1.o TestStackTrace_f2.o TestStackTrace_f3.o TestStackTrace_f4.o gtest-all.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_StackTraceTestExec + +test-cuda: KokkosCore_UnitTest_Cuda + ./KokkosCore_UnitTest_Cuda + ./KokkosCore_UnitTest_CudaInterOpInit + ./KokkosCore_UnitTest_CudaInterOpStreams + +test-threads: KokkosCore_UnitTest_Threads + ./KokkosCore_UnitTest_Threads + +test-openmp: KokkosCore_UnitTest_OpenMP + ./KokkosCore_UnitTest_OpenMP + ./KokkosCore_UnitTest_OpenMPInterOp + +test-openmptarget: KokkosCore_UnitTest_OpenMPTarget + ./KokkosCore_UnitTest_OpenMPTarget + +test-serial: KokkosCore_UnitTest_Serial + ./KokkosCore_UnitTest_Serial + +test-hip: KokkosCore_UnitTest_HIP + 
./KokkosCore_UnitTest_HIP + +test-hpx: KokkosCore_UnitTest_HPX + ./KokkosCore_UnitTest_HPX + ./KokkosCore_UnitTest_HPXInterOp + +test-hwloc: KokkosCore_UnitTest_HWLOC + ./KokkosCore_UnitTest_HWLOC + +test-allocationtracker: KokkosCore_UnitTest_AllocationTracker + ./KokkosCore_UnitTest_AllocationTracker + +test-default: KokkosCore_UnitTest_Default + ./KokkosCore_UnitTest_Default + +test-push-finalize-hook: KokkosCore_UnitTest_PushFinalizeHook + ./KokkosCore_UnitTest_PushFinalizeHook + +test-push-finalize-hook-terminate: KokkosCore_UnitTest_PushFinalizeHook_terminate + ./KokkosCore_UnitTest_PushFinalizeHook_terminate + +test-stack-trace: KokkosCore_UnitTest_StackTraceTestExec + ./KokkosCore_UnitTest_StackTraceTestExec --gtest_filter=*normal$(STACK_TRACE_TERMINATE_FILTER) + +test-stack-trace-terminate: KokkosCore_UnitTest_StackTraceTestExec + ./KokkosCore_UnitTest_StackTraceTestExec --gtest_filter=*terminate$(STACK_TRACE_TERMINATE_FILTER) + +test-stack-trace-generic-term: KokkosCore_UnitTest_StackTraceTestExec + ./KokkosCore_UnitTest_StackTraceTestExec --gtest_filter=*generic_term$(STACK_TRACE_TERMINATE_FILTER) + + +${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_% + ./KokkosCore_UnitTest_DefaultDeviceTypeInit_$* + +build_all: $(TARGETS) + +test: $(TEST_TARGETS) + +clean: kokkos-clean + rm -f *.o $(TARGETS) TestCuda*.cpp TestThreads*.cpp TestOpenMP*.cpp TestSerial*.cpp TestHIP*.cpp \ + TestOpenMPTarget*.cpp TestHPX*.cpp + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(TEST_HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + +gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc diff --git a/packages/kokkos/core/unit_test/TestAggregate.hpp b/packages/kokkos/core/unit_test/TestAggregate.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..3151143a6ff992c30ecee6dd52668c5c77941923 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAggregate.hpp @@ -0,0 +1,147 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_AGGREGATE_HPP +#define TEST_AGGREGATE_HPP + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <impl/Kokkos_ViewArray.hpp> + +namespace Test { + +template <class DeviceType> +void TestViewAggregate() { + using value_type = Kokkos::Array<double, 32>; + using analysis_1d = + Kokkos::Impl::ViewDataAnalysis<value_type *, Kokkos::LayoutLeft, + value_type>; + + static_assert( + std::is_same<typename analysis_1d::specialize, Kokkos::Array<> >::value, + ""); + + using a32_traits = Kokkos::ViewTraits<value_type **, DeviceType>; + using flat_traits = + Kokkos::ViewTraits<typename a32_traits::scalar_array_type, DeviceType>; + + static_assert( + std::is_same<typename a32_traits::specialize, Kokkos::Array<> >::value, + ""); + static_assert( + std::is_same<typename a32_traits::value_type, value_type>::value, ""); + static_assert(a32_traits::rank == 2, ""); + static_assert(a32_traits::rank_dynamic == 2, ""); + + static_assert(std::is_same<typename flat_traits::specialize, void>::value, + ""); + static_assert(flat_traits::rank == 3, ""); + static_assert(flat_traits::rank_dynamic == 2, ""); + static_assert(flat_traits::dimension::N2 == 32, ""); + + using a32_type = Kokkos::View<Kokkos::Array<double, 32> **, DeviceType>; + using a32_flat_type = typename a32_type::array_type; + + static_assert(std::is_same<typename a32_type::value_type, value_type>::value, + ""); + static_assert(std::is_same<typename a32_type::pointer_type, double *>::value, + ""); + static_assert(a32_type::Rank == 2, ""); + static_assert(a32_flat_type::Rank == 3, ""); + + a32_type x("test", 4, 5); + a32_flat_type y(x); + + ASSERT_EQ(x.extent(0), 4); + ASSERT_EQ(x.extent(1), 5); + ASSERT_EQ(y.extent(0), 4); + ASSERT_EQ(y.extent(1), 5); + ASSERT_EQ(y.extent(2), 32); + + // Initialize arrays 
from brace-init-list as for std::array. + // + // Comment: Clang will issue the following warning if we don't use double + // braces here (one for initializing the Kokkos::Array and one for + // initializing the sub-aggreagate C-array data member), + // + // warning: suggest braces around initialization of subobject + // + // but single brace syntax would be valid as well. + Kokkos::Array<float, 2> aggregate_initialization_syntax_1 = {{1.41, 3.14}}; + ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[0], 1.41); + ASSERT_FLOAT_EQ(aggregate_initialization_syntax_1[1], 3.14); + + Kokkos::Array<int, 3> aggregate_initialization_syntax_2{ + {0, 1, 2}}; // since C++11 + for (int i = 0; i < 3; ++i) { + ASSERT_EQ(aggregate_initialization_syntax_2[i], i); + } + + // Note that this is a valid initialization. + Kokkos::Array<double, 3> initialized_with_one_argument_missing = {{255, 255}}; + for (int i = 0; i < 2; ++i) { + ASSERT_DOUBLE_EQ(initialized_with_one_argument_missing[i], 255); + } + // But the following line would not compile + // Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } }; + + // The code below must compile for zero-sized arrays. + using T = float; + + constexpr int N = 0; + Kokkos::Array<T, N> a; + for (int i = 0; i < N; ++i) { + a[i] = T(); + } +} + +TEST(TEST_CATEGORY, view_aggregate) { TestViewAggregate<TEST_EXECSPACE>(); } + +} // namespace Test + +#endif /* #ifndef TEST_AGGREGATE_HPP */ diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp new file mode 100644 index 0000000000000000000000000000000000000000..04362125c0648e679f9a1cfb9886ccb84e6b14d5 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp @@ -0,0 +1,974 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace TestAtomicOperations { + +//----------------------------------------------- +//--------------zero_functor--------------------- +//----------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct ZeroFunctor { + using execution_space = DEVICE_TYPE; + using type = typename Kokkos::View<T, execution_space>; + using h_type = typename Kokkos::View<T, execution_space>::HostMirror; + + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { data() = 0; } +}; + +//----------------------------------------------- +//--------------init_functor--------------------- +//----------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct InitFunctor { + using execution_space = DEVICE_TYPE; + using type = typename Kokkos::View<T, execution_space>; + using h_type = typename Kokkos::View<T, execution_space>::HostMirror; + + type data; + T init_value; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { data() = init_value; } + + InitFunctor(T _init_value) : init_value(_init_value) {} +}; + +//--------------------------------------------------- +//--------------atomic_fetch_max--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct MaxFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + // Kokkos::atomic_fetch_max( &data(), (T) 1 ); + Kokkos::atomic_fetch_max(&data(), (T)i1); + } + MaxFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T MaxAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, 
execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct MaxFunctor<T, execution_space> f(i0, i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T MaxAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = (i0 > i1 ? i0 : i1); + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool MaxAtomicTest(T i0, T i1) { + T res = MaxAtomic<T, DeviceType>(i0, i1); + T resSerial = MaxAtomicCheck<T>(i0, i1); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = MaxAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_fetch_min--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct MinFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_min(&data(), (T)i1); } + + MinFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T MinAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct MinFunctor<T, execution_space> f(i0, i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T 
MinAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = (i0 < i1 ? i0 : i1); + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool MinAtomicTest(T i0, T i1) { + T res = MinAtomic<T, DeviceType>(i0, i1); + T resSerial = MinAtomicCheck<T>(i0, i1); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = MinAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_increment--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct IncFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_increment(&data()); } + + IncFunctor(T _i0) : i0(_i0) {} +}; + +template <class T, class execution_space> +T IncAtomic(T i0) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct IncFunctor<T, execution_space> f(i0); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T IncAtomicCheck(T i0) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 + 1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool IncAtomicTest(T i0) { + T res = IncAtomic<T, DeviceType>(i0); + T resSerial = IncAtomicCheck<T>(i0); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = 
IncAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_decrement--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct DecFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_decrement(&data()); } + + DecFunctor(T _i0) : i0(_i0) {} +}; + +template <class T, class execution_space> +T DecAtomic(T i0) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct DecFunctor<T, execution_space> f(i0); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T DecAtomicCheck(T i0) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 - 1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool DecAtomicTest(T i0) { + T res = DecAtomic<T, DeviceType>(i0); + T resSerial = DecAtomicCheck<T>(i0); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = DecAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_fetch_mul--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct MulFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + 
T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_mul(&data(), (T)i1); } + + MulFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T MulAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct MulFunctor<T, execution_space> f(i0, i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T MulAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 * i1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool MulAtomicTest(T i0, T i1) { + T res = MulAtomic<T, DeviceType>(i0, i1); + T resSerial = MulAtomicCheck<T>(i0, i1); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = MulAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_fetch_div--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct DivFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_div(&data(), (T)i1); } + + DivFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T DivAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, 
execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct DivFunctor<T, execution_space> f(i0, i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T DivAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 / i1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool DivAtomicTest(T i0, T i1) { + T res = DivAtomic<T, DeviceType>(i0, i1); + T resSerial = DivAtomicCheck<T>(i0, i1); + + bool passed = true; + + using Kokkos::abs; + using std::abs; + if (abs((resSerial - res) * 1.) > 1e-5) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = DivAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_fetch_mod--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct ModFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_mod(&data(), (T)i1); } + + ModFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T ModAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct ModFunctor<T, execution_space> f(i0, i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + 
return val; +} + +template <class T> +T ModAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 % i1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool ModAtomicTest(T i0, T i1) { + T res = ModAtomic<T, DeviceType>(i0, i1); + T resSerial = ModAtomicCheck<T>(i0, i1); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = ModAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_fetch_and--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct AndFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_and(&data(), (T)i1); } + + AndFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T AndAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct AndFunctor<T, execution_space> f(i0, i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T AndAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 & i1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool AndAtomicTest(T i0, T i1) { + T res = AndAtomic<T, DeviceType>(i0, i1); + T resSerial = AndAtomicCheck<T>(i0, i1); + + bool passed = true; + + if 
(resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = AndAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_fetch_or---------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct OrFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_or(&data(), (T)i1); } + + OrFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T OrAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct OrFunctor<T, execution_space> f(i0, i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T OrAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 | i1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool OrAtomicTest(T i0, T i1) { + T res = OrAtomic<T, DeviceType>(i0, i1); + T resSerial = OrAtomicCheck<T>(i0, i1); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = OrAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_fetch_xor--------------------- +//--------------------------------------------------- + +template <class T, 
class DEVICE_TYPE> +struct XorFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_xor(&data(), (T)i1); } + + XorFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T XorAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct XorFunctor<T, execution_space> f(i0, i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T XorAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 ^ i1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool XorAtomicTest(T i0, T i1) { + T res = XorAtomic<T, DeviceType>(i0, i1); + T resSerial = XorAtomicCheck<T>(i0, i1); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = XorAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_fetch_lshift--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct LShiftFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_lshift(&data(), (T)i1); } + + LShiftFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T 
LShiftAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct LShiftFunctor<T, execution_space> f(i0, i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T LShiftAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 << i1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool LShiftAtomicTest(T i0, T i1) { + T res = LShiftAtomic<T, DeviceType>(i0, i1); + T resSerial = LShiftAtomicCheck<T>(i0, i1); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = LShiftAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_fetch_rshift--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct RShiftFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + T i0; + T i1; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_rshift(&data(), (T)i1); } + + RShiftFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {} +}; + +template <class T, class execution_space> +T RShiftAtomic(T i0, T i1) { + struct InitFunctor<T, execution_space> f_init(i0); + typename InitFunctor<T, execution_space>::type data("Data"); + typename InitFunctor<T, execution_space>::h_type h_data("HData"); + + f_init.data = data; + Kokkos::parallel_for(1, f_init); + execution_space().fence(); + + struct RShiftFunctor<T, execution_space> f(i0, 
i1); + + f.data = data; + Kokkos::parallel_for(1, f); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + return val; +} + +template <class T> +T RShiftAtomicCheck(T i0, T i1) { + T* data = new T[1]; + data[0] = 0; + + *data = i0 >> i1; + + T val = *data; + delete[] data; + + return val; +} + +template <class T, class DeviceType> +bool RShiftAtomicTest(T i0, T i1) { + T res = RShiftAtomic<T, DeviceType>(i0, i1); + T resSerial = RShiftAtomicCheck<T>(i0, i1); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = RShiftAtomicTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//--------------atomic_test_control------------------ +//--------------------------------------------------- + +template <class T, class DeviceType> +bool AtomicOperationsTestIntegralType(int i0, int i1, int test) { + switch (test) { + case 1: return MaxAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 2: return MinAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 3: return MulAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 4: return DivAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 5: return ModAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 6: return AndAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 7: return OrAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 8: return XorAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 9: return LShiftAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 10: return RShiftAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 11: return IncAtomicTest<T, DeviceType>((T)i0); + case 12: return DecAtomicTest<T, DeviceType>((T)i0); + } + + return 0; +} + +template <class T, class DeviceType> +bool AtomicOperationsTestNonIntegralType(int i0, int i1, int test) { + switch (test) { + case 1: return MaxAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 2: return MinAtomicTest<T, 
DeviceType>((T)i0, (T)i1); + case 3: return MulAtomicTest<T, DeviceType>((T)i0, (T)i1); + case 4: return DivAtomicTest<T, DeviceType>((T)i0, (T)i1); + } + + return 0; +} + +} // namespace TestAtomicOperations diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp new file mode 100644 index 0000000000000000000000000000000000000000..612247d03fa7f823bbe7699b15d5a0bf65fe2db6 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_complexdouble.hpp @@ -0,0 +1,60 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestAtomicOperations.hpp> + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_complexdouble) { + const int start = 1; // Avoid zero for division. + const int end = 11; + for (int i = start; i < end; ++i) { + ASSERT_TRUE( + (TestAtomicOperations::MulAtomicTest<Kokkos::complex<double>, + TEST_EXECSPACE>(start, end - i))); + ASSERT_TRUE( + (TestAtomicOperations::DivAtomicTest<Kokkos::complex<double>, + TEST_EXECSPACE>(start, end - i))); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e812d32074c0213c8547fed4353e0d3898b2adeb --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_complexfloat.hpp @@ -0,0 +1,60 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestAtomicOperations.hpp> + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_complexfloat) { + const int start = 1; // Avoid zero for division. 
+ const int end = 11; + for (int i = start; i < end; ++i) { + ASSERT_TRUE( + (TestAtomicOperations::MulAtomicTest<Kokkos::complex<float>, + TEST_EXECSPACE>(start, end - i))); + ASSERT_TRUE( + (TestAtomicOperations::DivAtomicTest<Kokkos::complex<float>, + TEST_EXECSPACE>(start, end - i))); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ba9937e1c6643bfd8a4decde2c7823061b0fcbe4 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestAtomicOperations.hpp> + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_double) { + const int start = 1; // Avoid zero for division. + const int end = 11; + for (int i = start; i < end; ++i) { + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + double, TEST_EXECSPACE>(start, end - i, 1))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + double, TEST_EXECSPACE>(start, end - i, 2))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + double, TEST_EXECSPACE>(start, end - i, 3))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + double, TEST_EXECSPACE>(start, end - i, 4))); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp new file mode 100644 index 0000000000000000000000000000000000000000..aa56b5ff10e770d2964d498f99e25611d85311c6 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestAtomicOperations.hpp> + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_float) { + const int start = 1; // Avoid zero for division. 
+ const int end = 11; + for (int i = start; i < end; ++i) { + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + float, TEST_EXECSPACE>(start, end - i, 1))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + float, TEST_EXECSPACE>(start, end - i, 2))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + float, TEST_EXECSPACE>(start, end - i, 3))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType< + float, TEST_EXECSPACE>(start, end - i, 4))); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f828be6223c7b4e7554252fd04927ce1a3fcb69a --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp @@ -0,0 +1,76 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestAtomicOperations.hpp> + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_int) { + const int start = 1; // Avoid zero for division. 
+ const int end = 11; + for (int i = start; i < end; ++i) { + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 1))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 2))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 3))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 4))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 5))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 6))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 7))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 8))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 9))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 11))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + int, TEST_EXECSPACE>(start, end - i, 12))); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp new file mode 100644 index 0000000000000000000000000000000000000000..eee44c9571cf890b25a9a1b9bb32edd279d3cae7 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp @@ -0,0 +1,76 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestAtomicOperations.hpp> + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_long) { + const int start = 1; // Avoid zero for division. 
+ const int end = 11; + for (int i = start; i < end; ++i) { + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 1))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 2))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 3))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 4))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 5))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 6))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 7))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 8))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 9))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 11))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long int, TEST_EXECSPACE>(start, end - i, 12))); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp new file mode 100644 index 0000000000000000000000000000000000000000..73d4a61d7291f852d1ff2d6607d36a1d0bb2f829 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp @@ -0,0 +1,76 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestAtomicOperations.hpp> + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_longlong) { + const int start = 1; // Avoid zero for division. 
+ const int end = 11; + for (int i = start; i < end; ++i) { + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 1))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 2))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 3))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 4))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 5))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 6))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 7))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 8))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 9))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 11))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + long long int, TEST_EXECSPACE>(start, end - i, 12))); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp new file mode 100644 index 0000000000000000000000000000000000000000..02f337c57c64633d62d8111c28ae49cee05e80e3 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp @@ -0,0 +1,76 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestAtomicOperations.hpp> + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_unsigned) { + const int start = 1; // Avoid zero for division. + const int end = 11; + for (int i = start; i < end; ++i) { + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 1))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 2))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 3))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 4))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 5))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 6))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 7))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 8))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 9))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 11))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned int, TEST_EXECSPACE>(start, end - i, 12))); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..f4340475f573c3c8f4c108f8b7bfacef0d72af4e --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp @@ -0,0 +1,76 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestAtomicOperations.hpp> + +namespace Test { +TEST(TEST_CATEGORY, atomic_operations_unsignedlong) { + const int start = 1; // Avoid zero for division. + const int end = 11; + for (int i = start; i < end; ++i) { + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 1))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 2))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 3))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 4))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 5))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 6))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 7))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 8))); + 
ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 9))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 11))); + ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType< + unsigned long int, TEST_EXECSPACE>(start, end - i, 12))); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomicViews.hpp b/packages/kokkos/core/unit_test/TestAtomicViews.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b615b407f334a60d187bbc3c27b3c69110acc94c --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomicViews.hpp @@ -0,0 +1,1461 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace TestAtomicViews { + +//------------------------------------------------- +//-----------atomic view api tests----------------- +//------------------------------------------------- + +template <class T, class... P> +size_t allocation_count(const Kokkos::View<T, P...>& view) { + const size_t card = view.size(); + const size_t alloc = view.span(); + + const int memory_span = Kokkos::View<int*>::required_allocation_size(100); + + return (card <= alloc && memory_span == 400) ? 
alloc : 0; +} + +template <class DataType, class DeviceType, + unsigned Rank = Kokkos::ViewTraits<DataType>::rank> +struct TestViewOperator_LeftAndRight; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> { + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type& update, + const volatile value_type& input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type& update) { update = 0; } + + using left_view = Kokkos::View<DataType, Kokkos::LayoutLeft, execution_space, + Kokkos::MemoryTraits<Kokkos::Atomic> >; + + using right_view = + Kokkos::View<DataType, Kokkos::LayoutRight, execution_space, + Kokkos::MemoryTraits<Kokkos::Atomic> >; + + using stride_view = + Kokkos::View<DataType, Kokkos::LayoutStride, execution_space, + Kokkos::MemoryTraits<Kokkos::Atomic> >; + + left_view left; + right_view right; + stride_view left_stride; + stride_view right_stride; + int64_t left_alloc; + int64_t right_alloc; + + TestViewOperator_LeftAndRight() + : left("left"), + right("right"), + left_stride(left), + right_stride(right), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + static void testit() { + TestViewOperator_LeftAndRight driver; + + int error_flag = 0; + + Kokkos::parallel_reduce(1, driver, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type& update) const { + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + // Below checks that values match, but unable to check the references. + // Should this be able to be checked? 
+ if (left(i0) != left.access(i0, 0, 0, 0, 0, 0, 0, 0)) { + update |= 3; + } + if (right(i0) != right.access(i0, 0, 0, 0, 0, 0, 0, 0)) { + update |= 3; + } + if (left(i0) != left_stride(i0)) { + update |= 4; + } + if (right(i0) != right_stride(i0)) { + update |= 8; + } + /* + if ( &left( i0 ) != &left( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= + 3; } if ( &right( i0 ) != &right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update + |= 3; } if ( &left( i0 ) != &left_stride( i0 ) ) { update |= 4; } if ( + &right( i0 ) != &right_stride( i0 ) ) { update |= 8; } + */ + } + } +}; + +template <typename T, class DeviceType> +class TestAtomicViewAPI { + public: + using device = DeviceType; + + enum { N0 = 1000, N1 = 3, N2 = 5, N3 = 7 }; + + using dView0 = Kokkos::View<T, device>; + using dView1 = Kokkos::View<T*, device>; + using dView2 = Kokkos::View<T * [N1], device>; + using dView3 = Kokkos::View<T * [N1][N2], device>; + using dView4 = Kokkos::View<T * [N1][N2][N3], device>; + using const_dView4 = Kokkos::View<const T * [N1][N2][N3], device>; + using dView4_unmanaged = Kokkos::View<T****, device, Kokkos::MemoryUnmanaged>; + using host = typename dView0::host_mirror_space; + + using aView0 = Kokkos::View<T, device, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using aView1 = + Kokkos::View<T*, device, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using aView2 = + Kokkos::View<T * [N1], device, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using aView3 = + Kokkos::View<T * [N1][N2], device, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using aView4 = Kokkos::View<T * [N1][N2][N3], device, + Kokkos::MemoryTraits<Kokkos::Atomic> >; + using const_aView4 = Kokkos::View<const T * [N1][N2][N3], device, + Kokkos::MemoryTraits<Kokkos::Atomic> >; + + using aView4_unmanaged = + Kokkos::View<T****, device, + Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::Atomic> >; + + using host_atomic = typename aView0::host_mirror_space; + + TestAtomicViewAPI() { + // FIXME_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET + 
TestViewOperator_LeftAndRight<int[2], device>::testit(); +#endif + run_test_rank0(); + run_test_rank4(); + run_test_const(); + } + + static void run_test_rank0() { + dView0 dx, dy; + aView0 ax, ay, az; + + dx = dView0("dx"); + dy = dView0("dy"); + ASSERT_EQ(dx.use_count(), size_t(1)); + ASSERT_EQ(dy.use_count(), size_t(1)); + + ax = dx; + ay = dy; + ASSERT_EQ(dx.use_count(), size_t(2)); + ASSERT_EQ(dy.use_count(), size_t(2)); + ASSERT_EQ(dx.use_count(), ax.use_count()); + + az = ax; + ASSERT_EQ(dx.use_count(), size_t(3)); + ASSERT_EQ(ax.use_count(), size_t(3)); + ASSERT_EQ(az.use_count(), size_t(3)); + ASSERT_EQ(az.use_count(), ax.use_count()); + } + + static void run_test_rank4() { + dView4 dx, dy; + aView4 ax, ay, az; + + dx = dView4("dx", N0); + dy = dView4("dy", N0); + ASSERT_EQ(dx.use_count(), size_t(1)); + ASSERT_EQ(dy.use_count(), size_t(1)); + + ax = dx; + ay = dy; + ASSERT_EQ(dx.use_count(), size_t(2)); + ASSERT_EQ(dy.use_count(), size_t(2)); + ASSERT_EQ(dx.use_count(), ax.use_count()); + + dView4_unmanaged unmanaged_dx = dx; + ASSERT_EQ(dx.use_count(), size_t(2)); + + az = ax; + ASSERT_EQ(dx.use_count(), size_t(3)); + ASSERT_EQ(ax.use_count(), size_t(3)); + ASSERT_EQ(az.use_count(), size_t(3)); + ASSERT_EQ(az.use_count(), ax.use_count()); + + aView4_unmanaged unmanaged_ax = ax; + ASSERT_EQ(ax.use_count(), size_t(3)); + + aView4_unmanaged unmanaged_ax_from_ptr_dx = aView4_unmanaged( + dx.data(), dx.extent(0), dx.extent(1), dx.extent(2), dx.extent(3)); + ASSERT_EQ(ax.use_count(), size_t(3)); + + const_aView4 const_ax = ax; + ASSERT_EQ(ax.use_count(), size_t(4)); + ASSERT_EQ(const_ax.use_count(), ax.use_count()); + + ASSERT_FALSE(ax.data() == nullptr); + ASSERT_FALSE(const_ax.data() == nullptr); // referenceable ptr + ASSERT_FALSE(unmanaged_ax.data() == nullptr); + ASSERT_FALSE(unmanaged_ax_from_ptr_dx.data() == nullptr); + ASSERT_FALSE(ay.data() == nullptr); + // ASSERT_NE( ax, ay ); + // Above test results in following runtime error from gtest: + // 
Expected: (ax) != (ay), actual: 32-byte object <30-01 D0-A0 D8-7F + // 00-00 00-31 44-0C 01-00 00-00 E8-03 00-00 00-00 00-00 69-00 00-00 + // 00-00 00-00> vs 32-byte object <80-01 D0-A0 D8-7F 00-00 00-A1 4A-0C + // 01-00 00-00 E8-03 00-00 00-00 00-00 69-00 00-00 00-00 00-00> + + ASSERT_EQ(ax.extent(0), unsigned(N0)); + ASSERT_EQ(ax.extent(1), unsigned(N1)); + ASSERT_EQ(ax.extent(2), unsigned(N2)); + ASSERT_EQ(ax.extent(3), unsigned(N3)); + + ASSERT_EQ(ay.extent(0), unsigned(N0)); + ASSERT_EQ(ay.extent(1), unsigned(N1)); + ASSERT_EQ(ay.extent(2), unsigned(N2)); + ASSERT_EQ(ay.extent(3), unsigned(N3)); + + ASSERT_EQ(unmanaged_ax_from_ptr_dx.span(), + unsigned(N0) * unsigned(N1) * unsigned(N2) * unsigned(N3)); + } + + using DataType = T[2]; + + static void check_auto_conversion_to_const( + const Kokkos::View<const DataType, device, + Kokkos::MemoryTraits<Kokkos::Atomic> >& arg_const, + const Kokkos::View<const DataType, device, + Kokkos::MemoryTraits<Kokkos::Atomic> >& arg) { + ASSERT_TRUE(arg_const == arg); + } + + static void run_test_const() { + using typeX = + Kokkos::View<DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using const_typeX = Kokkos::View<const DataType, device, + Kokkos::MemoryTraits<Kokkos::Atomic> >; + + typeX x("X"); + const_typeX xc = x; + + // ASSERT_TRUE( xc == x ); // const xc is referenceable, non-const x is not + // ASSERT_TRUE( x == xc ); + + check_auto_conversion_to_const(x, xc); + } +}; + +//--------------------------------------------------- +//-----------initialization functors----------------- +//--------------------------------------------------- + +template <class T, class execution_space> +struct InitFunctor_Seq { + using view_type = Kokkos::View<T*, execution_space>; + + view_type input; + const int64_t length; + + InitFunctor_Seq(view_type& input_, const int64_t length_) + : input(input_), length(length_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + input(i) = (T)i; + 
} + } +}; + +template <class T, class execution_space> +struct InitFunctor_ModTimes { + using view_type = Kokkos::View<T*, execution_space>; + + view_type input; + const int64_t length; + const int64_t remainder; + + InitFunctor_ModTimes(view_type& input_, const int64_t length_, + const int64_t remainder_) + : input(input_), length(length_), remainder(remainder_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + if (i % (remainder + 1) == remainder) { + input(i) = (T)2; + } else { + input(i) = (T)1; + } + } + } +}; + +template <class T, class execution_space> +struct InitFunctor_ModShift { + using view_type = Kokkos::View<T*, execution_space>; + + view_type input; + const int64_t length; + const int64_t remainder; + + InitFunctor_ModShift(view_type& input_, const int64_t length_, + const int64_t remainder_) + : input(input_), length(length_), remainder(remainder_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + if (i % (remainder + 1) == remainder) { + input(i) = 1; + } + } + } +}; + +//--------------------------------------------------- +//-----------atomic view plus-equal------------------ +//--------------------------------------------------- + +template <class T, class execution_space> +struct PlusEqualAtomicViewFunctor { + using atomic_view_type = + Kokkos::View<T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + + view_type input; + atomic_view_type even_odd_result; + const int64_t length; + + // Wrap the result view in an atomic view, use this for operator + PlusEqualAtomicViewFunctor(const view_type& input_, + view_type& even_odd_result_, const int64_t length_) + : input(input_), even_odd_result(even_odd_result_), length(length_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + if (i % 2 == 0) { + even_odd_result(0) += input(i); + } else { + 
even_odd_result(1) += input(i); + } + } + } +}; + +template <class T, class execution_space> +T PlusEqualAtomicView(const int64_t input_length) { + using view_type = Kokkos::View<T*, execution_space>; + using host_view_type = typename view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + view_type result_view("result_view", 2); + + InitFunctor_Seq<T, execution_space> init_f(input, length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + PlusEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + host_view_type h_result_view = Kokkos::create_mirror_view(result_view); + Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view(0) + h_result_view(1)); +} + +template <class T> +T PlusEqualAtomicViewCheck(const int64_t input_length) { + const int64_t N = input_length; + T result[2]; + + if (N % 2 == 0) { + const int64_t half_sum_end = (N / 2) - 1; + const int64_t full_sum_end = N - 1; + result[0] = half_sum_end * (half_sum_end + 1) / 2; // Even sum. + result[1] = + (full_sum_end * (full_sum_end + 1) / 2) - result[0]; // Odd sum. + } else { + const int64_t half_sum_end = (T)(N / 2); + const int64_t full_sum_end = N - 2; + result[0] = half_sum_end * (half_sum_end - 1) / 2; // Even sum. + result[1] = + (full_sum_end * (full_sum_end - 1) / 2) - result[0]; // Odd sum. 
+ } + + return (T)(result[0] + result[1]); +} + +template <class T, class DeviceType> +bool PlusEqualAtomicViewTest(int64_t input_length) { + T res = PlusEqualAtomicView<T, DeviceType>(input_length); + T resSerial = PlusEqualAtomicViewCheck<T>(input_length); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() + << ">( test = PlusEqualAtomicViewTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//-----------atomic view minus-equal----------------- +//--------------------------------------------------- + +template <class T, class execution_space> +struct MinusEqualAtomicViewFunctor { + using atomic_view_type = + Kokkos::View<T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + + view_type input; + atomic_view_type even_odd_result; + const int64_t length; + + // Wrap the result view in an atomic view, use this for operator. 
+ MinusEqualAtomicViewFunctor(const view_type& input_, + view_type& even_odd_result_, + const int64_t length_) + : input(input_), even_odd_result(even_odd_result_), length(length_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + if (i % 2 == 0) { + even_odd_result(0) -= input(i); + } else { + even_odd_result(1) -= input(i); + } + } + } +}; + +template <class T, class execution_space> +T MinusEqualAtomicView(const int64_t input_length) { + using view_type = Kokkos::View<T*, execution_space>; + using host_view_type = typename view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + view_type result_view("result_view", 2); + + InitFunctor_Seq<T, execution_space> init_f(input, length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + MinusEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + host_view_type h_result_view = Kokkos::create_mirror_view(result_view); + Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view(0) + h_result_view(1)); +} + +template <class T> +T MinusEqualAtomicViewCheck(const int64_t input_length) { + const int64_t N = input_length; + T result[2]; + + if (N % 2 == 0) { + const int64_t half_sum_end = (N / 2) - 1; + const int64_t full_sum_end = N - 1; + result[0] = -1 * (half_sum_end * (half_sum_end + 1) / 2); // Even sum. + result[1] = + -1 * ((full_sum_end * (full_sum_end + 1) / 2) + result[0]); // Odd sum. + } else { + const int64_t half_sum_end = (int64_t)(N / 2); + const int64_t full_sum_end = N - 2; + result[0] = -1 * (half_sum_end * (half_sum_end - 1) / 2); // Even sum. + result[1] = + -1 * ((full_sum_end * (full_sum_end - 1) / 2) + result[0]); // Odd sum. 
+ } + + return (result[0] + result[1]); +} + +template <class T, class DeviceType> +bool MinusEqualAtomicViewTest(int64_t input_length) { + T res = MinusEqualAtomicView<T, DeviceType>(input_length); + T resSerial = MinusEqualAtomicViewCheck<T>(input_length); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() + << ">( test = MinusEqualAtomicViewTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//-----------atomic view times-equal----------------- +//--------------------------------------------------- + +template <class T, class execution_space> +struct TimesEqualAtomicViewFunctor { + using atomic_view_type = + Kokkos::View<T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + + view_type input; + atomic_view_type result; + const int64_t length; + + // Wrap the result view in an atomic view, use this for operator + TimesEqualAtomicViewFunctor(const view_type& input_, view_type& result_, + const int64_t length_) + : input(input_), result(result_), length(length_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length && i > 0) { + result(0) *= (double)input(i); + } + } +}; + +template <class T, class execution_space> +T TimesEqualAtomicView(const int64_t input_length, const int64_t remainder) { + using view_type = Kokkos::View<T*, execution_space>; + using host_view_type = typename view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + view_type result_view("result_view", 1); + deep_copy(result_view, 1.0); + + InitFunctor_ModTimes<T, execution_space> init_f(input, length, remainder); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + TimesEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length); + 
Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + host_view_type h_result_view = Kokkos::create_mirror_view(result_view); + Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view(0)); +} + +template <class T> +T TimesEqualAtomicViewCheck(const int64_t input_length, + const int64_t remainder) { + // Analytical result. + const int64_t N = input_length; + T result = 1.0; + + for (int64_t i = 2; i < N; ++i) { + if (i % (remainder + 1) == remainder) { + result *= 2.0; + } else { + result *= 1.0; + } + } + + return (T)result; +} + +template <class T, class DeviceType> +bool TimesEqualAtomicViewTest(const int64_t input_length) { + const int64_t remainder = 23; + T res = TimesEqualAtomicView<T, DeviceType>(input_length, remainder); + T resSerial = TimesEqualAtomicViewCheck<T>(input_length, remainder); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() + << ">( test = TimesEqualAtomicViewTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//------------atomic view div-equal------------------ +//--------------------------------------------------- + +template <class T, class execution_space> +struct DivEqualAtomicViewFunctor { + using atomic_view_type = + Kokkos::View<T, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + using scalar_view_type = Kokkos::View<T, execution_space>; + + view_type input; + atomic_view_type result; + const int64_t length; + + // Wrap the result view in an atomic view, use this for operator. 
+ DivEqualAtomicViewFunctor(const view_type& input_, scalar_view_type& result_, + const int64_t length_) + : input(input_), result(result_), length(length_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length && i > 0) { + result() /= (double)(input(i)); + } + } +}; + +template <class T, class execution_space> +T DivEqualAtomicView(const int64_t input_length, const int64_t remainder) { + using view_type = Kokkos::View<T*, execution_space>; + using scalar_view_type = Kokkos::View<T, execution_space>; + using host_scalar_view_type = typename scalar_view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + scalar_view_type result_view("result_view"); + Kokkos::deep_copy(result_view, 12121212121); + + InitFunctor_ModTimes<T, execution_space> init_f(input, length, remainder); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + DivEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view); + Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view()); +} + +template <class T> +T DivEqualAtomicViewCheck(const int64_t input_length, const int64_t remainder) { + const int64_t N = input_length; + T result = 12121212121.0; + for (int64_t i = 2; i < N; ++i) { + if (i % (remainder + 1) == remainder) { + result /= 1.0; + } else { + result /= 2.0; + } + } + + return (T)result; +} + +template <class T, class DeviceType> +bool DivEqualAtomicViewTest(const int64_t input_length) { + const int64_t remainder = 23; + + T res = DivEqualAtomicView<T, DeviceType>(input_length, remainder); + T resSerial = DivEqualAtomicViewCheck<T>(input_length, remainder); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << 
"Loop<" << typeid(T).name() + << ">( test = DivEqualAtomicViewTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//------------atomic view mod-equal------------------ +//--------------------------------------------------- + +template <class T, class execution_space> +struct ModEqualAtomicViewFunctor { + using atomic_view_type = + Kokkos::View<T, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + using scalar_view_type = Kokkos::View<T, execution_space>; + + view_type input; + atomic_view_type result; + const int64_t length; + + // Wrap the result view in an atomic view, use this for operator. + ModEqualAtomicViewFunctor(const view_type& input_, scalar_view_type& result_, + const int64_t length_) + : input(input_), result(result_), length(length_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length && i > 0) { + result() %= (double)(input(i)); + } + } +}; + +template <class T, class execution_space> +T ModEqualAtomicView(const int64_t input_length, const int64_t remainder) { + using view_type = Kokkos::View<T*, execution_space>; + using scalar_view_type = Kokkos::View<T, execution_space>; + using host_scalar_view_type = typename scalar_view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + scalar_view_type result_view("result_view"); + Kokkos::deep_copy(result_view, 12121212121); + + InitFunctor_ModTimes<T, execution_space> init_f(input, length, remainder); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + ModEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view); + 
Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view()); +} + +template <class T> +T ModEqualAtomicViewCheck(const int64_t input_length, const int64_t remainder) { + const int64_t N = input_length; + T result = 12121212121; + for (int64_t i = 2; i < N; ++i) { + if (i % (remainder + 1) == remainder) { + result %= 1; + } else { + result %= 2; + } + } + + return (T)result; +} + +template <class T, class DeviceType> +bool ModEqualAtomicViewTest(const int64_t input_length) { + static_assert(std::is_integral<T>::value, + "ModEqualAtomicView Error: Type must be integral type for this " + "unit test"); + + const int64_t remainder = 23; + + T res = ModEqualAtomicView<T, DeviceType>(input_length, remainder); + T resSerial = ModEqualAtomicViewCheck<T>(input_length, remainder); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() + << ">( test = ModEqualAtomicViewTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//------------atomic view rs-equal------------------ +//--------------------------------------------------- + +template <class T, class execution_space> +struct RSEqualAtomicViewFunctor { + using atomic_view_type = Kokkos::View<T****, execution_space, + Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + using result_view_type = Kokkos::View<T****, execution_space>; + + const view_type input; + atomic_view_type result; + const int64_t length; + const int64_t value; + + // Wrap the result view in an atomic view, use this for operator. 
+ RSEqualAtomicViewFunctor(const view_type& input_, result_view_type& result_, + const int64_t& length_, const int64_t& value_) + : input(input_), result(result_), length(length_), value(value_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + if (i % 4 == 0) { + result(1, 0, 0, 0) >>= input(i); + } else if (i % 4 == 1) { + result(0, 1, 0, 0) >>= input(i); + } else if (i % 4 == 2) { + result(0, 0, 1, 0) >>= input(i); + } else if (i % 4 == 3) { + result(0, 0, 0, 1) >>= input(i); + } + } + } +}; + +template <class T, class execution_space> +T RSEqualAtomicView(const int64_t input_length, const int64_t value, + const int64_t remainder) { + using view_type = Kokkos::View<T*, execution_space>; + using result_view_type = Kokkos::View<T****, execution_space>; + using host_scalar_view_type = typename result_view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + result_view_type result_view("result_view", 2, 2, 2, 2); + host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view); + h_result_view(1, 0, 0, 0) = value; + h_result_view(0, 1, 0, 0) = value; + h_result_view(0, 0, 1, 0) = value; + h_result_view(0, 0, 0, 1) = value; + Kokkos::deep_copy(result_view, h_result_view); + + InitFunctor_ModShift<T, execution_space> init_f(input, length, remainder); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + RSEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length, value); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view(1, 0, 0, 0)); +} + +template <class T> +T RSEqualAtomicViewCheck(const int64_t input_length, const int64_t value, + const int64_t remainder) { + T result[4]; + result[0] = value; + result[1] = value; + result[2] = value; + result[3] = value; + + T* input = new 
T[input_length]; + for (int64_t i = 0; i < input_length; ++i) { + if (i % (remainder + 1) == remainder) { + input[i] = 1; + } else { + input[i] = 0; + } + } + + for (int64_t i = 0; i < input_length; ++i) { + if (i % 4 == 0) { + result[0] >>= input[i]; + } else if (i % 4 == 1) { + result[1] >>= input[i]; + } else if (i % 4 == 2) { + result[2] >>= input[i]; + } else if (i % 4 == 3) { + result[3] >>= input[i]; + } + } + + delete[] input; + + return (T)result[0]; +} + +template <class T, class DeviceType> +bool RSEqualAtomicViewTest(const int64_t input_length) { + static_assert(std::is_integral<T>::value, + "RSEqualAtomicViewTest: Must be integral type for test"); + + const int64_t remainder = 61042; // prime - 1 + const int64_t value = 1073741825; // 2^30+1 + T res = RSEqualAtomicView<T, DeviceType>(input_length, value, remainder); + T resSerial = RSEqualAtomicViewCheck<T>(input_length, value, remainder); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() + << ">( test = RSEqualAtomicViewTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//------------atomic view ls-equal------------------ +//--------------------------------------------------- + +template <class T, class execution_space> +struct LSEqualAtomicViewFunctor { + using atomic_view_type = Kokkos::View<T****, execution_space, + Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + using result_view_type = Kokkos::View<T****, execution_space>; + + view_type input; + atomic_view_type result; + const int64_t length; + const int64_t value; + + // Wrap the result view in an atomic view, use this for operator. 
+ LSEqualAtomicViewFunctor(const view_type& input_, result_view_type& result_, + const int64_t& length_, const int64_t& value_) + : input(input_), result(result_), length(length_), value(value_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + if (i % 4 == 0) { + result(1, 0, 0, 0) <<= input(i); + } else if (i % 4 == 1) { + result(0, 1, 0, 0) <<= input(i); + } else if (i % 4 == 2) { + result(0, 0, 1, 0) <<= input(i); + } else if (i % 4 == 3) { + result(0, 0, 0, 1) <<= input(i); + } + } + } +}; + +template <class T, class execution_space> +T LSEqualAtomicView(const int64_t input_length, const int64_t value, + const int64_t remainder) { + using view_type = Kokkos::View<T*, execution_space>; + using result_view_type = Kokkos::View<T****, execution_space>; + using host_scalar_view_type = typename result_view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + result_view_type result_view("result_view", 2, 2, 2, 2); + host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view); + h_result_view(1, 0, 0, 0) = value; + h_result_view(0, 1, 0, 0) = value; + h_result_view(0, 0, 1, 0) = value; + h_result_view(0, 0, 0, 1) = value; + Kokkos::deep_copy(result_view, h_result_view); + + InitFunctor_ModShift<T, execution_space> init_f(input, length, remainder); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + LSEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length, value); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view(1, 0, 0, 0)); +} + +template <class T> +T LSEqualAtomicViewCheck(const int64_t input_length, const int64_t value, + const int64_t remainder) { + T result[4]; + result[0] = value; + result[1] = value; + result[2] = value; + result[3] = value; + + T* input = new 
T[input_length]; + for (int64_t i = 0; i < input_length; ++i) { + if (i % (remainder + 1) == remainder) { + input[i] = 1; + } else { + input[i] = 0; + } + } + + for (int64_t i = 0; i < input_length; ++i) { + if (i % 4 == 0) { + result[0] <<= input[i]; + } else if (i % 4 == 1) { + result[1] <<= input[i]; + } else if (i % 4 == 2) { + result[2] <<= input[i]; + } else if (i % 4 == 3) { + result[3] <<= input[i]; + } + } + + delete[] input; + + return (T)result[0]; +} + +template <class T, class DeviceType> +bool LSEqualAtomicViewTest(const int64_t input_length) { + static_assert(std::is_integral<T>::value, + "LSEqualAtomicViewTest: Must be integral type for test"); + + const int64_t remainder = 61042; // prime - 1 + const int64_t value = 1; // 2^30+1 + T res = LSEqualAtomicView<T, DeviceType>(input_length, value, remainder); + T resSerial = LSEqualAtomicViewCheck<T>(input_length, value, remainder); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() + << ">( test = RSEqualAtomicViewTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//-----------atomic view and-equal----------------- +//--------------------------------------------------- + +template <class T, class execution_space> +struct AndEqualAtomicViewFunctor { + using atomic_view_type = + Kokkos::View<T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + + view_type input; + atomic_view_type even_odd_result; + const int64_t length; + + // Wrap the result view in an atomic view, use this for operator. 
+ AndEqualAtomicViewFunctor(const view_type& input_, + view_type& even_odd_result_, const int64_t length_) + : input(input_), even_odd_result(even_odd_result_), length(length_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + if (i % 2 == 0) { + even_odd_result(0) &= input(i); + } else { + even_odd_result(1) &= input(i); + } + } + } +}; + +template <class T, class execution_space> +T AndEqualAtomicView(const int64_t input_length) { + using view_type = Kokkos::View<T*, execution_space>; + using host_view_type = typename view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + view_type result_view("result_view", 2); + Kokkos::deep_copy(result_view, 1); + + InitFunctor_Seq<T, execution_space> init_f(input, length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + AndEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + host_view_type h_result_view = Kokkos::create_mirror_view(result_view); + Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view(0)); +} + +template <class T> +T AndEqualAtomicViewCheck(const int64_t input_length) { + const int64_t N = input_length; + T result[2] = {1}; + for (int64_t i = 0; i < N; ++i) { + if (N % 2 == 0) { + result[0] &= (T)i; + } else { + result[1] &= (T)i; + } + } + + return (result[0]); +} + +template <class T, class DeviceType> +bool AndEqualAtomicViewTest(int64_t input_length) { + static_assert(std::is_integral<T>::value, + "AndEqualAtomicViewTest: Must be integral type for test"); + + T res = AndEqualAtomicView<T, DeviceType>(input_length); + T resSerial = AndEqualAtomicViewCheck<T>(input_length); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() + << ">( test = 
AndEqualAtomicViewTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//-----------atomic view or-equal----------------- +//--------------------------------------------------- + +template <class T, class execution_space> +struct OrEqualAtomicViewFunctor { + using atomic_view_type = + Kokkos::View<T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + + view_type input; + atomic_view_type even_odd_result; + const int64_t length; + + // Wrap the result view in an atomic view, use this for operator. + OrEqualAtomicViewFunctor(const view_type& input_, view_type& even_odd_result_, + const int64_t length_) + : input(input_), even_odd_result(even_odd_result_), length(length_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + if (i % 2 == 0) { + even_odd_result(0) |= input(i); + } else { + even_odd_result(1) |= input(i); + } + } + } +}; + +template <class T, class execution_space> +T OrEqualAtomicView(const int64_t input_length) { + using view_type = Kokkos::View<T*, execution_space>; + using host_view_type = typename view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + view_type result_view("result_view", 2); + + InitFunctor_Seq<T, execution_space> init_f(input, length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + OrEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + host_view_type h_result_view = Kokkos::create_mirror_view(result_view); + Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view(0)); +} + +template <class T> +T OrEqualAtomicViewCheck(const int64_t input_length) { + const int64_t N = input_length; + T 
result[2] = {0}; + for (int64_t i = 0; i < N; ++i) { + if (i % 2 == 0) { + result[0] |= (T)i; + } else { + result[1] |= (T)i; + } + } + + return (T)(result[0]); +} + +template <class T, class DeviceType> +bool OrEqualAtomicViewTest(int64_t input_length) { + static_assert(std::is_integral<T>::value, + "OrEqualAtomicViewTest: Must be integral type for test"); + + T res = OrEqualAtomicView<T, DeviceType>(input_length); + T resSerial = OrEqualAtomicViewCheck<T>(input_length); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() + << ">( test = OrEqualAtomicViewTest" + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +//--------------------------------------------------- +//-----------atomic view xor-equal----------------- +//--------------------------------------------------- + +template <class T, class execution_space> +struct XOrEqualAtomicViewFunctor { + using atomic_view_type = + Kokkos::View<T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> >; + using view_type = Kokkos::View<T*, execution_space>; + + view_type input; + atomic_view_type even_odd_result; + const int64_t length; + + // Wrap the result view in an atomic view, use this for operator. 
+ XOrEqualAtomicViewFunctor(const view_type& input_, + view_type& even_odd_result_, const int64_t length_) + : input(input_), even_odd_result(even_odd_result_), length(length_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int64_t i) const { + if (i < length) { + if (i % 2 == 0) { + even_odd_result(0) ^= input(i); + } else { + even_odd_result(1) ^= input(i); + } + } + } +}; + +template <class T, class execution_space> +T XOrEqualAtomicView(const int64_t input_length) { + using view_type = Kokkos::View<T*, execution_space>; + using host_view_type = typename view_type::HostMirror; + + const int64_t length = input_length; + + view_type input("input_view", length); + view_type result_view("result_view", 2); + + InitFunctor_Seq<T, execution_space> init_f(input, length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), init_f); + + XOrEqualAtomicViewFunctor<T, execution_space> functor(input, result_view, + length); + Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length), + functor); + Kokkos::fence(); + + host_view_type h_result_view = Kokkos::create_mirror_view(result_view); + Kokkos::deep_copy(h_result_view, result_view); + + return (T)(h_result_view(0)); +} + +template <class T> +T XOrEqualAtomicViewCheck(const int64_t input_length) { + const int64_t N = input_length; + T result[2] = {0}; + for (int64_t i = 0; i < N; ++i) { + if (i % 2 == 0) { + result[0] ^= (T)i; + } else { + result[1] ^= (T)i; + } + } + + return (T)(result[0]); +} + +template <class T, class DeviceType> +bool XOrEqualAtomicViewTest(int64_t input_length) { + static_assert(std::is_integral<T>::value, + "XOrEqualAtomicViewTest: Must be integral type for test"); + + T res = XOrEqualAtomicView<T, DeviceType>(input_length); + T resSerial = XOrEqualAtomicViewCheck<T>(input_length); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() + << ">( test = XOrEqualAtomicViewTest" + << " FAILED : " << 
resSerial << " != " << res << std::endl; + } + + return passed; +} + +// inc/dec? + +//--------------------------------------------------- +//--------------atomic_test_control------------------ +//--------------------------------------------------- + +template <class T, class DeviceType> +bool AtomicViewsTestIntegralType(const int length, int test) { + static_assert(std::is_integral<T>::value, + "TestAtomicViews Error: Non-integral type passed into " + "IntegralType tests"); + + switch (test) { + case 1: return PlusEqualAtomicViewTest<T, DeviceType>(length); + case 2: return MinusEqualAtomicViewTest<T, DeviceType>(length); + case 3: return RSEqualAtomicViewTest<T, DeviceType>(length); + case 4: return LSEqualAtomicViewTest<T, DeviceType>(length); + case 5: return ModEqualAtomicViewTest<T, DeviceType>(length); + case 6: return AndEqualAtomicViewTest<T, DeviceType>(length); + case 7: return OrEqualAtomicViewTest<T, DeviceType>(length); + case 8: return XOrEqualAtomicViewTest<T, DeviceType>(length); + } + + return 0; +} + +template <class T, class DeviceType> +bool AtomicViewsTestNonIntegralType(const int length, int test) { + switch (test) { + case 1: return PlusEqualAtomicViewTest<T, DeviceType>(length); + case 2: return MinusEqualAtomicViewTest<T, DeviceType>(length); + case 3: return TimesEqualAtomicViewTest<T, DeviceType>(length); + case 4: return DivEqualAtomicViewTest<T, DeviceType>(length); + } + + return 0; +} + +} // namespace TestAtomicViews + +namespace Test { + +TEST(TEST_CATEGORY, atomic_views_integral) { + const int64_t length = 1000000; + { + // Integral Types. 
+ ASSERT_TRUE( + (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( + length, 1))); + ASSERT_TRUE( + (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( + length, 2))); + ASSERT_TRUE( + (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( + length, 3))); + ASSERT_TRUE( + (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( + length, 4))); + ASSERT_TRUE( + (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( + length, 5))); + ASSERT_TRUE( + (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( + length, 6))); + ASSERT_TRUE( + (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( + length, 7))); + ASSERT_TRUE( + (TestAtomicViews::AtomicViewsTestIntegralType<int64_t, TEST_EXECSPACE>( + length, 8))); + } +} + +TEST(TEST_CATEGORY, atomic_views_nonintegral) { + const int64_t length = 1000000; + { + // Non-Integral Types. + ASSERT_TRUE(( + TestAtomicViews::AtomicViewsTestNonIntegralType<double, TEST_EXECSPACE>( + length, 1))); + ASSERT_TRUE(( + TestAtomicViews::AtomicViewsTestNonIntegralType<double, TEST_EXECSPACE>( + length, 2))); + ASSERT_TRUE(( + TestAtomicViews::AtomicViewsTestNonIntegralType<double, TEST_EXECSPACE>( + length, 3))); + ASSERT_TRUE(( + TestAtomicViews::AtomicViewsTestNonIntegralType<double, TEST_EXECSPACE>( + length, 4))); + } +} + +TEST(TEST_CATEGORY, atomic_view_api) { + TestAtomicViews::TestAtomicViewAPI<int, TEST_EXECSPACE>(); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestAtomics.hpp b/packages/kokkos/core/unit_test/TestAtomics.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e41ad5257d64ad3acb3266a0354f18d291662377 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestAtomics.hpp @@ -0,0 +1,573 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace TestAtomic { + +// Struct for testing arbitrary size atomics. 
+ +template <int N> +struct SuperScalar { + double val[N]; + + KOKKOS_INLINE_FUNCTION + SuperScalar() { + for (int i = 0; i < N; i++) { + val[i] = 0.0; + } + } + + KOKKOS_INLINE_FUNCTION + SuperScalar(const SuperScalar& src) { + for (int i = 0; i < N; i++) { + val[i] = src.val[i]; + } + } + + KOKKOS_INLINE_FUNCTION + SuperScalar(const volatile SuperScalar& src) { + for (int i = 0; i < N; i++) { + val[i] = src.val[i]; + } + } + + KOKKOS_INLINE_FUNCTION + SuperScalar& operator=(const SuperScalar& src) { + for (int i = 0; i < N; i++) { + val[i] = src.val[i]; + } + return *this; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar& operator=(const volatile SuperScalar& src) { + for (int i = 0; i < N; i++) { + val[i] = src.val[i]; + } + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator=(const SuperScalar& src) volatile { + for (int i = 0; i < N; i++) { + val[i] = src.val[i]; + } + } + + KOKKOS_INLINE_FUNCTION + SuperScalar operator+(const SuperScalar& src) { + SuperScalar tmp = *this; + for (int i = 0; i < N; i++) { + tmp.val[i] += src.val[i]; + } + return tmp; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar& operator+=(const double& src) { + for (int i = 0; i < N; i++) { + val[i] += 1.0 * (i + 1) * src; + } + return *this; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar& operator+=(const SuperScalar& src) { + for (int i = 0; i < N; i++) { + val[i] += src.val[i]; + } + return *this; + } + + KOKKOS_INLINE_FUNCTION + bool operator==(const SuperScalar& src) const { + bool compare = true; + for (int i = 0; i < N; i++) { + compare = compare && (val[i] == src.val[i]); + } + return compare; + } + + KOKKOS_INLINE_FUNCTION + bool operator!=(const SuperScalar& src) const { + bool compare = true; + for (int i = 0; i < N; i++) { + compare = compare && (val[i] == src.val[i]); + } + return !compare; + } + + KOKKOS_INLINE_FUNCTION + SuperScalar(const double& src) { + for (int i = 0; i < N; i++) { + val[i] = 1.0 * (i + 1) * src; + } + } +}; + +template <int N> +std::ostream& 
operator<<(std::ostream& os, const SuperScalar<N>& dt) { + os << "{ "; + for (int i = 0; i < N - 1; i++) { + os << dt.val[i] << ", "; + } + os << dt.val[N - 1] << "}"; + + return os; +} + +template <class T, class DEVICE_TYPE> +struct ZeroFunctor { + using execution_space = DEVICE_TYPE; + using type = typename Kokkos::View<T, execution_space>; + using h_type = typename Kokkos::View<T, execution_space>::HostMirror; + + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { data() = 0; } +}; + +//--------------------------------------------------- +//--------------atomic_fetch_add--------------------- +//--------------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct AddFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { Kokkos::atomic_fetch_add(&data(), (T)1); } +}; + +template <class T, class DEVICE_TYPE> +struct AddFunctorReduce { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int, int&) const { Kokkos::atomic_fetch_add(&data(), (T)1); } +}; + +template <class T, class execution_space> +T AddLoop(int loop) { + struct ZeroFunctor<T, execution_space> f_zero; + typename ZeroFunctor<T, execution_space>::type data("Data"); + typename ZeroFunctor<T, execution_space>::h_type h_data("HData"); + + f_zero.data = data; + + Kokkos::parallel_for(1, f_zero); + execution_space().fence(); + + struct AddFunctor<T, execution_space> f_add; + + f_add.data = data; + Kokkos::parallel_for(loop, f_add); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + struct AddFunctorReduce<T, execution_space> f_add_red; + f_add_red.data = data; + int dummy_result; + Kokkos::parallel_reduce(loop, f_add_red, dummy_result); + execution_space().fence(); + + return val; +} + +template <class T> +T 
AddLoopSerial(int loop) { + T* data = new T[1]; + data[0] = 0; + + for (int i = 0; i < loop; i++) { + *data += (T)1; + } + + T val = *data; + delete[] data; + + return val; +} + +//------------------------------------------------------ +//--------------atomic_compare_exchange----------------- +//------------------------------------------------------ + +template <class T, class DEVICE_TYPE> +struct CASFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const { + T old = data(); + T newval, assumed; + + do { + assumed = old; + newval = assumed + (T)1; + old = Kokkos::atomic_compare_exchange(&data(), assumed, newval); + } while (old != assumed); + } +}; + +template <class T, class DEVICE_TYPE> +struct CASFunctorReduce { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data; + + KOKKOS_INLINE_FUNCTION + void operator()(int, int&) const { + T old = data(); + T newval, assumed; + + do { + assumed = old; + newval = assumed + (T)1; + old = Kokkos::atomic_compare_exchange(&data(), assumed, newval); + } while (old != assumed); + } +}; + +template <class T, class execution_space> +T CASLoop(int loop) { + struct ZeroFunctor<T, execution_space> f_zero; + typename ZeroFunctor<T, execution_space>::type data("Data"); + typename ZeroFunctor<T, execution_space>::h_type h_data("HData"); + + f_zero.data = data; + Kokkos::parallel_for(1, f_zero); + execution_space().fence(); + + struct CASFunctor<T, execution_space> f_cas; + f_cas.data = data; + Kokkos::parallel_for(loop, f_cas); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + T val = h_data(); + + struct CASFunctorReduce<T, execution_space> f_cas_red; + f_cas_red.data = data; + int dummy_result; + Kokkos::parallel_reduce(loop, f_cas_red, dummy_result); + execution_space().fence(); + + return val; +} + +template <class T> +T CASLoopSerial(int loop) { + T* data 
= new T[1]; + data[0] = 0; + + for (int i = 0; i < loop; i++) { + T assumed; + T newval; + T old; + + do { + assumed = *data; + newval = assumed + (T)1; + old = *data; + *data = newval; + } while (!(assumed == old)); + } + + T val = *data; + delete[] data; + + return val; +} + +//---------------------------------------------- +//--------------atomic_exchange----------------- +//---------------------------------------------- + +template <class T, class DEVICE_TYPE> +struct ExchFunctor { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data, data2; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + T old = Kokkos::atomic_exchange(&data(), (T)i); + Kokkos::atomic_fetch_add(&data2(), old); + } +}; + +template <class T, class DEVICE_TYPE> +struct ExchFunctorReduce { + using execution_space = DEVICE_TYPE; + using type = Kokkos::View<T, execution_space>; + + type data, data2; + + KOKKOS_INLINE_FUNCTION + void operator()(int i, int&) const { + T old = Kokkos::atomic_exchange(&data(), (T)i); + Kokkos::atomic_fetch_add(&data2(), old); + } +}; + +template <class T, class execution_space> +T ExchLoop(int loop) { + struct ZeroFunctor<T, execution_space> f_zero; + typename ZeroFunctor<T, execution_space>::type data("Data"); + typename ZeroFunctor<T, execution_space>::h_type h_data("HData"); + + f_zero.data = data; + Kokkos::parallel_for(1, f_zero); + execution_space().fence(); + + typename ZeroFunctor<T, execution_space>::type data2("Data"); + typename ZeroFunctor<T, execution_space>::h_type h_data2("HData"); + + f_zero.data = data2; + Kokkos::parallel_for(1, f_zero); + execution_space().fence(); + + struct ExchFunctor<T, execution_space> f_exch; + f_exch.data = data; + f_exch.data2 = data2; + Kokkos::parallel_for(loop, f_exch); + execution_space().fence(); + + Kokkos::deep_copy(h_data, data); + Kokkos::deep_copy(h_data2, data2); + T val = h_data() + h_data2(); + + struct ExchFunctorReduce<T, execution_space> f_exch_red; 
+ f_exch_red.data = data; + f_exch_red.data2 = data2; + int dummy_result; + Kokkos::parallel_reduce(loop, f_exch_red, dummy_result); + execution_space().fence(); + + return val; +} + +template <class T> +T ExchLoopSerial( + typename std::conditional<!std::is_same<T, Kokkos::complex<double> >::value, + int, void>::type loop) { + T* data = new T[1]; + T* data2 = new T[1]; + data[0] = 0; + data2[0] = 0; + + for (int i = 0; i < loop; i++) { + T old = *data; + *data = (T)i; + *data2 += old; + } + + T val = *data2 + *data; + delete[] data; + delete[] data2; + + return val; +} + +template <class T> +T ExchLoopSerial( + typename std::conditional<std::is_same<T, Kokkos::complex<double> >::value, + int, void>::type loop) { + T* data = new T[1]; + T* data2 = new T[1]; + data[0] = 0; + data2[0] = 0; + + for (int i = 0; i < loop; i++) { + T old = *data; + data->real() = (static_cast<double>(i)); + data->imag() = 0; + *data2 += old; + } + + T val = *data2 + *data; + delete[] data; + delete[] data2; + + return val; +} + +template <class T, class DeviceType> +T LoopVariant(int loop, int test) { + switch (test) { + case 1: return AddLoop<T, DeviceType>(loop); + case 2: return CASLoop<T, DeviceType>(loop); + case 3: return ExchLoop<T, DeviceType>(loop); + } + + return 0; +} + +template <class T> +T LoopVariantSerial(int loop, int test) { + switch (test) { + case 1: return AddLoopSerial<T>(loop); + case 2: return CASLoopSerial<T>(loop); + case 3: return ExchLoopSerial<T>(loop); + } + + return 0; +} + +template <class T, class DeviceType> +bool Loop(int loop, int test) { + T res = LoopVariant<T, DeviceType>(loop, test); + T resSerial = LoopVariantSerial<T>(loop, test); + + bool passed = true; + + if (resSerial != res) { + passed = false; + + std::cout << "Loop<" << typeid(T).name() << ">( test = " << test + << " FAILED : " << resSerial << " != " << res << std::endl; + } + + return passed; +} + +} // namespace TestAtomic + +namespace Test { + +TEST(TEST_CATEGORY, atomics) { + const int 
loop_count = 1e4; + + ASSERT_TRUE((TestAtomic::Loop<int, TEST_EXECSPACE>(loop_count, 1))); + ASSERT_TRUE((TestAtomic::Loop<int, TEST_EXECSPACE>(loop_count, 2))); + ASSERT_TRUE((TestAtomic::Loop<int, TEST_EXECSPACE>(loop_count, 3))); + + ASSERT_TRUE((TestAtomic::Loop<unsigned int, TEST_EXECSPACE>(loop_count, 1))); + ASSERT_TRUE((TestAtomic::Loop<unsigned int, TEST_EXECSPACE>(loop_count, 2))); + ASSERT_TRUE((TestAtomic::Loop<unsigned int, TEST_EXECSPACE>(loop_count, 3))); + + ASSERT_TRUE((TestAtomic::Loop<long int, TEST_EXECSPACE>(loop_count, 1))); + ASSERT_TRUE((TestAtomic::Loop<long int, TEST_EXECSPACE>(loop_count, 2))); + ASSERT_TRUE((TestAtomic::Loop<long int, TEST_EXECSPACE>(loop_count, 3))); + + ASSERT_TRUE( + (TestAtomic::Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 1))); + ASSERT_TRUE( + (TestAtomic::Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 2))); + ASSERT_TRUE( + (TestAtomic::Loop<unsigned long int, TEST_EXECSPACE>(loop_count, 3))); + + ASSERT_TRUE((TestAtomic::Loop<long long int, TEST_EXECSPACE>(loop_count, 1))); + ASSERT_TRUE((TestAtomic::Loop<long long int, TEST_EXECSPACE>(loop_count, 2))); + ASSERT_TRUE((TestAtomic::Loop<long long int, TEST_EXECSPACE>(loop_count, 3))); + + ASSERT_TRUE((TestAtomic::Loop<double, TEST_EXECSPACE>(loop_count, 1))); + ASSERT_TRUE((TestAtomic::Loop<double, TEST_EXECSPACE>(loop_count, 2))); + ASSERT_TRUE((TestAtomic::Loop<double, TEST_EXECSPACE>(loop_count, 3))); + + ASSERT_TRUE((TestAtomic::Loop<float, TEST_EXECSPACE>(100, 1))); + ASSERT_TRUE((TestAtomic::Loop<float, TEST_EXECSPACE>(100, 2))); + ASSERT_TRUE((TestAtomic::Loop<float, TEST_EXECSPACE>(100, 3))); + +#ifndef KOKKOS_ENABLE_OPENMPTARGET + ASSERT_TRUE((TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 1))); + ASSERT_TRUE((TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 2))); + ASSERT_TRUE((TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(1, 3))); + + ASSERT_TRUE( + (TestAtomic::Loop<Kokkos::complex<float>, 
TEST_EXECSPACE>(100, 1))); + ASSERT_TRUE( + (TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 2))); + ASSERT_TRUE( + (TestAtomic::Loop<Kokkos::complex<float>, TEST_EXECSPACE>(100, 3))); + +// FIXME_SYCL atomics for large types to be implemented +#ifndef KOKKOS_ENABLE_SYCL + // FIXME_HIP HIP doesn't yet support atomics for >64bit types properly +#ifndef KOKKOS_ENABLE_HIP + ASSERT_TRUE( + (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 1))); + ASSERT_TRUE( + (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 2))); + ASSERT_TRUE( + (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 3))); + + ASSERT_TRUE( + (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 1))); + ASSERT_TRUE( + (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 2))); + ASSERT_TRUE( + (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(100, 3))); + +// WORKAROUND MSVC +#ifndef _WIN32 + ASSERT_TRUE( + (TestAtomic::Loop<TestAtomic::SuperScalar<4>, TEST_EXECSPACE>(100, 1))); + ASSERT_TRUE( + (TestAtomic::Loop<TestAtomic::SuperScalar<4>, TEST_EXECSPACE>(100, 2))); + ASSERT_TRUE( + (TestAtomic::Loop<TestAtomic::SuperScalar<4>, TEST_EXECSPACE>(100, 3))); +#endif +#endif +#endif +#endif +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestBlockSizeDeduction.hpp b/packages/kokkos/core/unit_test/TestBlockSizeDeduction.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d29e3737c6ee84eaccab66634430f0b4b058b3bc --- /dev/null +++ b/packages/kokkos/core/unit_test/TestBlockSizeDeduction.hpp @@ -0,0 +1,76 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_BLOCK_SIZE_DEDUCTION_HPP +#define TEST_BLOCK_SIZE_DEDUCTION_HPP + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +// NOTE kokkos/kokkos#3103 introduced a bug that was accidentally fixed in #3124 +// The code below will do until we decide to test block size deduction more +// thoroughly + +struct PoorMansLambda { + template <typename MemberType> + KOKKOS_FUNCTION void operator()(MemberType const&) const {} +}; + +template <typename ExecutionSpace> +void test_bug_pr_3103() { + using Policy = + Kokkos::TeamPolicy<ExecutionSpace, Kokkos::LaunchBounds<32, 1>>; + int const league_size = 1; + int const team_size = std::min(32, ExecutionSpace::concurrency()); + int const vector_length = 1; + + Kokkos::parallel_for(Policy(league_size, team_size, vector_length), + PoorMansLambda()); +} + +TEST(TEST_CATEGORY, test_block_deduction_bug_pr_3103) { + test_bug_pr_3103<TEST_EXECSPACE>(); +} + +#endif diff --git a/packages/kokkos/core/unit_test/TestCTestDevice.cpp b/packages/kokkos/core/unit_test/TestCTestDevice.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2ee79b856b0b995bb86b39d8f7fedb4548c5a7d --- /dev/null +++ b/packages/kokkos/core/unit_test/TestCTestDevice.cpp @@ -0,0 +1,138 @@ +#include <gtest/gtest.h> + +namespace Kokkos { +namespace Impl { + +int get_ctest_gpu(const char *local_rank_str); + +} // namespace Impl +} // namespace Kokkos + +#ifdef _WIN32 +int setenv(const char *name, const char *value, int overwrite) { + int errcode = 0; + if (!overwrite) { + size_t envsize = 0; + errcode = getenv_s(&envsize, NULL, 0, name); + if (errcode || envsize) return errcode; + } + return _putenv_s(name, value); +} + +int unsetenv(const char *name) { return _putenv_s(name, ""); } +#endif + +// Needed because https://github.com/google/googletest/issues/952 has not been +// resolved +#define 
EXPECT_THROW_WITH_MESSAGE(stmt, etype, whatstring) \ + EXPECT_THROW( \ + try { stmt; } catch (const etype &ex) { \ + EXPECT_EQ(whatstring, std::string(ex.what())); \ + throw; \ + }, \ + etype) + +class ctest_environment : public ::testing::Test { + protected: + void SetUp(); +}; + +void ctest_environment::SetUp() { + setenv("CTEST_KOKKOS_DEVICE_TYPE", "gpus", 1); + setenv("CTEST_RESOURCE_GROUP_COUNT", "10", 1); + unsetenv("CTEST_RESOURCE_GROUP_0"); + setenv("CTEST_RESOURCE_GROUP_1", "threads", 1); + setenv("CTEST_RESOURCE_GROUP_2", "threads,cores", 1); + + setenv("CTEST_RESOURCE_GROUP_3", "gpus", 1); + unsetenv("CTEST_RESOURCE_GROUP_3_GPUS"); + + setenv("CTEST_RESOURCE_GROUP_4", "gpus", 1); + setenv("CTEST_RESOURCE_GROUP_4_GPUS", "id:2", 1); + + setenv("CTEST_RESOURCE_GROUP_5", "gpus", 1); + setenv("CTEST_RESOURCE_GROUP_5_GPUS", "slots:1,id:2", 1); + + setenv("CTEST_RESOURCE_GROUP_6", "gpus", 1); + setenv("CTEST_RESOURCE_GROUP_6_GPUS", "id:2,slots:1", 1); + + setenv("CTEST_RESOURCE_GROUP_7", "threads,gpus", 1); + setenv("CTEST_RESOURCE_GROUP_7_GPUS", "id:3,slots:1", 1); + + setenv("CTEST_RESOURCE_GROUP_8", "gpus,threads", 1); + setenv("CTEST_RESOURCE_GROUP_8_GPUS", "id:1,slots:1", 1); + + setenv("CTEST_RESOURCE_GROUP_9", "cores,gpus,threads", 1); + setenv("CTEST_RESOURCE_GROUP_9_GPUS", "id:4,slots:1", 1); +} + +TEST_F(ctest_environment, no_device_type) { + unsetenv("CTEST_KOKKOS_DEVICE_TYPE"); + EXPECT_EQ(Kokkos::Impl::get_ctest_gpu("0"), 0); +} + +TEST_F(ctest_environment, no_process_count) { + unsetenv("CTEST_RESOURCE_GROUP_COUNT"); + EXPECT_EQ(Kokkos::Impl::get_ctest_gpu("0"), 0); +} + +TEST_F(ctest_environment, invalid_rank) { + EXPECT_THROW_WITH_MESSAGE( + Kokkos::Impl::get_ctest_gpu("10"), std::runtime_error, + "Error: local rank 10 is outside the bounds of resource groups provided " + "by" + " CTest. 
Raised by Kokkos::Impl::get_ctest_gpu().\nTraceback " + "functionality" + " not available\n"); +} + +TEST_F(ctest_environment, no_type_str) { + EXPECT_THROW_WITH_MESSAGE( + Kokkos::Impl::get_ctest_gpu("0"), std::runtime_error, + "Error: CTEST_RESOURCE_GROUP_0 is not specified. Raised by " + "Kokkos::Impl::get_ctest_gpu().\nTraceback functionality not " + "available\n"); +} + +TEST_F(ctest_environment, missing_type) { + EXPECT_THROW_WITH_MESSAGE( + Kokkos::Impl::get_ctest_gpu("1"), std::runtime_error, + "Error: device type 'gpus' not included in CTEST_RESOURCE_GROUP_1. " + "Raised " + "by Kokkos::Impl::get_ctest_gpu().\nTraceback functionality not available" + "\n"); + EXPECT_THROW_WITH_MESSAGE( + Kokkos::Impl::get_ctest_gpu("2"), std::runtime_error, + "Error: device type 'gpus' not included in CTEST_RESOURCE_GROUP_2. " + "Raised " + "by Kokkos::Impl::get_ctest_gpu().\nTraceback functionality not available" + "\n"); +} + +TEST_F(ctest_environment, no_id_str) { + EXPECT_THROW_WITH_MESSAGE( + Kokkos::Impl::get_ctest_gpu("3"), std::runtime_error, + "Error: CTEST_RESOURCE_GROUP_3_GPUS is not specified. Raised by " + "Kokkos::Impl::get_ctest_gpu().\nTraceback functionality not " + "available\n"); +} + +TEST_F(ctest_environment, invalid_id_str) { + EXPECT_THROW_WITH_MESSAGE( + Kokkos::Impl::get_ctest_gpu("4"), std::runtime_error, + "Error: invalid value of CTEST_RESOURCE_GROUP_4_GPUS: 'id:2'. Raised by " + "Kokkos::Impl::get_ctest_gpu().\nTraceback functionality not " + "available\n"); + EXPECT_THROW_WITH_MESSAGE( + Kokkos::Impl::get_ctest_gpu("5"), std::runtime_error, + "Error: invalid value of CTEST_RESOURCE_GROUP_5_GPUS: 'slots:1,id:2'. 
" + "Raised by Kokkos::Impl::get_ctest_gpu().\nTraceback functionality not " + "available\n"); +} + +TEST_F(ctest_environment, good) { + EXPECT_EQ(Kokkos::Impl::get_ctest_gpu("6"), 2); + EXPECT_EQ(Kokkos::Impl::get_ctest_gpu("7"), 3); + EXPECT_EQ(Kokkos::Impl::get_ctest_gpu("8"), 1); + EXPECT_EQ(Kokkos::Impl::get_ctest_gpu("9"), 4); +} diff --git a/packages/kokkos/core/unit_test/TestCXX11.hpp b/packages/kokkos/core/unit_test/TestCXX11.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bbe0d01cbae2228afb61f2dfc51d81c81d95173e --- /dev/null +++ b/packages/kokkos/core/unit_test/TestCXX11.hpp @@ -0,0 +1,383 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace TestCXX11 { + +template <class DeviceType> +struct FunctorAddTest { + using view_type = Kokkos::View<double**, DeviceType>; + using execution_space = DeviceType; + using team_member = typename Kokkos::TeamPolicy<execution_space>::member_type; + + view_type a_, b_; + + FunctorAddTest(view_type& a, view_type& b) : a_(a), b_(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { + b_(i, 0) = a_(i, 1) + a_(i, 2); + b_(i, 1) = a_(i, 0) - a_(i, 3); + b_(i, 2) = a_(i, 4) + a_(i, 0); + b_(i, 3) = a_(i, 2) - a_(i, 1); + b_(i, 4) = a_(i, 3) + a_(i, 4); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member& dev) const { + const int begin = dev.league_rank() * 4; + const int end = begin + 4; + for (int i = begin + dev.team_rank(); i < end; i += dev.team_size()) { + b_(i, 0) = a_(i, 1) + a_(i, 2); + b_(i, 1) = a_(i, 0) - a_(i, 3); + b_(i, 2) = a_(i, 4) + a_(i, 0); + b_(i, 3) = a_(i, 2) - a_(i, 1); + b_(i, 4) = a_(i, 3) + a_(i, 4); + } + } +}; + +template <class DeviceType, bool PWRTest> +double AddTestFunctor() { + using policy_type = Kokkos::TeamPolicy<DeviceType>; + + Kokkos::View<double**, DeviceType> a("A", 100, 5); + Kokkos::View<double**, DeviceType> b("B", 100, 5); + typename Kokkos::View<double**, 
DeviceType>::HostMirror h_a = + Kokkos::create_mirror_view(a); + typename Kokkos::View<double**, DeviceType>::HostMirror h_b = + Kokkos::create_mirror_view(b); + + for (int i = 0; i < 100; i++) { + for (int j = 0; j < 5; j++) { + h_a(i, j) = 0.1 * i / (1.1 * j + 1.0) + 0.5 * j; + } + } + Kokkos::deep_copy(a, h_a); + + if (PWRTest == false) { + Kokkos::parallel_for(100, FunctorAddTest<DeviceType>(a, b)); + } else { + Kokkos::parallel_for(policy_type(25, Kokkos::AUTO), + FunctorAddTest<DeviceType>(a, b)); + } + Kokkos::deep_copy(h_b, b); + + double result = 0; + for (int i = 0; i < 100; i++) { + for (int j = 0; j < 5; j++) { + result += h_b(i, j); + } + } + + return result; +} + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +template <class DeviceType, bool PWRTest> +double AddTestLambda() { + Kokkos::View<double**, DeviceType> a("A", 100, 5); + Kokkos::View<double**, DeviceType> b("B", 100, 5); + typename Kokkos::View<double**, DeviceType>::HostMirror h_a = + Kokkos::create_mirror_view(a); + typename Kokkos::View<double**, DeviceType>::HostMirror h_b = + Kokkos::create_mirror_view(b); + + for (int i = 0; i < 100; i++) { + for (int j = 0; j < 5; j++) { + h_a(i, j) = 0.1 * i / (1.1 * j + 1.0) + 0.5 * j; + } + } + Kokkos::deep_copy(a, h_a); + + if (PWRTest == false) { + Kokkos::parallel_for( + 100, KOKKOS_LAMBDA(const int& i) { + b(i, 0) = a(i, 1) + a(i, 2); + b(i, 1) = a(i, 0) - a(i, 3); + b(i, 2) = a(i, 4) + a(i, 0); + b(i, 3) = a(i, 2) - a(i, 1); + b(i, 4) = a(i, 3) + a(i, 4); + }); + } else { + using policy_type = Kokkos::TeamPolicy<DeviceType>; + using team_member = typename policy_type::member_type; + + policy_type policy(25, Kokkos::AUTO); + + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(const team_member& dev) { + const unsigned int begin = dev.league_rank() * 4; + const unsigned int end = begin + 4; + for (unsigned int i = begin + dev.team_rank(); i < end; + i += dev.team_size()) { + b(i, 0) = a(i, 1) + a(i, 2); + b(i, 1) = a(i, 0) - a(i, 3); + b(i, 2) = 
a(i, 4) + a(i, 0); + b(i, 3) = a(i, 2) - a(i, 1); + b(i, 4) = a(i, 3) + a(i, 4); + } + }); + } + Kokkos::deep_copy(h_b, b); + + double result = 0; + for (int i = 0; i < 100; i++) { + for (int j = 0; j < 5; j++) { + result += h_b(i, j); + } + } + + return result; +} +#else +template <class DeviceType, bool PWRTest> +double AddTestLambda() { + return AddTestFunctor<DeviceType, PWRTest>(); +} +#endif + +template <class DeviceType> +struct FunctorReduceTest { + using view_type = Kokkos::View<double**, DeviceType>; + using execution_space = DeviceType; + using value_type = double; + using team_member = typename Kokkos::TeamPolicy<execution_space>::member_type; + + view_type a_; + + FunctorReduceTest(view_type& a) : a_(a) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, value_type& sum) const { + sum += a_(i, 1) + a_(i, 2); + sum += a_(i, 0) - a_(i, 3); + sum += a_(i, 4) + a_(i, 0); + sum += a_(i, 2) - a_(i, 1); + sum += a_(i, 3) + a_(i, 4); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member& dev, value_type& sum) const { + const int begin = dev.league_rank() * 4; + const int end = begin + 4; + for (int i = begin + dev.team_rank(); i < end; i += dev.team_size()) { + sum += a_(i, 1) + a_(i, 2); + sum += a_(i, 0) - a_(i, 3); + sum += a_(i, 4) + a_(i, 0); + sum += a_(i, 2) - a_(i, 1); + sum += a_(i, 3) + a_(i, 4); + } + } + + KOKKOS_INLINE_FUNCTION + void init(value_type& update) const { update = 0.0; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& update, + volatile value_type const& input) const { + update += input; + } +}; + +template <class DeviceType, bool PWRTest> +double ReduceTestFunctor() { + using policy_type = Kokkos::TeamPolicy<DeviceType>; + using view_type = Kokkos::View<double**, DeviceType>; + using unmanaged_result = + Kokkos::View<double, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + + view_type a("A", 100, 5); + typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a); + + for (int i = 0; i < 100; i++) 
{ + for (int j = 0; j < 5; j++) { + h_a(i, j) = 0.1 * i / (1.1 * j + 1.0) + 0.5 * j; + } + } + Kokkos::deep_copy(a, h_a); + + double result = 0.0; + if (PWRTest == false) { + Kokkos::parallel_reduce(100, FunctorReduceTest<DeviceType>(a), + unmanaged_result(&result)); + } else { + Kokkos::parallel_reduce(policy_type(25, Kokkos::AUTO), + FunctorReduceTest<DeviceType>(a), + unmanaged_result(&result)); + } + Kokkos::fence(); + + return result; +} + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +template <class DeviceType, bool PWRTest> +double ReduceTestLambda() { + using policy_type = Kokkos::TeamPolicy<DeviceType>; + using view_type = Kokkos::View<double**, DeviceType>; + using unmanaged_result = + Kokkos::View<double, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + + view_type a("A", 100, 5); + typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a); + + for (int i = 0; i < 100; i++) { + for (int j = 0; j < 5; j++) { + h_a(i, j) = 0.1 * i / (1.1 * j + 1.0) + 0.5 * j; + } + } + Kokkos::deep_copy(a, h_a); + + double result = 0.0; + + if (PWRTest == false) { + Kokkos::parallel_reduce( + 100, + KOKKOS_LAMBDA(const int& i, double& sum) { + sum += a(i, 1) + a(i, 2); + sum += a(i, 0) - a(i, 3); + sum += a(i, 4) + a(i, 0); + sum += a(i, 2) - a(i, 1); + sum += a(i, 3) + a(i, 4); + }, + unmanaged_result(&result)); + } else { + using team_member = typename policy_type::member_type; + Kokkos::parallel_reduce( + policy_type(25, Kokkos::AUTO), + KOKKOS_LAMBDA(const team_member& dev, double& sum) { + const unsigned int begin = dev.league_rank() * 4; + const unsigned int end = begin + 4; + for (unsigned int i = begin + dev.team_rank(); i < end; + i += dev.team_size()) { + sum += a(i, 1) + a(i, 2); + sum += a(i, 0) - a(i, 3); + sum += a(i, 4) + a(i, 0); + sum += a(i, 2) - a(i, 1); + sum += a(i, 3) + a(i, 4); + } + }, + unmanaged_result(&result)); + } + Kokkos::fence(); + + return result; +} +#else +template <class DeviceType, bool PWRTest> +double ReduceTestLambda() { 
+ return ReduceTestFunctor<DeviceType, PWRTest>(); +} +#endif + +template <class DeviceType> +double TestVariantLambda(int test) { + switch (test) { + case 1: return AddTestLambda<DeviceType, false>(); + case 2: return AddTestLambda<DeviceType, true>(); + case 3: return ReduceTestLambda<DeviceType, false>(); + case 4: return ReduceTestLambda<DeviceType, true>(); + } + + return 0; +} + +template <class DeviceType> +double TestVariantFunctor(int test) { + switch (test) { + case 1: return AddTestFunctor<DeviceType, false>(); + case 2: return AddTestFunctor<DeviceType, true>(); + case 3: return ReduceTestFunctor<DeviceType, false>(); + case 4: return ReduceTestFunctor<DeviceType, true>(); + } + + return 0; +} + +template <class DeviceType> +bool Test(int test) { +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + double res_functor = TestVariantFunctor<DeviceType>(test); + double res_lambda = TestVariantLambda<DeviceType>(test); + + char testnames[5][256] = {" ", "AddTest", "AddTest TeamPolicy", "ReduceTest", + "ReduceTest TeamPolicy"}; + bool passed = true; + + auto a = res_functor; + auto b = res_lambda; + // use a tolerant comparison because functors and lambdas vectorize + // differently https://github.com/trilinos/Trilinos/issues/3233 + auto rel_err = (std::abs(b - a) / std::max(std::abs(a), std::abs(b))); + auto tol = 1e-14; + if (rel_err > tol) { + passed = false; + + std::cout << "CXX11 ( test = '" << testnames[test] + << "' FAILED : relative error " << rel_err << " > tolerance " + << tol << std::endl; + } + + return passed; +#else + (void)test; + return true; +#endif +} + +} // namespace TestCXX11 + +namespace Test { +TEST(TEST_CATEGORY, cxx11) { + if (std::is_same<Kokkos::DefaultExecutionSpace, TEST_EXECSPACE>::value) { + ASSERT_TRUE((TestCXX11::Test<TEST_EXECSPACE>(1))); + ASSERT_TRUE((TestCXX11::Test<TEST_EXECSPACE>(2))); + ASSERT_TRUE((TestCXX11::Test<TEST_EXECSPACE>(3))); + ASSERT_TRUE((TestCXX11::Test<TEST_EXECSPACE>(4))); + } +} + +} // namespace Test diff 
--git a/packages/kokkos/core/unit_test/TestCXX11Deduction.hpp b/packages/kokkos/core/unit_test/TestCXX11Deduction.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c7efab2711f283c02fad350bd5a8163ebc4b3a91 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestCXX11Deduction.hpp @@ -0,0 +1,108 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#ifndef TESTCXX11DEDUCTION_HPP +#define TESTCXX11DEDUCTION_HPP + +namespace TestCXX11 { + +struct TestReductionDeductionTagA {}; +struct TestReductionDeductionTagB {}; + +template <class ExecSpace> +struct TestReductionDeductionFunctor { + // KOKKOS_INLINE_FUNCTION + // void operator()( long i, long & value ) const + // { value += i + 1; } + + KOKKOS_INLINE_FUNCTION + void operator()(TestReductionDeductionTagA, long i, long &value) const { + value += (2 * i + 1) + (2 * i + 2); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TestReductionDeductionTagB &, const long i, + long &value) const { + value += (3 * i + 1) + (3 * i + 2) + (3 * i + 3); + } +}; + +template <class ExecSpace> +void test_reduction_deduction() { + using Functor = TestReductionDeductionFunctor<ExecSpace>; + + const long N = 50; + // const long answer = N % 2 ? ( N * ( ( N + 1 ) / 2 ) ) : ( ( N / 2 ) * ( N + // + 1 ) ); + const long answerA = + N % 2 ? ((2 * N) * (((2 * N) + 1) / 2)) : (((2 * N) / 2) * ((2 * N) + 1)); + const long answerB = + N % 2 ? 
((3 * N) * (((3 * N) + 1) / 2)) : (((3 * N) / 2) * ((3 * N) + 1)); + long result = 0; + + // Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), + // Functor(), result ); ASSERT_EQ( answer, result ); + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace, TestReductionDeductionTagA>(0, N), + Functor(), result); + ASSERT_EQ(answerA, result); + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace, TestReductionDeductionTagB>(0, N), + Functor(), result); + ASSERT_EQ(answerB, result); +} + +} // namespace TestCXX11 + +namespace Test { + +TEST(TEST_CATEGORY, reduction_deduction) { + TestCXX11::test_reduction_deduction<TEST_EXECSPACE>(); +} +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestCompilerMacros.hpp b/packages/kokkos/core/unit_test/TestCompilerMacros.hpp new file mode 100644 index 0000000000000000000000000000000000000000..273e87ccc3f140b2648b78de49a7ab512d26cd6b --- /dev/null +++ b/packages/kokkos/core/unit_test/TestCompilerMacros.hpp @@ -0,0 +1,114 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#if defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA shouldn't be defined" +#endif +#else +#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA should be defined" +#endif +#endif + +#define KOKKOS_PRAGMA_UNROLL(a) + +namespace TestCompilerMacros { + +template <class DEVICE_TYPE> +struct AddFunctor { + using execution_space = DEVICE_TYPE; + using type = typename Kokkos::View<int**, execution_space>; + type a, b; + int length; + + AddFunctor(type a_, type b_) : a(a_), b(b_), length(a.extent(1)) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep 
+#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT +#pragma loop count(128) +#endif +#ifndef KOKKOS_ENABLE_DEBUG +#ifdef KOKKOS_ENABLE_PRAGMA_SIMD +#pragma simd +#endif +#endif + for (int j = 0; j < length; j++) { + a(i, j) += b(i, j); + } + } +}; + +template <class DeviceType> +bool Test() { + using type = typename Kokkos::View<int**, DeviceType>; + type a("A", 1024, 128); + type b("B", 1024, 128); + + AddFunctor<DeviceType> f(a, b); + Kokkos::parallel_for(1024, f); + DeviceType().fence(); + + return true; +} + +} // namespace TestCompilerMacros + +namespace Test { +TEST(TEST_CATEGORY, compiler_macros) { + ASSERT_TRUE((TestCompilerMacros::Test<TEST_EXECSPACE>())); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestComplex.hpp b/packages/kokkos/core/unit_test/TestComplex.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b926058ebf990b0c7d0bff6f4c22b5bd4c12e2e8 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestComplex.hpp @@ -0,0 +1,518 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> +#include <sstream> + +namespace Test { + +// Test construction and assignment + +template <class ExecSpace> +struct TestComplexConstruction { + Kokkos::View<Kokkos::complex<double> *, ExecSpace> d_results; + typename Kokkos::View<Kokkos::complex<double> *, ExecSpace>::HostMirror + h_results; + + void testit() { + d_results = Kokkos::View<Kokkos::complex<double> *, ExecSpace>( + "TestComplexConstruction", 10); + h_results = Kokkos::create_mirror_view(d_results); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1), *this); + Kokkos::fence(); + Kokkos::deep_copy(h_results, d_results); + + ASSERT_FLOAT_EQ(h_results(0).real(), 1.5); + ASSERT_FLOAT_EQ(h_results(0).imag(), 2.5); + ASSERT_FLOAT_EQ(h_results(1).real(), 1.5); + ASSERT_FLOAT_EQ(h_results(1).imag(), 2.5); + ASSERT_FLOAT_EQ(h_results(2).real(), 0.0); + 
ASSERT_FLOAT_EQ(h_results(2).imag(), 0.0); + ASSERT_FLOAT_EQ(h_results(3).real(), 3.5); + ASSERT_FLOAT_EQ(h_results(3).imag(), 0.0); + ASSERT_FLOAT_EQ(h_results(4).real(), 4.5); + ASSERT_FLOAT_EQ(h_results(4).imag(), 5.5); + ASSERT_FLOAT_EQ(h_results(5).real(), 1.5); + ASSERT_FLOAT_EQ(h_results(5).imag(), 2.5); + ASSERT_FLOAT_EQ(h_results(6).real(), 4.5); + ASSERT_FLOAT_EQ(h_results(6).imag(), 5.5); + ASSERT_FLOAT_EQ(h_results(7).real(), 7.5); + ASSERT_FLOAT_EQ(h_results(7).imag(), 0.0); + ASSERT_FLOAT_EQ(h_results(8).real(), double(8)); + ASSERT_FLOAT_EQ(h_results(8).imag(), 0.0); + + // Copy construction conversion between + // Kokkos::complex and std::complex doesn't compile + Kokkos::complex<double> a(1.5, 2.5), b(3.25, 5.25), r_kk; + std::complex<double> sa(a), sb(3.25, 5.25), r; + r = a; + r_kk = a; + ASSERT_FLOAT_EQ(r.real(), r_kk.real()); + ASSERT_FLOAT_EQ(r.imag(), r_kk.imag()); + r = sb * a; + r_kk = b * a; + ASSERT_FLOAT_EQ(r.real(), r_kk.real()); + ASSERT_FLOAT_EQ(r.imag(), r_kk.imag()); + r = sa; + r_kk = a; + ASSERT_FLOAT_EQ(r.real(), r_kk.real()); + ASSERT_FLOAT_EQ(r.imag(), r_kk.imag()); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int & /*i*/) const { + Kokkos::complex<double> a(1.5, 2.5); + d_results(0) = a; + Kokkos::complex<double> b(a); + d_results(1) = b; + Kokkos::complex<double> c = Kokkos::complex<double>(); + d_results(2) = c; + Kokkos::complex<double> d(3.5); + d_results(3) = d; + volatile Kokkos::complex<double> a_v(4.5, 5.5); + d_results(4) = a_v; + volatile Kokkos::complex<double> b_v(a); + d_results(5) = b_v; + Kokkos::complex<double> e(a_v); + d_results(6) = e; + + d_results(7) = double(7.5); + d_results(8) = int(8); + } +}; + +TEST(TEST_CATEGORY, complex_construction) { + TestComplexConstruction<TEST_EXECSPACE> test; + test.testit(); +} + +// Test Math FUnction + +template <class ExecSpace> +struct TestComplexBasicMath { + Kokkos::View<Kokkos::complex<double> *, ExecSpace> d_results; + typename 
Kokkos::View<Kokkos::complex<double> *, ExecSpace>::HostMirror + h_results; + + void testit() { + d_results = Kokkos::View<Kokkos::complex<double> *, ExecSpace>( + "TestComplexBasicMath", 24); + h_results = Kokkos::create_mirror_view(d_results); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1), *this); + Kokkos::fence(); + Kokkos::deep_copy(h_results, d_results); + + std::complex<double> a(1.5, 2.5); + std::complex<double> b(3.25, 5.75); + std::complex<double> d(1.0, 2.0); + double c = 9.3; + int e = 2; + + std::complex<double> r; + r = a + b; + ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag()); + r = a - b; + ASSERT_FLOAT_EQ(h_results(1).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(1).imag(), r.imag()); + r = a * b; + ASSERT_FLOAT_EQ(h_results(2).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(2).imag(), r.imag()); + r = a / b; +#ifndef KOKKOS_WORKAROUND_OPENMPTARGET_CLANG + ASSERT_FLOAT_EQ(h_results(3).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(3).imag(), r.imag()); +#endif + r = d + a; + ASSERT_FLOAT_EQ(h_results(4).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(4).imag(), r.imag()); + r = d - a; + ASSERT_FLOAT_EQ(h_results(5).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(5).imag(), r.imag()); + r = d * a; + ASSERT_FLOAT_EQ(h_results(6).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(6).imag(), r.imag()); + r = d / a; + ASSERT_FLOAT_EQ(h_results(7).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(7).imag(), r.imag()); + r = a + c; + ASSERT_FLOAT_EQ(h_results(8).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(8).imag(), r.imag()); + r = a - c; + ASSERT_FLOAT_EQ(h_results(9).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(9).imag(), r.imag()); + r = a * c; + ASSERT_FLOAT_EQ(h_results(10).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(10).imag(), r.imag()); + r = a / c; + ASSERT_FLOAT_EQ(h_results(11).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(11).imag(), r.imag()); + r = d + c; + 
ASSERT_FLOAT_EQ(h_results(12).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(12).imag(), r.imag()); + r = d - c; + ASSERT_FLOAT_EQ(h_results(13).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(13).imag(), r.imag()); + r = d * c; + ASSERT_FLOAT_EQ(h_results(14).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(14).imag(), r.imag()); + r = d / c; + ASSERT_FLOAT_EQ(h_results(15).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(15).imag(), r.imag()); + r = c + a; + ASSERT_FLOAT_EQ(h_results(16).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(16).imag(), r.imag()); + r = c - a; + ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag()); + r = c * a; + ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag()); + r = c / a; +#ifndef KOKKOS_WORKAROUND_OPENMPTARGET_CLANG + ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag()); +#endif + + r = a; + /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real() + e); + ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag()); + /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real() - e); + ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag()); + /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real() * e); + ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag() * e); + /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real() / 2); + ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag() / e); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int & /*i*/) const { + Kokkos::complex<double> a(1.5, 2.5); + Kokkos::complex<double> b(3.25, 5.75); + // Basic math complex / complex + d_results(0) = a + b; + d_results(1) = a - b; + d_results(2) = a * b; + d_results(3) = a / b; + d_results(4).real(1.0); + d_results(4).imag(2.0); + d_results(4) += a; + d_results(5) = Kokkos::complex<double>(1.0, 2.0); + d_results(5) -= a; + d_results(6) = Kokkos::complex<double>(1.0, 2.0); + d_results(6) *= a; + d_results(7) = 
Kokkos::complex<double>(1.0, 2.0); + d_results(7) /= a; + + // Basic math complex / scalar + double c = 9.3; + d_results(8) = a + c; + d_results(9) = a - c; + d_results(10) = a * c; + d_results(11) = a / c; + d_results(12).real(1.0); + d_results(12).imag(2.0); + d_results(12) += c; + d_results(13) = Kokkos::complex<double>(1.0, 2.0); + d_results(13) -= c; + d_results(14) = Kokkos::complex<double>(1.0, 2.0); + d_results(14) *= c; + d_results(15) = Kokkos::complex<double>(1.0, 2.0); + d_results(15) /= c; + + // Basic math scalar / complex + d_results(16) = c + a; + d_results(17) = c - a; + d_results(18) = c * a; + d_results(19) = c / a; + + int e = 2; + d_results(20) = a + e; + d_results(21) = a - e; + d_results(22) = a * e; + d_results(23) = a / e; + } +}; + +TEST(TEST_CATEGORY, complex_basic_math) { + TestComplexBasicMath<TEST_EXECSPACE> test; + test.testit(); +} + +template <class ExecSpace> +struct TestComplexSpecialFunctions { + Kokkos::View<Kokkos::complex<double> *, ExecSpace> d_results; + typename Kokkos::View<Kokkos::complex<double> *, ExecSpace>::HostMirror + h_results; + + void testit() { + d_results = Kokkos::View<Kokkos::complex<double> *, ExecSpace>( + "TestComplexSpecialFunctions", 20); + h_results = Kokkos::create_mirror_view(d_results); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1), *this); + Kokkos::fence(); + Kokkos::deep_copy(h_results, d_results); + + std::complex<double> a(1.5, 2.5); + double c = 9.3; + + std::complex<double> r; + r = a; + ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag()); + r = std::sqrt(a); + ASSERT_FLOAT_EQ(h_results(1).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(1).imag(), r.imag()); + r = std::pow(a, c); + ASSERT_FLOAT_EQ(h_results(2).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(2).imag(), r.imag()); + r = std::abs(a); + ASSERT_FLOAT_EQ(h_results(3).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(3).imag(), r.imag()); + r = std::exp(a); + 
ASSERT_FLOAT_EQ(h_results(4).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(4).imag(), r.imag()); + r = Kokkos::exp(a); + ASSERT_FLOAT_EQ(h_results(4).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(4).imag(), r.imag()); +#ifndef KOKKOS_WORKAROUND_OPENMPTARGET_CLANG + r = std::log(a); + ASSERT_FLOAT_EQ(h_results(5).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(5).imag(), r.imag()); + r = std::sin(a); + ASSERT_FLOAT_EQ(h_results(6).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(6).imag(), r.imag()); + r = std::cos(a); + ASSERT_FLOAT_EQ(h_results(7).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(7).imag(), r.imag()); + r = std::tan(a); + ASSERT_FLOAT_EQ(h_results(8).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(8).imag(), r.imag()); + r = std::sinh(a); + ASSERT_FLOAT_EQ(h_results(9).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(9).imag(), r.imag()); + r = std::cosh(a); + ASSERT_FLOAT_EQ(h_results(10).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(10).imag(), r.imag()); + r = std::tanh(a); + ASSERT_FLOAT_EQ(h_results(11).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(11).imag(), r.imag()); + r = std::asinh(a); + ASSERT_FLOAT_EQ(h_results(12).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(12).imag(), r.imag()); + r = std::acosh(a); + ASSERT_FLOAT_EQ(h_results(13).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(13).imag(), r.imag()); + r = std::atanh(a); + ASSERT_FLOAT_EQ(h_results(14).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(14).imag(), r.imag()); + r = std::asin(a); + ASSERT_FLOAT_EQ(h_results(15).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(15).imag(), r.imag()); + r = std::acos(a); + ASSERT_FLOAT_EQ(h_results(16).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(16).imag(), r.imag()); + r = std::atan(a); + ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); + ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag()); +#endif + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int & /*i*/) const { + Kokkos::complex<double> a(1.5, 2.5); + Kokkos::complex<double> b(3.25, 
5.75); + double c = 9.3; + + d_results(0) = Kokkos::complex<double>(Kokkos::real(a), Kokkos::imag(a)); + d_results(1) = Kokkos::sqrt(a); + d_results(2) = Kokkos::pow(a, c); + d_results(3) = Kokkos::abs(a); + d_results(4) = Kokkos::exp(a); + d_results(5) = Kokkos::log(a); + d_results(6) = Kokkos::sin(a); + d_results(7) = Kokkos::cos(a); + d_results(8) = Kokkos::tan(a); + d_results(9) = Kokkos::sinh(a); + d_results(10) = Kokkos::cosh(a); + d_results(11) = Kokkos::tanh(a); + d_results(12) = Kokkos::asinh(a); + d_results(13) = Kokkos::acosh(a); + d_results(14) = Kokkos::atanh(a); + d_results(15) = Kokkos::asin(a); + d_results(16) = Kokkos::acos(a); + d_results(17) = Kokkos::atan(a); + } +}; + +void testComplexIO() { + Kokkos::complex<double> z = {3.14, 1.41}; + std::stringstream ss; + ss << z; + ASSERT_EQ(ss.str(), "(3.14,1.41)"); + + ss.str("1 (2) (3,4)"); + ss.clear(); + ss >> z; + ASSERT_EQ(z, (Kokkos::complex<double>{1, 0})); + ss >> z; + ASSERT_EQ(z, (Kokkos::complex<double>{2, 0})); + ss >> z; + ASSERT_EQ(z, (Kokkos::complex<double>{3, 4})); +} + +TEST(TEST_CATEGORY, complex_special_funtions) { + TestComplexSpecialFunctions<TEST_EXECSPACE> test; + test.testit(); +} + +TEST(TEST_CATEGORY, complex_io) { testComplexIO(); } + +TEST(TEST_CATEGORY, complex_trivially_copyable) { + // Kokkos::complex<RealType> is trivially copyable when RealType is + // trivially copyable + // Simply disable the check for IBM's XL compiler since we can't reliably + // check for a version that defines relevant functions. +#if !defined(__ibmxl__) + using RealType = double; + // clang claims compatibility with gcc 4.2.1 but all versions tested know + // about std::is_trivially_copyable. 
+ ASSERT_TRUE(std::is_trivially_copyable<Kokkos::complex<RealType>>::value || + !std::is_trivially_copyable<RealType>::value); +#endif +} + +template <class ExecSpace> +struct TestBugPowAndLogComplex { + Kokkos::View<Kokkos::complex<double> *, ExecSpace> d_pow; + Kokkos::View<Kokkos::complex<double> *, ExecSpace> d_log; + TestBugPowAndLogComplex() : d_pow("pow", 2), d_log("log", 2) { test(); } + void test() { + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 1), *this); + auto h_pow = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d_pow); + ASSERT_FLOAT_EQ(h_pow(0).real(), 18); + ASSERT_FLOAT_EQ(h_pow(0).imag(), 26); + ASSERT_FLOAT_EQ(h_pow(1).real(), -18); + ASSERT_FLOAT_EQ(h_pow(1).imag(), 26); + auto h_log = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d_log); + ASSERT_FLOAT_EQ(h_log(0).real(), 1.151292546497023); + ASSERT_FLOAT_EQ(h_log(0).imag(), 0.3217505543966422); + ASSERT_FLOAT_EQ(h_log(1).real(), 1.151292546497023); + ASSERT_FLOAT_EQ(h_log(1).imag(), 2.819842099193151); + } + KOKKOS_FUNCTION void operator()(int) const { + d_pow(0) = Kokkos::pow(Kokkos::complex<double>(+3., 1.), 3.); + d_pow(1) = Kokkos::pow(Kokkos::complex<double>(-3., 1.), 3.); + d_log(0) = Kokkos::log(Kokkos::complex<double>(+3., 1.)); + d_log(1) = Kokkos::log(Kokkos::complex<double>(-3., 1.)); + } +}; + +TEST(TEST_CATEGORY, complex_issue_3865) { + TestBugPowAndLogComplex<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, complex_issue_3867) { + ASSERT_EQ(Kokkos::pow(Kokkos::complex<double>(2., 1.), 3.), + Kokkos::pow(Kokkos::complex<double>(2., 1.), 3)); + ASSERT_EQ( + Kokkos::pow(Kokkos::complex<double>(2., 1.), 3.), + Kokkos::pow(Kokkos::complex<double>(2., 1.), Kokkos::complex<double>(3))); + + auto x = Kokkos::pow(Kokkos::complex<double>(2, 1), + Kokkos::complex<double>(-3, 4)); + auto y = Kokkos::complex<double>( + std::pow(std::complex<double>(2, 1), std::complex<double>(-3, 4))); + ASSERT_FLOAT_EQ(x.real(), y.real()); + ASSERT_FLOAT_EQ(x.imag(), 
y.imag()); + +#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ + static_assert( \ + std::is_same<RETURNTYPE, \ + decltype(Kokkos::pow(std::declval<ARGTYPE1>(), \ + std::declval<ARGTYPE2>()))>::value, \ + ""); \ + static_assert( \ + std::is_same<RETURNTYPE, \ + decltype(Kokkos::pow(std::declval<ARGTYPE2>(), \ + std::declval<ARGTYPE1>()))>::value, \ + ""); + + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<long double>, long double, + Kokkos::complex<long double>); + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<long double>, double, + Kokkos::complex<long double>); + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<long double>, float, + Kokkos::complex<long double>); + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<long double>, int, + Kokkos::complex<long double>); + + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<double>, long double, + Kokkos::complex<long double>); + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<double>, double, + Kokkos::complex<double>); + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<double>, float, + Kokkos::complex<double>); + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<double>, int, + Kokkos::complex<double>); + + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<float>, long double, + Kokkos::complex<long double>); + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<float>, double, + Kokkos::complex<double>); + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<float>, float, + Kokkos::complex<float>); + CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex<float>, int, + Kokkos::complex<double>); + +#undef CHECK_POW_COMPLEX_PROMOTION +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestConcepts.hpp b/packages/kokkos/core/unit_test/TestConcepts.hpp new file mode 100644 index 0000000000000000000000000000000000000000..561302f2a0f863e5857727b821a9f07867058f53 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestConcepts.hpp @@ -0,0 +1,87 @@ +/* +//@HEADER +// ************************************************************************ +// +// 
Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace TestConcept { + +using ExecutionSpace = TEST_EXECSPACE; +using MemorySpace = typename ExecutionSpace::memory_space; +using DeviceType = typename ExecutionSpace::device_type; + +static_assert(Kokkos::is_execution_space<ExecutionSpace>{}, ""); +static_assert(Kokkos::is_execution_space<ExecutionSpace const>{}, ""); +static_assert(!Kokkos::is_execution_space<ExecutionSpace &>{}, ""); +static_assert(!Kokkos::is_execution_space<ExecutionSpace const &>{}, ""); + +static_assert(Kokkos::is_memory_space<MemorySpace>{}, ""); +static_assert(Kokkos::is_memory_space<MemorySpace const>{}, ""); +static_assert(!Kokkos::is_memory_space<MemorySpace &>{}, ""); +static_assert(!Kokkos::is_memory_space<MemorySpace const &>{}, ""); + +static_assert(Kokkos::is_device<DeviceType>{}, ""); +static_assert(Kokkos::is_device<DeviceType const>{}, ""); +static_assert(!Kokkos::is_device<DeviceType &>{}, ""); +static_assert(!Kokkos::is_device<DeviceType const &>{}, ""); + +static_assert(!Kokkos::is_device<ExecutionSpace>{}, ""); +static_assert(!Kokkos::is_device<MemorySpace>{}, ""); + +static_assert(Kokkos::is_space<ExecutionSpace>{}, ""); +static_assert(Kokkos::is_space<MemorySpace>{}, ""); +static_assert(Kokkos::is_space<DeviceType>{}, ""); +static_assert(Kokkos::is_space<ExecutionSpace const>{}, ""); +static_assert(Kokkos::is_space<MemorySpace const>{}, ""); +static_assert(Kokkos::is_space<DeviceType const>{}, ""); +static_assert(!Kokkos::is_space<ExecutionSpace &>{}, ""); +static_assert(!Kokkos::is_space<MemorySpace &>{}, ""); +static_assert(!Kokkos::is_space<DeviceType &>{}, ""); + +static_assert( + std::is_same<float, Kokkos::Impl::remove_cvref_t<float const &>>{}, ""); +static_assert(std::is_same<int, Kokkos::Impl::remove_cvref_t<int &>>{}, ""); +static_assert(std::is_same<int, Kokkos::Impl::remove_cvref_t<int const>>{}, ""); 
+static_assert(std::is_same<float, Kokkos::Impl::remove_cvref_t<float>>{}, ""); + +} // namespace TestConcept diff --git a/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp b/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5a7b8e4bae55565c277637e8f04909029ea37ba7 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp @@ -0,0 +1,169 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_CONCURRENTBITSET_HPP +#define TEST_CONCURRENTBITSET_HPP + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <impl/Kokkos_ConcurrentBitset.hpp> + +namespace Test { + +template <class DeviceType> +struct ConcurrentBitset { + using view_unsigned_type = Kokkos::View<uint32_t*, DeviceType>; + using view_int_type = Kokkos::View<int*, DeviceType>; + + view_unsigned_type bitset; + view_int_type acquired; + uint32_t bitset_count_lg2; + uint32_t bitset_count_mask; + + ConcurrentBitset(const uint32_t arg_bitset_count_lg2, + const view_unsigned_type& arg_bitset, + const view_int_type& arg_acquired) + : bitset(arg_bitset), + acquired(arg_acquired), + bitset_count_lg2(arg_bitset_count_lg2), + bitset_count_mask(uint32_t(1u << arg_bitset_count_lg2) - 1) {} + + struct TagAcquire {}; + struct TagRelease {}; + struct TagReacquire {}; + + KOKKOS_INLINE_FUNCTION + void operator()(TagAcquire, int i, long& update) const { + unsigned hint = Kokkos::Impl::clock_tic() & bitset_count_mask; + + Kokkos::pair<int, int> result = + Kokkos::Impl::concurrent_bitset::acquire_bounded_lg2( + bitset.data(), bitset_count_lg2, hint); + + acquired(i) = result.first; + + if (0 <= result.first) ++update; + } + + KOKKOS_INLINE_FUNCTION + void 
operator()(TagRelease, int i, long& update) const { + if (0 == (i % 3) && 0 <= acquired(i)) { + Kokkos::Impl::concurrent_bitset::release(bitset.data(), acquired(i)); + acquired(i) = -1; + ++update; + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(TagReacquire, int i, long& update) const { + if (acquired(i) < 0) { + unsigned hint = Kokkos::Impl::clock_tic() & bitset_count_mask; + + Kokkos::pair<int, int> result = + Kokkos::Impl::concurrent_bitset::acquire_bounded_lg2( + bitset.data(), bitset_count_lg2, hint); + + acquired(i) = result.first; + + if (0 <= result.first) ++update; + } + } +}; + +template <class DeviceType> +void test_concurrent_bitset(int bit_count) { + using Functor = ConcurrentBitset<DeviceType>; + using view_unsigned_type = typename Functor::view_unsigned_type; + using view_int_type = typename Functor::view_int_type; + + int bit_count_lg2 = 1; + + while ((1 << bit_count_lg2) < bit_count) ++bit_count_lg2; + + bit_count = 1 << bit_count_lg2; + + const int buffer_length = + Kokkos::Impl::concurrent_bitset::buffer_bound_lg2(bit_count_lg2); + + view_unsigned_type bitset("bitset", buffer_length); + + // Try to acquire more than available: + + const size_t n = (bit_count * 3) / 2; + + view_int_type acquired("acquired", n); + + typename view_unsigned_type::HostMirror bitset_host = + Kokkos::create_mirror_view(bitset); + + Kokkos::deep_copy(bitset, 0u); + + long total = 0; + long total_release = 0; + long total_reacquire = 0; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<DeviceType, typename Functor::TagAcquire>(0, n), + Functor(bit_count_lg2, bitset, acquired), total); + + ASSERT_EQ(bit_count, total); + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<DeviceType, typename Functor::TagRelease>(0, n), + Functor(bit_count_lg2, bitset, acquired), total_release); + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<DeviceType, typename Functor::TagReacquire>(0, n), + Functor(bit_count_lg2, bitset, acquired), total_reacquire); + + ASSERT_EQ(total_release, 
total_reacquire); +} + +} // namespace Test + +#endif /* #ifndef TEST_CONCURRENTBITSET_HPP */ diff --git a/packages/kokkos/core/unit_test/TestCrs.hpp b/packages/kokkos/core/unit_test/TestCrs.hpp new file mode 100644 index 0000000000000000000000000000000000000000..78208c911606379ff2635ddfdf940d6251973d9f --- /dev/null +++ b/packages/kokkos/core/unit_test/TestCrs.hpp @@ -0,0 +1,226 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <vector> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace { + +template <class ExecSpace> +struct CountFillFunctor { + KOKKOS_INLINE_FUNCTION + std::int32_t operator()(std::int32_t row, float *fill) const { + auto n = (row % 4) + 1; + if (fill) { + for (std::int32_t j = 0; j < n; ++j) { + fill[j] = j + 1; + } + } + return n; + } +}; + +/* RunUpdateCrsTest + * 4 test cases: + * 1. use member object version which is constructed directly using the copy + * constructor + * 2. excplicity copy construct in local variable + * 3. construct default and assign to input object + * 4. 
construct object from views + */ +template <class CrsType, class ExecSpace, class scalarType> +struct RunUpdateCrsTest { + struct TestOne {}; + struct TestTwo {}; + struct TestThree {}; + struct TestFour {}; + + CrsType graph; + RunUpdateCrsTest(CrsType g_in) : graph(g_in) {} + + void run_test(int nTest) { + switch (nTest) { + case 1: + parallel_for( + "TestCrs1", + Kokkos::RangePolicy<ExecSpace, TestOne>(0, graph.numRows()), *this); + break; + case 2: + parallel_for( + "TestCrs2", + Kokkos::RangePolicy<ExecSpace, TestTwo>(0, graph.numRows()), *this); + break; + case 3: + parallel_for( + "TestCrs3", + Kokkos::RangePolicy<ExecSpace, TestThree>(0, graph.numRows()), + *this); + break; + case 4: + parallel_for( + "TestCrs4", + Kokkos::RangePolicy<ExecSpace, TestFour>(0, graph.numRows()), + *this); + break; + default: break; + } + } + + KOKKOS_INLINE_FUNCTION + void updateGraph(const CrsType &g_in, const scalarType row) const { + auto row_map = g_in.row_map; + auto entries = g_in.entries; + auto j_start = row_map(row); + auto j_end = row_map(row + 1) - j_start; + for (scalarType j = 0; j < j_end; ++j) { + entries(j_start + j) = (j + 1) * (j + 1); + } + } + + // Test Crs class from class member + KOKKOS_INLINE_FUNCTION + void operator()(const TestOne &, const scalarType row) const { + updateGraph(graph, row); + } + + // Test Crs class from copy constructor (local_graph(graph) + KOKKOS_INLINE_FUNCTION + void operator()(const TestTwo &, const scalarType row) const { + CrsType local_graph(graph); + updateGraph(local_graph, row); + } + + // Test Crs class from default constructor assigned to function parameter + KOKKOS_INLINE_FUNCTION + void operator()(const TestThree &, const scalarType row) const { + CrsType local_graph; + local_graph = graph; + updateGraph(local_graph, row); + } + + // Test Crs class from local graph constructed from row_map and entities + // access on input parameter) + KOKKOS_INLINE_FUNCTION + void operator()(const TestFour &, const scalarType row) 
const {
    CrsType local_graph(graph.row_map, graph.entries);
    updateGraph(local_graph, row);
  }
};

// Build a Crs graph with count_and_fill_crs and verify on the host that
// every row has (row % 4) + 1 entries holding the values 1..n.
template <class ExecSpace>
void test_count_fill(std::int32_t nrows) {
  Kokkos::Crs<float, ExecSpace, void, std::int32_t> graph;
  Kokkos::count_and_fill_crs(graph, nrows, CountFillFunctor<ExecSpace>());
  ASSERT_EQ(graph.numRows(), nrows);
  // Mirror row_map and entries back to the host for verification.
  auto row_map = Kokkos::create_mirror_view(graph.row_map);
  Kokkos::deep_copy(row_map, graph.row_map);
  auto entries = Kokkos::create_mirror_view(graph.entries);
  Kokkos::deep_copy(entries, graph.entries);
  for (std::int32_t row = 0; row < nrows; ++row) {
    auto n = (row % 4) + 1;
    ASSERT_EQ(row_map(row + 1) - row_map(row), n);
    for (std::int32_t j = 0; j < n; ++j) {
      ASSERT_EQ(entries(row_map(row) + j), j + 1);
    }
  }
}

// Test Crs Constructor / assignment operation by
// using count and fill to create/populate initial graph,
// then use parallel_for with Crs directly to update content
// then verify results
template <class ExecSpace>
void test_constructor(std::int32_t nrows) {
  for (int nTest = 1; nTest < 5; nTest++) {
    using crs_type = Kokkos::Crs<float, ExecSpace, void, std::int32_t>;
    crs_type graph;
    Kokkos::count_and_fill_crs(graph, nrows, CountFillFunctor<ExecSpace>());
    ASSERT_EQ(graph.numRows(), nrows);

    RunUpdateCrsTest<crs_type, ExecSpace, std::int32_t> crstest(graph);
    crstest.run_test(nTest);

    auto row_map = Kokkos::create_mirror_view(graph.row_map);
    Kokkos::deep_copy(row_map, graph.row_map);
    auto entries = Kokkos::create_mirror_view(graph.entries);
    Kokkos::deep_copy(entries, graph.entries);

    // After run_test the entries must hold the squared values written by
    // RunUpdateCrsTest::updateGraph, not the original 1..n fill.
    for (std::int32_t row = 0; row < nrows; ++row) {
      auto n = (row % 4) + 1;
      ASSERT_EQ(row_map(row + 1) - row_map(row), n);
      for (std::int32_t j = 0; j < n; ++j) {
        ASSERT_EQ(entries(row_map(row) + j), (j + 1) * (j + 1));
      }
    }
  }
}

}  // anonymous namespace

TEST(TEST_CATEGORY, crs_count_fill) {
  test_count_fill<TEST_EXECSPACE>(0);
  test_count_fill<TEST_EXECSPACE>(1);
  test_count_fill<TEST_EXECSPACE>(2);
  test_count_fill<TEST_EXECSPACE>(3);
  test_count_fill<TEST_EXECSPACE>(13);
  test_count_fill<TEST_EXECSPACE>(100);
  test_count_fill<TEST_EXECSPACE>(1000);
  test_count_fill<TEST_EXECSPACE>(10000);
}

TEST(TEST_CATEGORY, crs_copy_constructor) {
  test_constructor<TEST_EXECSPACE>(0);
  test_constructor<TEST_EXECSPACE>(1);
  test_constructor<TEST_EXECSPACE>(2);
  test_constructor<TEST_EXECSPACE>(3);
  test_constructor<TEST_EXECSPACE>(13);
  test_constructor<TEST_EXECSPACE>(100);
  test_constructor<TEST_EXECSPACE>(1000);
  test_constructor<TEST_EXECSPACE>(10000);
}

}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..49f8daf89eabca9b3aa7e1f06d7a10ceb23a6a24
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
#include <Kokkos_Core.hpp>

namespace Test {

#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
namespace Impl {
// Exercises Kokkos::deep_copy between MemorySpaceA and MemorySpaceB through
// char views carved out of double-backed allocations, at various
// begin/end (mis)alignments relative to the 8-byte base allocation.
template <class MemorySpaceA, class MemorySpaceB>
struct TestDeepCopy {
  using a_base_t = Kokkos::View<double*, Kokkos::LayoutRight, MemorySpaceA>;
  using b_base_t = Kokkos::View<double*, Kokkos::LayoutRight, MemorySpaceB>;
  using a_char_t = Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceA>;
  using b_char_t = Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceB>;

  using policyA_t = Kokkos::RangePolicy<typename MemorySpaceA::execution_space>;
  using policyB_t = Kokkos::RangePolicy<typename MemorySpaceB::execution_space>;

  // Zero both round-trip destinations so a later compare proves the copy
  // actually happened.
  static void reset_a_copy_and_b(
      Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceA> a_char_copy,
      Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceB> b_char) {
    const int N = b_char.extent_int(0);
    Kokkos::parallel_for(
        "TestDeepCopy: FillA_copy", policyA_t(0, N),
        KOKKOS_LAMBDA(const int& i) { a_char_copy(i) = char(0); });
    Kokkos::parallel_for(
"TestDeepCopy: FillB", policyB_t(0, N), + KOKKOS_LAMBDA(const int& i) { b_char(i) = char(0); }); + } + + static int compare_equal( + Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceA> a_char_copy, + Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceA> a_char) { + const int N = a_char.extent_int(0); + int errors; + Kokkos::parallel_reduce( + "TestDeepCopy: FillA_copy", policyA_t(0, N), + KOKKOS_LAMBDA(const int& i, int& lsum) { + if (a_char_copy(i) != a_char(i)) lsum++; + }, + errors); + return errors; + } + + static void run_test(int num_bytes) { + a_base_t a_base("test_space_to_space", (num_bytes + 128) / 8); + a_base_t a_base_copy("test_space_to_space", (num_bytes + 128) / 8); + Kokkos::View<double*, Kokkos::LayoutRight, MemorySpaceB> b_base( + "test_space_to_space", (num_bytes + 128) / 8); + + Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceA> a_char( + (char*)a_base.data(), a_base.extent(0) * 8); + Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceA> a_char_copy( + (char*)a_base_copy.data(), a_base.extent(0) * 8); + Kokkos::View<char*, Kokkos::LayoutRight, MemorySpaceB> b_char( + (char*)b_base.data(), b_base.extent(0) * 8); + + Kokkos::parallel_for( + "TestDeepCopy: FillA", policyA_t(0, a_char.extent(0)), + KOKKOS_LAMBDA(const int& i) { + a_char(i) = static_cast<char>(i % 97) + 1; + }); + + reset_a_copy_and_b(a_char_copy, b_char); + + { + int check = compare_equal(a_char_copy, a_char); + ASSERT_EQ(check, a_char.extent(0)); + } + + // (a.data()%8, (a.data()+a.extent(0))%8, b.data()%8, + // (b.data()+b.extent(0))%8 (0,0,0,0) + { + int a_begin = 0; + int a_end = 0; + int b_begin = 0; + int b_end = 0; + auto a = Kokkos::subview( + a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end)); + auto b = Kokkos::subview( + b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end)); + auto a_copy = Kokkos::subview( + a_char_copy, + std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end)); + Kokkos::deep_copy(b, a); + 
      Kokkos::deep_copy(a_copy, b);
      int check = compare_equal(a_copy, a);
      ASSERT_EQ(check, 0);
    }

    // Aligned begins, ends trimmed by 5 bytes (unaligned ends).
    {
      int a_begin = 0;
      int a_end = 5;
      int b_begin = 0;
      int b_end = 5;
      auto a = Kokkos::subview(
          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
      auto b = Kokkos::subview(
          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
      auto a_copy = Kokkos::subview(
          a_char_copy,
          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
      Kokkos::deep_copy(b, a);
      Kokkos::deep_copy(a_copy, b);
      int check = compare_equal(a_copy, a);
      ASSERT_EQ(check, 0);
    }

    // Begins shifted by 3 (unaligned begins), aligned ends.
    {
      int a_begin = 3;
      int a_end = 0;
      int b_begin = 3;
      int b_end = 0;
      auto a = Kokkos::subview(
          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
      auto b = Kokkos::subview(
          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
      auto a_copy = Kokkos::subview(
          a_char_copy,
          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
      Kokkos::deep_copy(b, a);
      Kokkos::deep_copy(a_copy, b);
      int check = compare_equal(a_copy, a);
      ASSERT_EQ(check, 0);
    }

    // Both begins and ends unaligned, same offsets on source and destination.
    {
      int a_begin = 3;
      int a_end = 6;
      int b_begin = 3;
      int b_end = 6;
      auto a = Kokkos::subview(
          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
      auto b = Kokkos::subview(
          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
      auto a_copy = Kokkos::subview(
          a_char_copy,
          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
      Kokkos::deep_copy(b, a);
      Kokkos::deep_copy(a_copy, b);
      int check = compare_equal(a_copy, a);
      ASSERT_EQ(check, 0);
    }

    // Different offsets on source and destination (same span length).
    {
      int a_begin = 5;
      int a_end = 4;
      int b_begin = 3;
      int b_end = 6;
      auto a = Kokkos::subview(
          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
      auto b = Kokkos::subview(
          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
      auto a_copy = Kokkos::subview(
          a_char_copy,
          std::pair<int,
          int>(a_begin, a_char_copy.extent_int(0) - a_end));
      Kokkos::deep_copy(b, a);
      Kokkos::deep_copy(a_copy, b);
      int check = compare_equal(a_copy, a);
      ASSERT_EQ(check, 0);
    }

    // Aligned source begin, unaligned destination begin.
    {
      int a_begin = 0;
      int a_end = 8;
      int b_begin = 2;
      int b_end = 6;
      auto a = Kokkos::subview(
          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
      auto b = Kokkos::subview(
          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
      auto a_copy = Kokkos::subview(
          a_char_copy,
          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
      Kokkos::deep_copy(b, a);
      Kokkos::deep_copy(a_copy, b);
      int check = compare_equal(a_copy, a);
      ASSERT_EQ(check, 0);
    }

    // Unaligned source begin, aligned destination begin (mirror of above).
    {
      int a_begin = 2;
      int a_end = 6;
      int b_begin = 0;
      int b_end = 8;
      auto a = Kokkos::subview(
          a_char, std::pair<int, int>(a_begin, a_char.extent_int(0) - a_end));
      auto b = Kokkos::subview(
          b_char, std::pair<int, int>(b_begin, b_char.extent_int(0) - b_end));
      auto a_copy = Kokkos::subview(
          a_char_copy,
          std::pair<int, int>(a_begin, a_char_copy.extent_int(0) - a_end));
      Kokkos::deep_copy(b, a);
      Kokkos::deep_copy(a_copy, b);
      int check = compare_equal(a_copy, a);
      ASSERT_EQ(check, 0);
    }
  }
};
}  // namespace Impl

TEST(TEST_CATEGORY, deep_copy_alignment) {
  {
    Impl::TestDeepCopy<TEST_EXECSPACE::memory_space,
                       TEST_EXECSPACE::memory_space>::run_test(100000);
  }
  {
    Impl::TestDeepCopy<Kokkos::HostSpace,
                       TEST_EXECSPACE::memory_space>::run_test(100000);
  }
  {
    Impl::TestDeepCopy<TEST_EXECSPACE::memory_space,
                       Kokkos::HostSpace>::run_test(100000);
  }
}
#endif

namespace Impl {
// Verifies deep_copy with scalar conversion (Scalar2 -> Scalar1) across
// layout combinations, for 1-D and 2-D views.
template <class Scalar1, class Scalar2, class Layout1, class Layout2>
struct TestDeepCopyScalarConversion {
  struct TagFill {};
  struct TagCompare {};

  using view_type_s1_1d = Kokkos::View<Scalar1*, Layout1, TEST_EXECSPACE>;
  using view_type_s2_1d = Kokkos::View<Scalar2*, Layout2, TEST_EXECSPACE>;
  using view_type_s1_2d = Kokkos::View<Scalar1**, Layout1,
                                       TEST_EXECSPACE>;
  using view_type_s2_2d = Kokkos::View<Scalar2**, Layout2, TEST_EXECSPACE>;

  // LayoutStride views cannot be allocated directly; back them with a
  // concrete LayoutLeft allocation and construct strided views from it below.
  using base_layout1 = typename std::conditional<
      std::is_same<Layout1, Kokkos::LayoutStride>::value, Kokkos::LayoutLeft,
      Layout1>::type;
  using base_layout2 = typename std::conditional<
      std::is_same<Layout2, Kokkos::LayoutStride>::value, Kokkos::LayoutLeft,
      Layout2>::type;

  using base_type_s1_1d = Kokkos::View<Scalar1*, base_layout1, TEST_EXECSPACE>;
  using base_type_s2_1d = Kokkos::View<Scalar2*, base_layout2, TEST_EXECSPACE>;
  using base_type_s1_2d = Kokkos::View<Scalar1**, base_layout1, TEST_EXECSPACE>;
  using base_type_s2_2d = Kokkos::View<Scalar2**, base_layout2, TEST_EXECSPACE>;

  view_type_s1_1d view_s1_1d;
  view_type_s2_1d view_s2_1d;
  view_type_s1_2d view_s1_2d;
  view_type_s2_2d view_s2_2d;

  // Device-resident mismatch counter shared by the TagCompare kernel.
  Kokkos::View<int64_t, TEST_EXECSPACE> error_count;

  void create_views(int64_t N0, int64_t N1) {
    base_type_s1_1d b_s1_1d("TestDeepCopyConversion::b_s1_1d", N0);
    base_type_s2_1d b_s2_1d("TestDeepCopyConversion::b_s2_1d", N0);
    base_type_s1_2d b_s1_2d("TestDeepCopyConversion::b_s1_2d", N0, N1);
    base_type_s2_2d b_s2_2d("TestDeepCopyConversion::b_s2_2d", N0, N1);

    // Construct through (view, ALL...) so the Layout-typed members alias the
    // base allocations.
    view_s1_1d = view_type_s1_1d(b_s1_1d, Kokkos::ALL);
    view_s2_1d = view_type_s2_1d(b_s2_1d, Kokkos::ALL);
    view_s1_2d = view_type_s1_2d(b_s1_2d, Kokkos::ALL, Kokkos::ALL);
    view_s2_2d = view_type_s2_2d(b_s2_2d, Kokkos::ALL, Kokkos::ALL);

    error_count = Kokkos::View<int64_t, TEST_EXECSPACE>(
        "TestDeepCopyConversion::error_count");
  }

  // Write a position-dependent pattern into the Scalar2 (source) views.
  KOKKOS_FUNCTION
  void operator()(TagFill, const int64_t i) const {
    view_s2_1d(i) = static_cast<Scalar2>(i + 1);
    for (int64_t j = 0; j < static_cast<int64_t>(view_s2_2d.extent(1)); j++)
      view_s2_2d(i, j) = static_cast<Scalar2>((i + 1) * 1000 + j + 1);
  }

  // Check the Scalar1 (destination) views against the converted pattern,
  // accumulating mismatches into error_count.
  KOKKOS_FUNCTION
  void operator()(TagCompare, const int64_t i) const {
    int64_t errors = 0;
    if (view_s1_1d(i) != static_cast<Scalar1>(static_cast<Scalar2>(i + 1)))
      errors++;
    for (int64_t j = 0; j < static_cast<int64_t>(view_s1_2d.extent(1)); j++) {
      if (view_s1_2d(i, j) !=
          static_cast<Scalar1>(static_cast<Scalar2>((i + 1) * 1000 + j + 1)))
        errors++;
    }
    if (errors > 0) Kokkos::atomic_add(&error_count(), errors);
  }

  void run_tests(int64_t N0, int64_t N1) {
    create_views(N0, N1);

    Kokkos::parallel_for("TestDeepCopyConversion::Fill",
                         Kokkos::RangePolicy<TEST_EXECSPACE, TagFill,
                                             Kokkos::IndexType<int64_t>>(0, N0),
                         *this);

    // Converting deep_copy: Scalar2 source -> Scalar1 destination.
    Kokkos::deep_copy(view_s1_1d, view_s2_1d);
    Kokkos::deep_copy(view_s1_2d, view_s2_2d);

    Kokkos::parallel_for("TestDeepCopyConversion::Compare",
                         Kokkos::RangePolicy<TEST_EXECSPACE, TagCompare,
                                             Kokkos::IndexType<int64_t>>(0, N0),
                         *this);

    int64_t errors = 0;
    Kokkos::deep_copy(errors, error_count);
    ASSERT_TRUE(errors == 0);

    // Zero the destinations and re-compare: errors must now be detected,
    // proving the comparison kernel is actually sensitive to differences.
    Kokkos::deep_copy(view_s1_1d, static_cast<Scalar1>(0));
    Kokkos::deep_copy(view_s1_2d, static_cast<Scalar1>(0));

    Kokkos::parallel_for("TestDeepCopyConversion::Compare",
                         Kokkos::RangePolicy<TEST_EXECSPACE, TagCompare,
                                             Kokkos::IndexType<int64_t>>(0, N0),
                         *this);
    Kokkos::deep_copy(errors, error_count);
    ASSERT_TRUE(errors > 0);

    // Repeat with the execution-space-instance deep_copy overload.
    Kokkos::deep_copy(error_count, 0);
    Kokkos::deep_copy(TEST_EXECSPACE(), view_s1_1d, view_s2_1d);
    Kokkos::deep_copy(TEST_EXECSPACE(), view_s1_2d, view_s2_2d);

    Kokkos::parallel_for("TestDeepCopyConversion::Compare",
                         Kokkos::RangePolicy<TEST_EXECSPACE, TagCompare,
                                             Kokkos::IndexType<int64_t>>(0, N0),
                         *this);

    Kokkos::deep_copy(errors, error_count);
    ASSERT_TRUE(errors == 0);
  }
};
}  // namespace Impl

TEST(TEST_CATEGORY, deep_copy_conversion) {
  int64_t N0 = 19381;
  int64_t N1 = 17;

  using right = Kokkos::LayoutRight;
  using left = Kokkos::LayoutLeft;
  using stride = Kokkos::LayoutStride;

  // Same-scalar copies across layout combinations.
  Impl::TestDeepCopyScalarConversion<double, double, right, right>().run_tests(
      N0, N1);
  Impl::TestDeepCopyScalarConversion<double, double, right, left>().run_tests(
      N0, N1);
  Impl::TestDeepCopyScalarConversion<double, double, left, right>().run_tests(
      N0, N1);
  Impl::TestDeepCopyScalarConversion<double, double, stride, right>().run_tests(
      N0, N1);
  Impl::TestDeepCopyScalarConversion<double, double, right, stride>().run_tests(
      N0, N1);

  // Narrowing copies (double <- float) across the same layout combinations.
  Impl::TestDeepCopyScalarConversion<double, float, right, right>().run_tests(
      N0, N1);
  Impl::TestDeepCopyScalarConversion<double, float, right, left>().run_tests(
      N0, N1);
  Impl::TestDeepCopyScalarConversion<double, float, left, right>().run_tests(
      N0, N1);
  Impl::TestDeepCopyScalarConversion<double, float, stride, right>().run_tests(
      N0, N1);
  Impl::TestDeepCopyScalarConversion<double, float, right, stride>().run_tests(
      N0, N1);
}
}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a9263c8df5fcd1eab370837d9d2de92281e7aaa
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 3.0
//       Copyright (2020) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3.
Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <gtest/gtest.h>

#include <Kokkos_Core.hpp>

#ifdef KOKKOS_ENABLE_OPENMP
#include <omp.h>
#endif
#include <set>
#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)

namespace Test {

namespace Impl {

// Bookkeeping for the argv strings allocated by init_kokkos_args; the test
// bodies call cleanup_memory() after Kokkos::finalize().
std::set<char*> delete_these;
void cleanup_memory() {
  // NOTE(review): the set is not cleared after deleting, so a second call
  // within one process would double-delete — currently each TEST macro is
  // compiled into its own binary, but consider delete_these.clear() here.
  for (auto x : delete_these) {
    delete[] x;
  }
}

// Build a fake command line (--threads/--numa/--device-id/--kokkos-tune-
// internals plus dummy args) and record the expected values in init_args.
// Returns a heap-allocated argv array of `nargs` 45-char strings.
char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device,
                        bool do_other, bool do_tune, int& nargs,
                        Kokkos::InitArguments& init_args) {
  nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) +
          (do_other ? 4 : 0) + (do_tune ? 1 : 0);

  char** args_kokkos = new char*[nargs];
  for (int i = 0; i < nargs; i++) {
    args_kokkos[i] = new char[45];
    delete_these.insert(args_kokkos[i]);
  }

  // Argument slots depend on which options are enabled; the dummy args (if
  // any) occupy indices 0, threads_idx+1, threads_idx+2 and device_idx+1.
  int threads_idx = do_other ? 1 : 0;
  int numa_idx = (do_other ? 3 : 0) + (do_threads ?
                                       1 : 0);
  int device_idx =
      (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0);
  int tune_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) +
                 (do_device ? 1 : 0);

  if (do_threads) {
    int nthreads = 3;

    // Cap the requested thread count by what the runtime actually offers.
#ifdef KOKKOS_ENABLE_OPENMP
    if (omp_get_max_threads() < 3) nthreads = omp_get_max_threads();
#endif

    if (Kokkos::hwloc::available()) {
      if (Kokkos::hwloc::get_available_threads_per_core() < 3)
        nthreads = Kokkos::hwloc::get_available_threads_per_core() *
                   Kokkos::hwloc::get_available_numa_count();
    }

    // A Serial default (host) execution space always runs single-threaded.
#ifdef KOKKOS_ENABLE_SERIAL
    if (std::is_same<Kokkos::Serial, Kokkos::DefaultExecutionSpace>::value ||
        std::is_same<Kokkos::Serial,
                     Kokkos::DefaultHostExecutionSpace>::value) {
      nthreads = 1;
    }
#endif

    init_args.num_threads = nthreads;
    sprintf(args_kokkos[threads_idx], "--threads=%i", nthreads);
  }

  if (do_numa) {
    int numa = 1;
    if (Kokkos::hwloc::available()) {
      numa = Kokkos::hwloc::get_available_numa_count();
    }

#ifdef KOKKOS_ENABLE_SERIAL
    if (std::is_same<Kokkos::Serial, Kokkos::DefaultExecutionSpace>::value ||
        std::is_same<Kokkos::Serial,
                     Kokkos::DefaultHostExecutionSpace>::value) {
      numa = 1;
    }
#endif

    init_args.num_numa = numa;
    sprintf(args_kokkos[numa_idx], "--numa=%i", numa);
  }

  if (do_device) {
    init_args.device_id = 0;
    sprintf(args_kokkos[device_idx], "--device-id=%i", 0);
  }

  // Unrecognized arguments that Kokkos::initialize must tolerate and ignore.
  if (do_other) {
    sprintf(args_kokkos[0], "--dummyarg=1");
    sprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], "--dummy2arg");
    sprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], "dummy3arg");
    sprintf(args_kokkos[device_idx + (do_device ?
                                       1 : 0)], "dummy4arg=1");
  }

  if (do_tune) {
    init_args.tune_internals = true;
    sprintf(args_kokkos[tune_idx], "--kokkos-tune-internals");
  }

  return args_kokkos;
}

// Build the equivalent Kokkos::InitArguments struct directly (the struct-
// based initialize path), mirroring the logic of init_kokkos_args above.
Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa,
                                      bool do_device, bool do_tune) {
  Kokkos::InitArguments args;

  if (do_threads) {
    int nthreads = 3;

#ifdef KOKKOS_ENABLE_OPENMP
    if (omp_get_max_threads() < 3) {
      nthreads = omp_get_max_threads();
    }
#endif

    if (Kokkos::hwloc::available()) {
      if (Kokkos::hwloc::get_available_threads_per_core() < 3) {
        nthreads = Kokkos::hwloc::get_available_threads_per_core() *
                   Kokkos::hwloc::get_available_numa_count();
      }
    }

#ifdef KOKKOS_ENABLE_SERIAL
    if (std::is_same<Kokkos::Serial, Kokkos::DefaultExecutionSpace>::value ||
        std::is_same<Kokkos::Serial,
                     Kokkos::DefaultHostExecutionSpace>::value) {
      nthreads = 1;
    }
#endif

    args.num_threads = nthreads;
  }

  if (do_numa) {
    int numa = 1;
    if (Kokkos::hwloc::available()) {
      numa = Kokkos::hwloc::get_available_numa_count();
    }

#ifdef KOKKOS_ENABLE_SERIAL
    if (std::is_same<Kokkos::Serial, Kokkos::DefaultExecutionSpace>::value ||
        std::is_same<Kokkos::Serial,
                     Kokkos::DefaultHostExecutionSpace>::value) {
      numa = 1;
    }
#endif

    args.num_numa = numa;
  }

  if (do_device) {
    args.device_id = 0;
  }

  if (do_tune) {
    args.tune_internals = true;
  }

  return args;
}

// Verify that Kokkos initialized consistently with the requested arguments:
// execution spaces are up, thread count matches, and (for CUDA) the right
// device is selected.
void check_correct_initialization(const Kokkos::InitArguments& argstruct) {
  ASSERT_EQ(Kokkos::DefaultExecutionSpace::impl_is_initialized(), 1);
  ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_is_initialized(), 1);

  // Figure out the number of threads the HostSpace ExecutionSpace should have
  // initialized to.
  int expected_nthreads = argstruct.num_threads;

#ifdef KOKKOS_ENABLE_OPENMP
  if (std::is_same<Kokkos::HostSpace::execution_space, Kokkos::OpenMP>::value) {
    // use openmp default num threads
    if (expected_nthreads < 0 ||
        (expected_nthreads == 0 && !Kokkos::hwloc::available())) {
      expected_nthreads = omp_get_max_threads();
    }
    // use hwloc if available
    else if (expected_nthreads == 0 && Kokkos::hwloc::available()) {
      expected_nthreads = Kokkos::hwloc::get_available_numa_count() *
                          Kokkos::hwloc::get_available_cores_per_numa() *
                          Kokkos::hwloc::get_available_threads_per_core();
    }
  }
#endif

  // No explicit request: fall back to the full hwloc topology, or 1.
  if (expected_nthreads < 1) {
    if (Kokkos::hwloc::available()) {
      expected_nthreads = Kokkos::hwloc::get_available_numa_count() *
                          Kokkos::hwloc::get_available_cores_per_numa() *
                          Kokkos::hwloc::get_available_threads_per_core();
    } else {
      expected_nthreads = 1;
    }

#ifdef KOKKOS_ENABLE_SERIAL
    if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Serial>::value ||
        std::is_same<Kokkos::DefaultHostExecutionSpace,
                     Kokkos::Serial>::value) {
      expected_nthreads = 1;
    }
#endif

#ifdef KOKKOS_ENABLE_HPX
    // HPX uses all cores on machine by default. Skip this test.
    if (std::is_same<Kokkos::DefaultExecutionSpace,
                     Kokkos::Experimental::HPX>::value ||
        std::is_same<Kokkos::DefaultHostExecutionSpace,
                     Kokkos::Experimental::HPX>::value) {
      return;
    }
#endif
  }

  int expected_numa = argstruct.num_numa;

  // NOTE(review): expected_numa is computed below but never asserted
  // against the runtime — confirm whether a numa-count check was intended.
  if (expected_numa < 1) {
    if (Kokkos::hwloc::available()) {
      expected_numa = Kokkos::hwloc::get_available_numa_count();
    } else {
      expected_numa = 1;
    }

#ifdef KOKKOS_ENABLE_SERIAL
    if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Serial>::value ||
        std::is_same<Kokkos::DefaultHostExecutionSpace, Kokkos::Serial>::value)
      expected_numa = 1;
#endif
  }

  ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_thread_pool_size(),
            expected_nthreads);

#ifdef KOKKOS_ENABLE_CUDA
  if (std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Cuda>::value) {
    int device;
    cudaGetDevice(&device);

    // device_id < 0 means "let Kokkos pick"; query what it actually chose.
    int expected_device = argstruct.device_id;
    if (argstruct.device_id < 0) {
      expected_device = Kokkos::Cuda().cuda_device();
    }

    ASSERT_EQ(expected_device, device);
  }
#endif
  ASSERT_EQ(argstruct.tune_internals, Kokkos::tune_internals());
}

// TODO: Add check whether correct number of threads are actually started.
// Initialize with no arguments at all and expect pure defaults.
void test_no_arguments() {
  Kokkos::initialize();
  check_correct_initialization(Kokkos::InitArguments());
  Kokkos::finalize();
}

// Initialize from a synthetic command line and verify against the expected
// argument struct built alongside it.
void test_commandline_args(int nargs, char** args,
                           const Kokkos::InitArguments& argstruct) {
  Kokkos::initialize(nargs, args);
  check_correct_initialization(argstruct);
  Kokkos::finalize();
}

// Initialize from an InitArguments struct directly.
void test_initstruct_args(const Kokkos::InitArguments& args) {
  Kokkos::initialize(args);
  check_correct_initialization(args);
  Kokkos::finalize();
}

}  // namespace Impl

// Each TEST below is guarded by its own macro so that every initialization
// scenario runs in a separate test binary (Kokkos can only be initialized
// once per process in these paths).
#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01
TEST(defaultdevicetypeinit, no_args) { Impl::test_no_arguments(); }
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02
TEST(defaultdevicetypeinit, commandline_args_empty) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args = Impl::init_kokkos_args(false, false, false, false, false, nargs,
                                       argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);

  Impl::cleanup_memory();
  delete[] args;
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03
TEST(defaultdevicetypeinit, commandline_args_other) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args = Impl::init_kokkos_args(false, false, false, true, false, nargs,
                                       argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);

  Impl::cleanup_memory();
  delete[] args;
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04
TEST(defaultdevicetypeinit, commandline_args_nthreads) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args = Impl::init_kokkos_args(true, false, false, false, false, nargs,
                                       argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);

  Impl::cleanup_memory();
  delete[] args;
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05
TEST(defaultdevicetypeinit, commandline_args_nthreads_numa) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args =
      Impl::init_kokkos_args(true, true, false, false, false, nargs, argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);

  Impl::cleanup_memory();

  delete[] args;
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06
TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args =
      Impl::init_kokkos_args(true, true, true, false, false, nargs, argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);

  Impl::cleanup_memory();

  delete[] args;
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07
TEST(defaultdevicetypeinit, commandline_args_nthreads_device) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args =
      Impl::init_kokkos_args(true, false, true, false, false, nargs, argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);

  Impl::cleanup_memory();
  delete[] args;
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08
TEST(defaultdevicetypeinit, commandline_args_numa_device) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args =
      Impl::init_kokkos_args(false, true, true, false, false, nargs, argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);

  Impl::cleanup_memory();
  delete[] args;
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09
TEST(defaultdevicetypeinit, commandline_args_device) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args = Impl::init_kokkos_args(false, false, true, false, false, nargs,
                                       argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);

  Impl::cleanup_memory();
  delete[] args;
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args =
      Impl::init_kokkos_args(true, true, true, true, false, nargs, argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);
  Impl::cleanup_memory();
  delete[] args;
}
#endif

#ifdef
KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other_tune) {
  Kokkos::InitArguments argstruct;
  int nargs = 0;
  char** args =
      Impl::init_kokkos_args(true, true, true, true, true, nargs, argstruct);
  Impl::test_commandline_args(nargs, args, argstruct);
  Impl::cleanup_memory();
  delete[] args;
}
#endif

// Struct-based initialization variants (no command line involved).
#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
TEST(defaultdevicetypeinit, initstruct_default) {
  Kokkos::InitArguments args;
  Impl::test_initstruct_args(args);
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
TEST(defaultdevicetypeinit, initstruct_nthreads) {
  Kokkos::InitArguments args = Impl::init_initstruct(true, false, false, false);
  Impl::test_initstruct_args(args);
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
TEST(defaultdevicetypeinit, initstruct_nthreads_numa) {
  Kokkos::InitArguments args = Impl::init_initstruct(true, true, false, false);
  Impl::test_initstruct_args(args);
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
TEST(defaultdevicetypeinit, initstruct_device) {
  Kokkos::InitArguments args = Impl::init_initstruct(false, false, true, false);
  Impl::test_initstruct_args(args);
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
TEST(defaultdevicetypeinit, initstruct_nthreads_device) {
  Kokkos::InitArguments args = Impl::init_initstruct(true, false, true, false);
  Impl::test_initstruct_args(args);
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17
TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device) {
  Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, false);
  Impl::test_initstruct_args(args);
}
#endif

#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18
TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device_tune) {
  Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, true);
  Impl::test_initstruct_args(args);
}
#endif

}  // namespace Test

#endif
diff --git
a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d9e2486a4a9e2c6edcea39968d85e4773b38c484 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp @@ -0,0 +1,150 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef TEST_FUNCTOR_ANALYSIS_HPP
#define TEST_FUNCTOR_ANALYSIS_HPP

#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>

/*--------------------------------------------------------------------------*/

namespace Test {

// Reduction functor with an array value_type plus explicit join/init member
// functions — exercises FunctorAnalysis member-function detection.
struct TestFunctorAnalysis_03 {
  struct value_type {
    double x[2];
  };

  KOKKOS_INLINE_FUNCTION
  void operator()(int, value_type&) const {}

  KOKKOS_INLINE_FUNCTION
  void join(value_type volatile&, value_type const volatile&) const {}

  KOKKOS_INLINE_FUNCTION static void init(value_type&) {}
};

template <class ExecSpace>
void test_functor_analysis() {
  //------------------------------
  // Case 1: plain parallel_for lambda — no reduction value at all.
  auto c01 = KOKKOS_LAMBDA(int){};
  using A01 =
      Kokkos::Impl::FunctorAnalysis<Kokkos::Impl::FunctorPatternInterface::FOR,
                                    Kokkos::RangePolicy<ExecSpace>,
                                    decltype(c01)>;

  using R01 = typename A01::template Reducer<typename ExecSpace::memory_space>;

  static_assert(std::is_same<typename A01::value_type, void>::value, "");
  static_assert(std::is_same<typename A01::pointer_type, void>::value, "");
  static_assert(std::is_same<typename A01::reference_type, void>::value, "");
  static_assert(std::is_same<typename R01::functor_type, decltype(c01)>::value,
                "");

  static_assert(!A01::has_join_member_function, "");
static_assert(!A01::has_init_member_function, ""); + static_assert(!A01::has_final_member_function, ""); + static_assert(A01::StaticValueSize == 0, ""); + ASSERT_EQ(R01(&c01).length(), 0); + + //------------------------------ + auto c02 = KOKKOS_LAMBDA(int, double&){}; + using A02 = Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::REDUCE, + Kokkos::RangePolicy<ExecSpace>, decltype(c02)>; + using R02 = typename A02::template Reducer<typename ExecSpace::memory_space>; + + static_assert(std::is_same<typename A02::value_type, double>::value, ""); + static_assert(std::is_same<typename A02::pointer_type, double*>::value, ""); + static_assert(std::is_same<typename A02::reference_type, double&>::value, ""); + static_assert(std::is_same<typename R02::functor_type, decltype(c02)>::value, + ""); + + static_assert(!A02::has_join_member_function, ""); + static_assert(!A02::has_init_member_function, ""); + static_assert(!A02::has_final_member_function, ""); + static_assert(A02::StaticValueSize == sizeof(double), ""); + ASSERT_EQ(R02(&c02).length(), 1); + + //------------------------------ + + TestFunctorAnalysis_03 c03; + using A03 = Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::REDUCE, + Kokkos::RangePolicy<ExecSpace>, TestFunctorAnalysis_03>; + using R03 = typename A03::template Reducer<typename ExecSpace::memory_space>; + + static_assert(std::is_same<typename A03::value_type, + TestFunctorAnalysis_03::value_type>::value, + ""); + static_assert(std::is_same<typename A03::pointer_type, + TestFunctorAnalysis_03::value_type*>::value, + ""); + static_assert(std::is_same<typename A03::reference_type, + TestFunctorAnalysis_03::value_type&>::value, + ""); + static_assert( + std::is_same<typename R03::functor_type, TestFunctorAnalysis_03>::value, + ""); + + static_assert(A03::has_join_member_function, ""); + static_assert(A03::has_init_member_function, ""); + static_assert(!A03::has_final_member_function, ""); + static_assert( + 
A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), ""); + ASSERT_EQ(R03(&c03).length(), 1); + + //------------------------------ +} + +TEST(TEST_CATEGORY, functor_analysis) { + test_functor_analysis<TEST_EXECSPACE>(); +} + +} // namespace Test + +/*--------------------------------------------------------------------------*/ + +#endif /* #ifndef TEST_FUNCTOR_ANALYSIS_HPP */ diff --git a/packages/kokkos/core/unit_test/TestGraph.hpp b/packages/kokkos/core/unit_test/TestGraph.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d012da47be08e95c012d9c02e467da7420fc42ad --- /dev/null +++ b/packages/kokkos/core/unit_test/TestGraph.hpp @@ -0,0 +1,253 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
#include <Kokkos_Core.hpp>
#include <Kokkos_Graph.hpp>

#include <gtest/gtest.h>

namespace Test {

// Graph-node body: atomically flags a bug if the current count is outside
// [expected_count_min, expected_count_max], then atomically increments the
// count. Used to verify how many graph nodes actually executed.
template <class ExecSpace>
struct CountTestFunctor {
  using value_type = int;
  template <class T>
  using atomic_view =
      Kokkos::View<T, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>>;
  atomic_view<int> count;
  atomic_view<int> bugs;
  int expected_count_min;
  int expected_count_max;

  // Variadic call operator so the same functor works under range, mdrange,
  // and team policies alike (all policy-provided arguments are ignored).
  template <class... Ts>
  KOKKOS_FUNCTION void operator()(Ts&&...) const noexcept {
    bugs() += int(count() > expected_count_max || count() < expected_count_min);
    count()++;
  }
};

// Graph-node body that stores `value` into the rank-0 view `v`; all
// policy-provided arguments are ignored.
template <class ExecSpace, class T>
struct SetViewToValueFunctor {
  using value_type = T;
  using view_type =
      Kokkos::View<T, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>>;
  view_type v;
  T value;

  template <class... Ts>
  KOKKOS_FUNCTION void operator()(Ts&&...) const noexcept {
    v() = value;
  }
};

// Reduction body: adds the current value of `v` into the reduction result
// for every iteration.
template <class ExecSpace, class T>
struct SetResultToViewFunctor {
  using value_type = T;
  using view_type =
      Kokkos::View<T, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>>;
  view_type v;

  template <class U>
  KOKKOS_FUNCTION void operator()(U&&, value_type& val) const noexcept {
    val += v();
  }
};

// Fixture providing zero-initialized atomic `count`/`bugs` device views plus
// host mirrors and an execution-space instance. TEST_CATEGORY_FIXTURE is a
// macro supplied by the per-backend test harness (not defined in this file —
// presumably it mangles the fixture name per backend; confirm in harness).
struct TEST_CATEGORY_FIXTURE(count_bugs) : public ::testing::Test {
 public:
  using count_functor      = CountTestFunctor<TEST_EXECSPACE>;
  using set_functor        = SetViewToValueFunctor<TEST_EXECSPACE, int>;
  using set_result_functor = SetResultToViewFunctor<TEST_EXECSPACE, int>;
  using view_type          = Kokkos::View<int, TEST_EXECSPACE>;
  using atomic_view_type   = typename count_functor::template atomic_view<int>;
  using view_host          = Kokkos::View<int, Kokkos::HostSpace>;
  atomic_view_type count{"count"};
  atomic_view_type bugs{"bugs"};
  view_host count_host{"count_host"};
  view_host bugs_host{"bugs_host"};
  TEST_EXECSPACE ex{};

 protected:
  // Reset both counters (asynchronously on `ex`) before each test, then
  // fence so the test body starts from a known state.
  void SetUp() override {
    Kokkos::deep_copy(ex, count, 0);
    Kokkos::deep_copy(ex, bugs, 0);
    ex.fence();
  }
};

// One graph node executing one iteration: count must end up exactly 1 and no
// out-of-range execution may be observed.
TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one) {
  auto graph =
      Kokkos::Experimental::create_graph<TEST_EXECSPACE>([&](auto root) {
        root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
      });
  graph.submit();
  Kokkos::deep_copy(graph.get_execution_space(), count_host, count);
  Kokkos::deep_copy(graph.get_execution_space(), bugs_host, bugs);
  graph.get_execution_space().fence();
  ASSERT_EQ(1, count_host());
  ASSERT_EQ(0, bugs_host());
}

// Same as launch_one, but the graph is created and submitted as a temporary
// (rvalue) in a single expression.
TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_one_rvalue) {
  Kokkos::Experimental::create_graph(ex, [&](auto root) {
    root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
  }).submit();
  Kokkos::deep_copy(ex, count_host, count);
  Kokkos::deep_copy(ex, bugs_host, bugs);
  ex.fence();
  ASSERT_EQ(1, count_host());
  ASSERT_EQ(0, bugs_host());
}
// Six leaf nodes (plain count, RangePolicy, MDRangePolicy, TeamPolicy, and a
// 2-iteration range) all depending on a common `ready` node that runs after
// both setup nodes; total increments must be exactly 6.
TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), launch_six) {
  auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) {
    auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0});
    auto f_setup_bugs  = root.then_parallel_for(1, set_functor{bugs, 0});

    //----------------------------------------
    auto ready = Kokkos::Experimental::when_all(f_setup_count, f_setup_bugs);

    //----------------------------------------
    ready.then_parallel_for(1, count_functor{count, bugs, 0, 6});
    //----------------------------------------
    ready.then_parallel_for(Kokkos::RangePolicy<TEST_EXECSPACE>{0, 1},
                            count_functor{count, bugs, 0, 6});
    //----------------------------------------
    ready.then_parallel_for(
        Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{{0, 0}, {1, 1}},
        count_functor{count, bugs, 0, 6});
    //----------------------------------------
    ready.then_parallel_for(Kokkos::TeamPolicy<TEST_EXECSPACE>{1, 1},
                            count_functor{count, bugs, 0, 6});
    //----------------------------------------
    ready.then_parallel_for(2, count_functor{count, bugs, 0, 6});
    //----------------------------------------
  });
  graph.submit();
  Kokkos::deep_copy(ex, count_host, count);
  Kokkos::deep_copy(ex, bugs_host, bugs);
  ex.fence();

  ASSERT_EQ(6, count_host());
  ASSERT_EQ(0, bugs_host());
}

// when_all with redundant dependencies (f1 and f3 are already transitive
// predecessors of f4). Expected end state: count = 0+1+5+1 = 7, and the
// final 6-iteration reduce sums count (7) six times -> 42.
TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), when_all_cycle) {
  view_type reduction_out{"reduction_out"};
  view_host reduction_host{"reduction_host"};
  Kokkos::Experimental::create_graph(ex, [&](auto root) {
    //----------------------------------------
    // Test when_all when redundant dependencies are given
    auto f1 = root.then_parallel_for(1, set_functor{count, 0});
    auto f2 = f1.then_parallel_for(1, count_functor{count, bugs, 0, 0});
    auto f3 = f2.then_parallel_for(5, count_functor{count, bugs, 1, 5});
    auto f4 = Kokkos::Experimental::when_all(f2, f3).then_parallel_for(
        1, count_functor{count, bugs, 6, 6});
    Kokkos::Experimental::when_all(f1, f4, f3)
        .then_parallel_reduce(6, set_result_functor{count}, reduction_out);
    //----------------------------------------
  }).submit();
  Kokkos::deep_copy(ex, bugs_host, bugs);
  Kokkos::deep_copy(ex, count_host, count);
  Kokkos::deep_copy(ex, reduction_host, reduction_out);
  ex.fence();
  ASSERT_EQ(0, bugs_host());
  ASSERT_EQ(7, count_host());
  ASSERT_EQ(42, reduction_host());
  //----------------------------------------
}

// This test is disabled because we don't currently support copying to host,
// even asynchronously. We _may_ want to do that eventually?
// Submitting the same chained graph repeatedly must produce the same result
// each time (count re-set to 0 by the first node on every submit).
TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), DISABLED_repeat_chain) {
  auto graph = Kokkos::Experimental::create_graph(
      ex, [&, count_host = count_host](auto root) {
        //----------------------------------------
        root.then_parallel_for(1, set_functor{count, 0})
            .then_parallel_for(1, count_functor{count, bugs, 0, 0})
            .then_parallel_for(1, count_functor{count, bugs, 1, 1})
            .then_parallel_reduce(1, set_result_functor{count}, count_host)
            .then_parallel_reduce(
                1, set_result_functor{bugs},
                Kokkos::Sum<int, Kokkos::HostSpace>{bugs_host});
        //----------------------------------------
      });

  //----------------------------------------
  constexpr int repeats = 10;

  for (int i = 0; i < repeats; ++i) {
    graph.submit();
    ex.fence();
    EXPECT_EQ(2, count_host());
    EXPECT_EQ(0, bugs_host());
  }
  //----------------------------------------
}

// A parallel_reduce over zero iterations must still write the reduction
// identity (0) into its result view (`count`) on every submit — the final
// assert checks that the deep_copy'd 1 was overwritten back to 0.
TEST_F(TEST_CATEGORY_FIXTURE(count_bugs), zero_work_reduce) {
  auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) {
    root.then_parallel_reduce(0, set_result_functor{bugs}, count);
  });
// These fences are only necessary because of the weirdness of how CUDA
// UVM works on pre pascal cards.
#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \
    (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL))
  Kokkos::fence();
#endif
  graph.submit();
  Kokkos::deep_copy(ex, count, 1);
// These fences are only necessary because of the weirdness of how CUDA
// UVM works on pre pascal cards.
#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \
    (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL))
  Kokkos::fence();
#endif
  // NOTE(review): "should reset to 0, but doesn't" presumably refers to the
  // zero-work functor body never running, while the reduce still writes the
  // identity — confirm against the graph implementation.
  graph.submit();  // should reset to 0, but doesn't
  Kokkos::deep_copy(ex, count_host, count);
  ex.fence();
  ASSERT_EQ(count_host(), 0);
}

}  // end namespace Test
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <iostream> + +#include <Kokkos_hwloc.hpp> + +namespace Test { + +TEST(hwloc, query) { + std::cout << " NUMA[" << Kokkos::hwloc::get_available_numa_count() << "]" + << " CORE[" << Kokkos::hwloc::get_available_cores_per_numa() << "]" + << " PU[" << Kokkos::hwloc::get_available_threads_per_core() << "]" + << std::endl; +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestHalfConversion.hpp b/packages/kokkos/core/unit_test/TestHalfConversion.hpp new file mode 100644 index 0000000000000000000000000000000000000000..277fb1b04234e58b0fe3d1639e48fcd1dc51ff86 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestHalfConversion.hpp @@ -0,0 +1,91 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TESTHALFCONVERSION_HPP_ +#define TESTHALFCONVERSION_HPP_ +namespace Test { + +template <class T> +void test_half_conversion_type() { + double epsilon = KOKKOS_HALF_T_IS_FLOAT ? 0.0000003 : 0.0003; + T base = static_cast<T>(3.3); + Kokkos::Experimental::half_t a = Kokkos::Experimental::cast_to_half(base); + T b = Kokkos::Experimental::cast_from_half<T>(a); + ASSERT_TRUE((double(b - base) / double(base)) < epsilon); + +// TODO: Remove ifndef once https://github.com/kokkos/kokkos/pull/3480 merges +#ifndef KOKKOS_ENABLE_SYCL +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + Kokkos::View<T> b_v("b_v"); + Kokkos::parallel_for( + "TestHalfConversion", 1, KOKKOS_LAMBDA(int) { + Kokkos::Experimental::half_t d_a = + Kokkos::Experimental::cast_to_half(base); + b_v() = Kokkos::Experimental::cast_from_half<T>(d_a); + }); + + Kokkos::deep_copy(b, b_v); + ASSERT_TRUE((double(b - base) / double(base)) < epsilon); +#endif // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA +#endif // KOKKOS_ENABLE_SYCL +} + +void test_half_conversion() { + test_half_conversion_type<float>(); + test_half_conversion_type<double>(); + test_half_conversion_type<short>(); + test_half_conversion_type<int>(); + test_half_conversion_type<long>(); + test_half_conversion_type<long long>(); + test_half_conversion_type<unsigned short>(); + test_half_conversion_type<unsigned int>(); + test_half_conversion_type<unsigned long>(); + test_half_conversion_type<unsigned long long>(); +} + +TEST(TEST_CATEGORY, half_conversion) { test_half_conversion(); } + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestHalfOperators.hpp b/packages/kokkos/core/unit_test/TestHalfOperators.hpp new file mode 100644 index 0000000000000000000000000000000000000000..db52a05d5d36d5919e101f60dd7652c92771c885 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestHalfOperators.hpp @@ -0,0 +1,879 @@ + 
+/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TESTHALFOPERATOR_HPP_ +#define TESTHALFOPERATOR_HPP_ +// TODO: Remove ifndef once https://github.com/kokkos/kokkos/pull/3480 merges +#ifndef KOKKOS_ENABLE_SYCL +namespace Test { +#define FP16_EPSILON 0.0009765625F +using namespace Kokkos::Experimental; +using ExecutionSpace = TEST_EXECSPACE; +using ScalarType = double; +using ViewType = Kokkos::View<ScalarType*, ExecutionSpace>; +using ViewTypeHost = Kokkos::View<ScalarType*, Kokkos::HostSpace>; +KOKKOS_FUNCTION +const half_t& accept_ref(const half_t& a) { return a; } + +enum OP_TESTS { + ASSIGN, + ASSIGN_CHAINED, + UNA, + UNS, + PREFIX_INC, + PREFIX_DEC, + POSTFIX_INC, + POSTFIX_DEC, + CADD_H_H, + CADD_H_S, + CADD_S_H, + CADD_H_D, + CADD_D_H, + CSUB_H_H, + CSUB_H_S, + CSUB_S_H, + CSUB_H_D, + CSUB_D_H, + CMUL_H_H, + CMUL_H_S, + CMUL_S_H, + CMUL_H_D, + CMUL_D_H, + CDIV_H_H, + CDIV_H_S, + CDIV_S_H, + CDIV_H_D, + CDIV_D_H, + ADD_H_H, + ADD_H_S, + ADD_S_H, + ADD_H_D, + ADD_D_H, + ADD_H_H_SZ, + ADD_H_S_SZ, + ADD_S_H_SZ, + ADD_H_D_SZ, + ADD_D_H_SZ, + ADD_SI_H, + ADD_SI_H_SZ, + ADD_I_H, + ADD_I_H_SZ, + ADD_LI_H, + ADD_LI_H_SZ, + ADD_LLI_H, + ADD_LLI_H_SZ, + ADD_USI_H, + ADD_USI_H_SZ, + ADD_UI_H, + ADD_UI_H_SZ, + ADD_ULI_H, + ADD_ULI_H_SZ, + ADD_ULLI_H, + ADD_ULLI_H_SZ, + ADD_H_SI, + ADD_H_SI_SZ, + ADD_H_I, + ADD_H_I_SZ, + ADD_H_LI, + ADD_H_LI_SZ, + ADD_H_LLI, + ADD_H_LLI_SZ, + ADD_H_USI, + ADD_H_USI_SZ, + ADD_H_UI, + ADD_H_UI_SZ, + ADD_H_ULI, + ADD_H_ULI_SZ, + ADD_H_ULLI, + ADD_H_ULLI_SZ, + SUB_H_H, + SUB_H_S, + SUB_S_H, + SUB_H_D, + SUB_D_H, + SUB_H_H_SZ, + SUB_H_S_SZ, + SUB_S_H_SZ, + SUB_H_D_SZ, + SUB_D_H_SZ, + SUB_SI_H, + SUB_SI_H_SZ, + SUB_I_H, + SUB_I_H_SZ, + SUB_LI_H, + SUB_LI_H_SZ, + SUB_LLI_H, + SUB_LLI_H_SZ, + SUB_USI_H, + SUB_USI_H_SZ, + SUB_UI_H, + SUB_UI_H_SZ, + SUB_ULI_H, + SUB_ULI_H_SZ, + SUB_ULLI_H, + SUB_ULLI_H_SZ, + SUB_H_SI, + SUB_H_SI_SZ, + SUB_H_I, + SUB_H_I_SZ, + 
SUB_H_LI, + SUB_H_LI_SZ, + SUB_H_LLI, + SUB_H_LLI_SZ, + SUB_H_USI, + SUB_H_USI_SZ, + SUB_H_UI, + SUB_H_UI_SZ, + SUB_H_ULI, + SUB_H_ULI_SZ, + SUB_H_ULLI, + SUB_H_ULLI_SZ, + MUL_H_H, + MUL_H_S, + MUL_S_H, + MUL_H_D, + MUL_D_H, + MUL_H_H_SZ, + MUL_H_S_SZ, + MUL_S_H_SZ, + MUL_H_D_SZ, + MUL_D_H_SZ, + MUL_SI_H, + MUL_SI_H_SZ, + MUL_I_H, + MUL_I_H_SZ, + MUL_LI_H, + MUL_LI_H_SZ, + MUL_LLI_H, + MUL_LLI_H_SZ, + MUL_USI_H, + MUL_USI_H_SZ, + MUL_UI_H, + MUL_UI_H_SZ, + MUL_ULI_H, + MUL_ULI_H_SZ, + MUL_ULLI_H, + MUL_ULLI_H_SZ, + MUL_H_SI, + MUL_H_SI_SZ, + MUL_H_I, + MUL_H_I_SZ, + MUL_H_LI, + MUL_H_LI_SZ, + MUL_H_LLI, + MUL_H_LLI_SZ, + MUL_H_USI, + MUL_H_USI_SZ, + MUL_H_UI, + MUL_H_UI_SZ, + MUL_H_ULI, + MUL_H_ULI_SZ, + MUL_H_ULLI, + MUL_H_ULLI_SZ, + DIV_H_H, + DIV_H_S, + DIV_S_H, + DIV_H_D, + DIV_D_H, + DIV_H_H_SZ, + DIV_H_S_SZ, + DIV_S_H_SZ, + DIV_H_D_SZ, + DIV_D_H_SZ, + DIV_SI_H, + DIV_SI_H_SZ, + DIV_I_H, + DIV_I_H_SZ, + DIV_LI_H, + DIV_LI_H_SZ, + DIV_LLI_H, + DIV_LLI_H_SZ, + DIV_USI_H, + DIV_USI_H_SZ, + DIV_UI_H, + DIV_UI_H_SZ, + DIV_ULI_H, + DIV_ULI_H_SZ, + DIV_ULLI_H, + DIV_ULLI_H_SZ, + DIV_H_SI, + DIV_H_SI_SZ, + DIV_H_I, + DIV_H_I_SZ, + DIV_H_LI, + DIV_H_LI_SZ, + DIV_H_LLI, + DIV_H_LLI_SZ, + DIV_H_USI, + DIV_H_USI_SZ, + DIV_H_UI, + DIV_H_UI_SZ, + DIV_H_ULI, + DIV_H_ULI_SZ, + DIV_H_ULLI, + DIV_H_ULLI_SZ, + NEG, + AND, + OR, + EQ, + NEQ, + LT, + GT, + LE, + GE, // TODO: TW, + PASS_BY_REF, + AO_IMPL_HALF, + AO_HALF_T, + N_OP_TESTS +}; + +template <class view_type> +struct Functor_TestHalfOperators { + half_t h_lhs, h_rhs; + double d_lhs, d_rhs; + view_type actual_lhs, expected_lhs; + + Functor_TestHalfOperators(half_t lhs = half_t(0), half_t rhs = half_t(0)) + : h_lhs(lhs), h_rhs(rhs) { + actual_lhs = view_type("actual_lhs", N_OP_TESTS); + expected_lhs = view_type("expected_lhs", N_OP_TESTS); + d_lhs = cast_from_half<double>(h_lhs); + d_rhs = cast_from_half<double>(h_rhs); + + if (std::is_same<view_type, ViewTypeHost>::value) { + auto run_on_host = *this; + run_on_host(0); + } 
else { + Kokkos::parallel_for("Test::Functor_TestHalfOperators", + Kokkos::RangePolicy<ExecutionSpace>(0, 1), *this); + } + } + + // BEGIN: Binary Arithmetic test helpers + template <class LhsType, class RhsType, class ExpectedResultType> + KOKKOS_INLINE_FUNCTION void test_add(int op_test_idx, + int op_test_sz_idx) const { + auto sum = static_cast<LhsType>(h_lhs) + static_cast<RhsType>(h_rhs); + actual_lhs(op_test_idx) = static_cast<double>(sum); + + if (std::is_same<RhsType, half_t>::value && + std::is_same<LhsType, half_t>::value) { + expected_lhs(op_test_idx) = d_lhs + d_rhs; + } else { + if (std::is_same<LhsType, half_t>::value) + expected_lhs(op_test_idx) = d_lhs + static_cast<RhsType>(d_rhs); + if (std::is_same<RhsType, half_t>::value) + expected_lhs(op_test_idx) = static_cast<LhsType>(d_lhs) + d_rhs; + } + + actual_lhs(op_test_sz_idx) = sizeof(sum); + expected_lhs(op_test_sz_idx) = sizeof(ExpectedResultType); + } + + template <class LhsType, class RhsType, class ExpectedResultType> + KOKKOS_INLINE_FUNCTION void test_sub(int op_test_idx, + int op_test_sz_idx) const { + auto result = static_cast<LhsType>(h_lhs) - static_cast<RhsType>(h_rhs); + actual_lhs(op_test_idx) = static_cast<double>(result); + + if (std::is_same<RhsType, half_t>::value && + std::is_same<LhsType, half_t>::value) { + expected_lhs(op_test_idx) = d_lhs - d_rhs; + } else { + if (std::is_same<LhsType, half_t>::value) + expected_lhs(op_test_idx) = d_lhs - static_cast<RhsType>(d_rhs); + if (std::is_same<RhsType, half_t>::value) + expected_lhs(op_test_idx) = static_cast<LhsType>(d_lhs) - d_rhs; + } + + actual_lhs(op_test_sz_idx) = sizeof(result); + expected_lhs(op_test_sz_idx) = sizeof(ExpectedResultType); + } + + template <class LhsType, class RhsType, class ExpectedResultType> + KOKKOS_INLINE_FUNCTION void test_mul(int op_test_idx, + int op_test_sz_idx) const { + auto result = static_cast<LhsType>(h_lhs) * static_cast<RhsType>(h_rhs); + actual_lhs(op_test_idx) = static_cast<double>(result); + + 
if (std::is_same<RhsType, half_t>::value && + std::is_same<LhsType, half_t>::value) { + expected_lhs(op_test_idx) = d_lhs * d_rhs; + } else { + if (std::is_same<LhsType, half_t>::value) + expected_lhs(op_test_idx) = d_lhs * static_cast<RhsType>(d_rhs); + if (std::is_same<RhsType, half_t>::value) + expected_lhs(op_test_idx) = static_cast<LhsType>(d_lhs) * d_rhs; + } + + actual_lhs(op_test_sz_idx) = sizeof(result); + expected_lhs(op_test_sz_idx) = sizeof(ExpectedResultType); + } + + template <class LhsType, class RhsType, class ExpectedResultType> + KOKKOS_INLINE_FUNCTION void test_div(int op_test_idx, + int op_test_sz_idx) const { + auto result = static_cast<LhsType>(h_lhs) / static_cast<RhsType>(h_rhs); + actual_lhs(op_test_idx) = static_cast<double>(result); + + if (std::is_same<RhsType, half_t>::value && + std::is_same<LhsType, half_t>::value) { + expected_lhs(op_test_idx) = d_lhs / d_rhs; + } else { + if (std::is_same<LhsType, half_t>::value) + expected_lhs(op_test_idx) = d_lhs / static_cast<RhsType>(d_rhs); + if (std::is_same<RhsType, half_t>::value) + expected_lhs(op_test_idx) = static_cast<LhsType>(d_lhs) / d_rhs; + } + + actual_lhs(op_test_sz_idx) = sizeof(result); + expected_lhs(op_test_sz_idx) = sizeof(ExpectedResultType); + } + // END: Binary Arithmetic test helpers + + KOKKOS_FUNCTION + void operator()(int) const { + half_t tmp_lhs, tmp2_lhs, *tmp_ptr; + double tmp_d_lhs; + float tmp_s_lhs; + using half_impl_type = Kokkos::Impl::half_impl_t::type; + half_impl_type half_tmp; + + // Initialze output views to catch missing test invocations + for (int i = 0; i < N_OP_TESTS; ++i) { + actual_lhs(i) = 1; + expected_lhs(i) = -1; + } + + tmp_lhs = h_lhs; + actual_lhs(ASSIGN) = cast_from_half<double>(tmp_lhs); + expected_lhs(ASSIGN) = d_lhs; + + tmp_lhs = 0; + tmp2_lhs = tmp_lhs = h_lhs; + actual_lhs(ASSIGN_CHAINED) = cast_from_half<double>(tmp2_lhs); + expected_lhs(ASSIGN_CHAINED) = d_lhs; + + actual_lhs(UNA) = cast_from_half<double>(+h_lhs); + expected_lhs(UNA) 
= +d_lhs; + + actual_lhs(UNS) = cast_from_half<double>(-h_lhs); + expected_lhs(UNS) = -d_lhs; + + tmp_lhs = h_lhs; + tmp_d_lhs = d_lhs; + actual_lhs(PREFIX_INC) = cast_from_half<double>(++tmp_lhs); + expected_lhs(PREFIX_INC) = ++tmp_d_lhs; + + actual_lhs(PREFIX_DEC) = cast_from_half<double>(--tmp_lhs); + expected_lhs(PREFIX_DEC) = --tmp_d_lhs; + + // if (h_lhs != tmp_lhs) { + // printf("tmp_lhs = %f, h_lhs = %f\n", __half2float(tmp_lhs), + // __half2float(h_lhs)); Kokkos::abort("Error in half_t prefix operators"); + //} + + actual_lhs(POSTFIX_INC) = cast_from_half<double>(tmp_lhs++); + expected_lhs(POSTFIX_INC) = tmp_d_lhs++; + + actual_lhs(POSTFIX_DEC) = cast_from_half<double>(tmp_lhs--); + expected_lhs(POSTFIX_DEC) = tmp_d_lhs--; + + // if (h_lhs != tmp_lhs) { + // printf("tmp_lhs = %f, h_lhs = %f\n", __half2float(tmp_lhs), + // __half2float(h_lhs)); Kokkos::abort("Error in half_t postfix + // operators"); + //} + + tmp_lhs = h_lhs; + tmp_lhs += h_rhs; + actual_lhs(CADD_H_H) = cast_from_half<double>(tmp_lhs); + expected_lhs(CADD_H_H) = d_lhs; + expected_lhs(CADD_H_H) += d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs += static_cast<float>(d_rhs); + actual_lhs(CADD_H_S) = cast_from_half<double>(tmp_lhs); + expected_lhs(CADD_H_S) = d_lhs; + expected_lhs(CADD_H_S) += d_rhs; + + tmp_s_lhs = static_cast<float>(h_lhs); + tmp_s_lhs += h_rhs; + actual_lhs(CADD_S_H) = static_cast<double>(tmp_s_lhs); + expected_lhs(CADD_S_H) = d_lhs; + expected_lhs(CADD_S_H) += d_rhs; + + tmp_lhs = static_cast<double>(h_lhs); + tmp_lhs += static_cast<double>(d_rhs); + actual_lhs(CADD_H_D) = cast_from_half<double>(tmp_lhs); + expected_lhs(CADD_H_D) = d_lhs; + expected_lhs(CADD_H_D) += d_rhs; + + tmp_d_lhs = static_cast<double>(h_lhs); + tmp_d_lhs += h_rhs; + actual_lhs(CADD_D_H) = static_cast<double>(tmp_d_lhs); + expected_lhs(CADD_D_H) = d_lhs; + expected_lhs(CADD_D_H) += d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs -= h_rhs; + actual_lhs(CSUB_H_H) = cast_from_half<double>(tmp_lhs); + 
expected_lhs(CSUB_H_H) = d_lhs; + expected_lhs(CSUB_H_H) -= d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs -= static_cast<float>(d_rhs); + actual_lhs(CSUB_H_S) = cast_from_half<double>(tmp_lhs); + expected_lhs(CSUB_H_S) = d_lhs; + expected_lhs(CSUB_H_S) -= d_rhs; + + tmp_s_lhs = static_cast<float>(h_lhs); + tmp_s_lhs -= h_rhs; + actual_lhs(CSUB_S_H) = static_cast<double>(tmp_s_lhs); + expected_lhs(CSUB_S_H) = d_lhs; + expected_lhs(CSUB_S_H) -= d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs -= d_rhs; + actual_lhs(CSUB_H_D) = static_cast<double>(tmp_lhs); + expected_lhs(CSUB_H_D) = d_lhs; + expected_lhs(CSUB_H_D) -= d_rhs; + + tmp_d_lhs = static_cast<double>(h_lhs); + tmp_d_lhs -= h_rhs; + actual_lhs(CSUB_D_H) = tmp_d_lhs; + expected_lhs(CSUB_D_H) = d_lhs; + expected_lhs(CSUB_D_H) -= d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs *= h_rhs; + actual_lhs(CMUL_H_H) = cast_from_half<double>(tmp_lhs); + expected_lhs(CMUL_H_H) = d_lhs; + expected_lhs(CMUL_H_H) *= d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs *= static_cast<float>(d_rhs); + actual_lhs(CMUL_H_S) = cast_from_half<double>(tmp_lhs); + expected_lhs(CMUL_H_S) = d_lhs; + expected_lhs(CMUL_H_S) *= d_rhs; + + tmp_s_lhs = static_cast<float>(h_lhs); + tmp_s_lhs *= h_rhs; + actual_lhs(CMUL_S_H) = static_cast<double>(tmp_s_lhs); + expected_lhs(CMUL_S_H) = d_lhs; + expected_lhs(CMUL_S_H) *= d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs *= d_rhs; + actual_lhs(CMUL_H_D) = static_cast<double>(tmp_lhs); + expected_lhs(CMUL_H_D) = d_lhs; + expected_lhs(CMUL_H_D) *= d_rhs; + + tmp_d_lhs = static_cast<double>(h_lhs); + tmp_d_lhs *= h_rhs; + actual_lhs(CMUL_D_H) = tmp_d_lhs; + expected_lhs(CMUL_D_H) = d_lhs; + expected_lhs(CMUL_D_H) *= d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs /= h_rhs; + actual_lhs(CDIV_H_H) = cast_from_half<double>(tmp_lhs); + expected_lhs(CDIV_H_H) = d_lhs; + expected_lhs(CDIV_H_H) /= d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs /= static_cast<float>(d_rhs); + actual_lhs(CDIV_H_S) = cast_from_half<double>(tmp_lhs); + expected_lhs(CDIV_H_S) = d_lhs; + 
expected_lhs(CDIV_H_S) /= d_rhs; + + tmp_s_lhs = static_cast<float>(h_lhs); + tmp_s_lhs /= h_rhs; + actual_lhs(CDIV_S_H) = static_cast<double>(tmp_s_lhs); + expected_lhs(CDIV_S_H) = d_lhs; + expected_lhs(CDIV_S_H) /= d_rhs; + + tmp_lhs = h_lhs; + tmp_lhs /= d_rhs; + actual_lhs(CDIV_H_D) = static_cast<double>(tmp_lhs); + expected_lhs(CDIV_H_D) = d_lhs; + expected_lhs(CDIV_H_D) /= d_rhs; + + tmp_d_lhs = static_cast<double>(h_lhs); + tmp_d_lhs /= h_rhs; + actual_lhs(CDIV_D_H) = tmp_d_lhs; + expected_lhs(CDIV_D_H) = d_lhs; + expected_lhs(CDIV_D_H) /= d_rhs; + + test_add<half_t, half_t, half_t>(ADD_H_H, ADD_H_H_SZ); + test_add<float, half_t, float>(ADD_S_H, ADD_S_H_SZ); + test_add<double, half_t, double>(ADD_D_H, ADD_D_H_SZ); + test_add<short int, half_t, half_t>(ADD_SI_H, ADD_SI_H_SZ); + test_add<int, half_t, half_t>(ADD_I_H, ADD_I_H_SZ); + test_add<long int, half_t, half_t>(ADD_LI_H, ADD_LI_H_SZ); + test_add<long long int, half_t, half_t>(ADD_LLI_H, ADD_LLI_H_SZ); + test_add<half_t, float, float>(ADD_H_S, ADD_H_S_SZ); + test_add<half_t, double, double>(ADD_H_D, ADD_H_D_SZ); + test_add<half_t, short int, half_t>(ADD_H_SI, ADD_H_SI_SZ); + test_add<half_t, int, half_t>(ADD_H_I, ADD_H_I_SZ); + test_add<half_t, long int, half_t>(ADD_H_LI, ADD_H_LI_SZ); + test_add<half_t, long long int, half_t>(ADD_H_LLI, ADD_H_LLI_SZ); + + // Check for potential overflow due to negative half_t -> unsigned integral + // cast + if (h_lhs >= 0) { + test_add<unsigned short int, half_t, half_t>(ADD_USI_H, ADD_USI_H_SZ); + test_add<unsigned int, half_t, half_t>(ADD_UI_H, ADD_UI_H_SZ); + test_add<unsigned long int, half_t, half_t>(ADD_ULI_H, ADD_ULI_H_SZ); + test_add<unsigned long long int, half_t, half_t>(ADD_ULLI_H, + ADD_ULLI_H_SZ); + } else { + actual_lhs(ADD_USI_H) = expected_lhs(ADD_USI_H); + actual_lhs(ADD_USI_H_SZ) = expected_lhs(ADD_USI_H_SZ); + actual_lhs(ADD_UI_H) = expected_lhs(ADD_UI_H); + actual_lhs(ADD_UI_H_SZ) = expected_lhs(ADD_UI_H_SZ); + actual_lhs(ADD_ULI_H) = 
expected_lhs(ADD_ULI_H); + actual_lhs(ADD_ULI_H_SZ) = expected_lhs(ADD_ULI_H_SZ); + actual_lhs(ADD_ULLI_H) = expected_lhs(ADD_ULLI_H); + actual_lhs(ADD_ULLI_H_SZ) = expected_lhs(ADD_ULLI_H_SZ); + } + + // Check for potential overflow due to negative half_t -> unsigned integral + // cast + if (h_rhs >= 0) { + test_add<half_t, unsigned short int, half_t>(ADD_H_USI, ADD_H_USI_SZ); + test_add<half_t, unsigned int, half_t>(ADD_H_UI, ADD_H_UI_SZ); + test_add<half_t, unsigned long int, half_t>(ADD_H_ULI, ADD_H_ULI_SZ); + test_add<half_t, unsigned long long int, half_t>(ADD_H_ULLI, + ADD_H_ULLI_SZ); + } else { + actual_lhs(ADD_H_USI) = expected_lhs(ADD_H_USI); + actual_lhs(ADD_H_USI_SZ) = expected_lhs(ADD_H_USI_SZ); + actual_lhs(ADD_H_UI) = expected_lhs(ADD_H_UI); + actual_lhs(ADD_H_UI_SZ) = expected_lhs(ADD_H_UI_SZ); + actual_lhs(ADD_H_ULI) = expected_lhs(ADD_H_ULI); + actual_lhs(ADD_H_ULI_SZ) = expected_lhs(ADD_H_ULI_SZ); + actual_lhs(ADD_H_ULLI) = expected_lhs(ADD_H_ULLI); + actual_lhs(ADD_H_ULLI_SZ) = expected_lhs(ADD_H_ULLI_SZ); + } + + test_sub<half_t, half_t, half_t>(SUB_H_H, SUB_H_H_SZ); + test_sub<float, half_t, float>(SUB_S_H, SUB_S_H_SZ); + test_sub<double, half_t, double>(SUB_D_H, SUB_D_H_SZ); + test_sub<short int, half_t, half_t>(SUB_SI_H, SUB_SI_H_SZ); + test_sub<int, half_t, half_t>(SUB_I_H, SUB_I_H_SZ); + test_sub<long int, half_t, half_t>(SUB_LI_H, SUB_LI_H_SZ); + test_sub<long long int, half_t, half_t>(SUB_LLI_H, SUB_LLI_H_SZ); + test_sub<half_t, float, float>(SUB_H_S, SUB_H_S_SZ); + test_sub<half_t, double, double>(SUB_H_D, SUB_H_D_SZ); + test_sub<half_t, short int, half_t>(SUB_H_SI, SUB_H_SI_SZ); + test_sub<half_t, int, half_t>(SUB_H_I, SUB_H_I_SZ); + test_sub<half_t, long int, half_t>(SUB_H_LI, SUB_H_LI_SZ); + test_sub<half_t, long long int, half_t>(SUB_H_LLI, SUB_H_LLI_SZ); + + // Check for potential overflow due to negative half_t -> unsigned integral + // cast + if (h_lhs >= half_t(0)) { + test_sub<unsigned short int, half_t, half_t>(SUB_USI_H, 
SUB_USI_H_SZ); + test_sub<unsigned int, half_t, half_t>(SUB_UI_H, SUB_UI_H_SZ); + test_sub<unsigned long int, half_t, half_t>(SUB_ULI_H, SUB_ULI_H_SZ); + test_sub<unsigned long long int, half_t, half_t>(SUB_ULLI_H, + SUB_ULLI_H_SZ); + } else { + actual_lhs(SUB_USI_H) = expected_lhs(SUB_USI_H); + actual_lhs(SUB_USI_H_SZ) = expected_lhs(SUB_USI_H_SZ); + actual_lhs(SUB_UI_H) = expected_lhs(SUB_UI_H); + actual_lhs(SUB_UI_H_SZ) = expected_lhs(SUB_UI_H_SZ); + actual_lhs(SUB_ULI_H) = expected_lhs(SUB_ULI_H); + actual_lhs(SUB_ULI_H_SZ) = expected_lhs(SUB_ULI_H_SZ); + actual_lhs(SUB_ULLI_H) = expected_lhs(SUB_ULLI_H); + actual_lhs(SUB_ULLI_H_SZ) = expected_lhs(SUB_ULLI_H_SZ); + } + + // Check for potential overflow due to negative half_t -> unsigned integral + // cast + if (h_rhs >= half_t(0)) { + test_sub<half_t, unsigned short int, half_t>(SUB_H_USI, SUB_H_USI_SZ); + test_sub<half_t, unsigned int, half_t>(SUB_H_UI, SUB_H_UI_SZ); + test_sub<half_t, unsigned long int, half_t>(SUB_H_ULI, SUB_H_ULI_SZ); + test_sub<half_t, unsigned long long int, half_t>(SUB_H_ULLI, + SUB_H_ULLI_SZ); + } else { + actual_lhs(SUB_H_USI) = expected_lhs(SUB_H_USI); + actual_lhs(SUB_H_USI_SZ) = expected_lhs(SUB_H_USI_SZ); + actual_lhs(SUB_H_UI) = expected_lhs(SUB_H_UI); + actual_lhs(SUB_H_UI_SZ) = expected_lhs(SUB_H_UI_SZ); + actual_lhs(SUB_H_ULI) = expected_lhs(SUB_H_ULI); + actual_lhs(SUB_H_ULI_SZ) = expected_lhs(SUB_H_ULI_SZ); + actual_lhs(SUB_H_ULLI) = expected_lhs(SUB_H_ULLI); + actual_lhs(SUB_H_ULLI_SZ) = expected_lhs(SUB_H_ULLI_SZ); + } + + test_mul<half_t, half_t, half_t>(MUL_H_H, MUL_H_H_SZ); + test_mul<float, half_t, float>(MUL_S_H, MUL_S_H_SZ); + test_mul<double, half_t, double>(MUL_D_H, MUL_D_H_SZ); + test_mul<short int, half_t, half_t>(MUL_SI_H, MUL_SI_H_SZ); + test_mul<int, half_t, half_t>(MUL_I_H, MUL_I_H_SZ); + test_mul<long int, half_t, half_t>(MUL_LI_H, MUL_LI_H_SZ); + test_mul<long long int, half_t, half_t>(MUL_LLI_H, MUL_LLI_H_SZ); + test_mul<half_t, float, float>(MUL_H_S, 
MUL_H_S_SZ); + test_mul<half_t, double, double>(MUL_H_D, MUL_H_D_SZ); + test_mul<half_t, short int, half_t>(MUL_H_SI, MUL_H_SI_SZ); + test_mul<half_t, int, half_t>(MUL_H_I, MUL_H_I_SZ); + test_mul<half_t, long int, half_t>(MUL_H_LI, MUL_H_LI_SZ); + test_mul<half_t, long long int, half_t>(MUL_H_LLI, MUL_H_LLI_SZ); + + // Check for potential overflow due to negative half_t -> unsigned integral + // cast + if (h_lhs >= half_t(0)) { + test_mul<unsigned short int, half_t, half_t>(MUL_USI_H, MUL_USI_H_SZ); + test_mul<unsigned int, half_t, half_t>(MUL_UI_H, MUL_UI_H_SZ); + test_mul<unsigned long int, half_t, half_t>(MUL_ULI_H, MUL_ULI_H_SZ); + test_mul<unsigned long long int, half_t, half_t>(MUL_ULLI_H, + MUL_ULLI_H_SZ); + } else { + actual_lhs(MUL_USI_H) = expected_lhs(MUL_USI_H); + actual_lhs(MUL_UI_H) = expected_lhs(MUL_UI_H); + actual_lhs(MUL_ULI_H) = expected_lhs(MUL_ULI_H); + actual_lhs(MUL_ULLI_H) = expected_lhs(MUL_ULLI_H); + actual_lhs(MUL_USI_H_SZ) = expected_lhs(MUL_USI_H_SZ); + actual_lhs(MUL_UI_H_SZ) = expected_lhs(MUL_UI_H_SZ); + actual_lhs(MUL_ULI_H_SZ) = expected_lhs(MUL_ULI_H_SZ); + actual_lhs(MUL_ULLI_H_SZ) = expected_lhs(MUL_ULLI_H_SZ); + } + + // Check for potential overflow due to negative half_t -> unsigned integral + // cast + if (h_rhs >= half_t(0)) { + test_mul<half_t, unsigned short int, half_t>(MUL_H_USI, MUL_H_USI_SZ); + test_mul<half_t, unsigned int, half_t>(MUL_H_UI, MUL_H_UI_SZ); + test_mul<half_t, unsigned long int, half_t>(MUL_H_ULI, MUL_H_ULI_SZ); + test_mul<half_t, unsigned long long int, half_t>(MUL_H_ULLI, + MUL_H_ULLI_SZ); + } else { + actual_lhs(MUL_H_USI) = expected_lhs(MUL_H_USI); + actual_lhs(MUL_H_UI) = expected_lhs(MUL_H_UI); + actual_lhs(MUL_H_ULI) = expected_lhs(MUL_H_ULI); + actual_lhs(MUL_H_ULLI) = expected_lhs(MUL_H_ULLI); + actual_lhs(MUL_H_USI_SZ) = expected_lhs(MUL_H_USI_SZ); + actual_lhs(MUL_H_UI_SZ) = expected_lhs(MUL_H_UI_SZ); + actual_lhs(MUL_H_ULI_SZ) = expected_lhs(MUL_H_ULI_SZ); + actual_lhs(MUL_H_ULLI_SZ) = 
expected_lhs(MUL_H_ULLI_SZ); + } + + test_div<half_t, half_t, half_t>(DIV_H_H, DIV_H_H_SZ); + test_div<float, half_t, float>(DIV_S_H, DIV_S_H_SZ); + test_div<double, half_t, double>(DIV_D_H, DIV_D_H_SZ); + test_div<short int, half_t, half_t>(DIV_SI_H, DIV_SI_H_SZ); + test_div<int, half_t, half_t>(DIV_I_H, DIV_I_H_SZ); + test_div<long int, half_t, half_t>(DIV_LI_H, DIV_LI_H_SZ); + test_div<long long int, half_t, half_t>(DIV_LLI_H, DIV_LLI_H_SZ); + test_div<half_t, float, float>(DIV_H_S, DIV_H_S_SZ); + test_div<half_t, double, double>(DIV_H_D, DIV_H_D_SZ); + + // Check for division by zero due to truncation by half_t -> integral cast + if (h_rhs >= half_t(1) || h_rhs <= half_t(-1)) { + test_div<half_t, short int, half_t>(DIV_H_SI, DIV_H_SI_SZ); + test_div<half_t, int, half_t>(DIV_H_I, DIV_H_I_SZ); + test_div<half_t, long int, half_t>(DIV_H_LI, DIV_H_LI_SZ); + test_div<half_t, long long int, half_t>(DIV_H_LLI, DIV_H_LLI_SZ); + } else { + actual_lhs(DIV_H_SI) = expected_lhs(DIV_H_SI); + actual_lhs(DIV_H_I) = expected_lhs(DIV_H_I); + actual_lhs(DIV_H_LI) = expected_lhs(DIV_H_LI); + actual_lhs(DIV_H_LLI) = expected_lhs(DIV_H_LLI); + actual_lhs(DIV_H_SI_SZ) = expected_lhs(DIV_H_SI_SZ); + actual_lhs(DIV_H_I_SZ) = expected_lhs(DIV_H_I_SZ); + actual_lhs(DIV_H_LI_SZ) = expected_lhs(DIV_H_LI_SZ); + actual_lhs(DIV_H_LLI_SZ) = expected_lhs(DIV_H_LLI_SZ); + } + + // Check for potential overflow due to negative half_t -> unsigned integral + // cast + if (h_lhs >= half_t(0)) { + test_div<unsigned short int, half_t, half_t>(DIV_USI_H, DIV_USI_H_SZ); + test_div<unsigned int, half_t, half_t>(DIV_UI_H, DIV_UI_H_SZ); + test_div<unsigned long int, half_t, half_t>(DIV_ULI_H, DIV_ULI_H_SZ); + test_div<unsigned long long int, half_t, half_t>(DIV_ULLI_H, + DIV_ULLI_H_SZ); + } else { + actual_lhs(DIV_USI_H) = expected_lhs(DIV_USI_H); + actual_lhs(DIV_UI_H) = expected_lhs(DIV_UI_H); + actual_lhs(DIV_ULI_H) = expected_lhs(DIV_ULI_H); + actual_lhs(DIV_ULLI_H) = expected_lhs(DIV_ULLI_H); + 
actual_lhs(DIV_USI_H_SZ) = expected_lhs(DIV_USI_H_SZ); + actual_lhs(DIV_UI_H_SZ) = expected_lhs(DIV_UI_H_SZ); + actual_lhs(DIV_ULI_H_SZ) = expected_lhs(DIV_ULI_H_SZ); + actual_lhs(DIV_ULLI_H_SZ) = expected_lhs(DIV_ULLI_H_SZ); + } + + // Check for division by zero due to truncation by half_t -> integral cast + if (h_rhs >= half_t(1)) { + test_div<half_t, unsigned short int, half_t>(DIV_H_USI, DIV_H_USI_SZ); + test_div<half_t, unsigned int, half_t>(DIV_H_UI, DIV_H_UI_SZ); + test_div<half_t, unsigned long int, half_t>(DIV_H_ULI, DIV_H_ULI_SZ); + test_div<half_t, unsigned long long int, half_t>(DIV_H_ULLI, + DIV_H_ULLI_SZ); + } else { + actual_lhs(DIV_H_USI) = expected_lhs(DIV_H_USI); + actual_lhs(DIV_H_USI_SZ) = expected_lhs(DIV_H_USI_SZ); + actual_lhs(DIV_H_UI) = expected_lhs(DIV_H_UI); + actual_lhs(DIV_H_UI_SZ) = expected_lhs(DIV_H_UI_SZ); + actual_lhs(DIV_H_ULI) = expected_lhs(DIV_H_ULI); + actual_lhs(DIV_H_ULI_SZ) = expected_lhs(DIV_H_ULI_SZ); + actual_lhs(DIV_H_ULLI) = expected_lhs(DIV_H_ULLI); + actual_lhs(DIV_H_ULLI_SZ) = expected_lhs(DIV_H_ULLI_SZ); + } + + // TODO: figure out why operator{!,&&,||} are returning __nv_bool + actual_lhs(NEG) = static_cast<double>(!h_lhs); + expected_lhs(NEG) = !d_lhs; + + actual_lhs(AND) = static_cast<double>(half_t(0) && h_lhs); + expected_lhs(AND) = double(0) && d_lhs; + + actual_lhs(OR) = static_cast<double>(h_lhs || half_t(1)); + expected_lhs(OR) = d_lhs || double(1); + + actual_lhs(EQ) = h_lhs == h_rhs; + expected_lhs(EQ) = d_lhs == d_rhs; + + actual_lhs(NEQ) = h_lhs != h_rhs; + expected_lhs(NEQ) = d_lhs != d_rhs; + + actual_lhs(LT) = h_lhs < h_rhs; + expected_lhs(LT) = d_lhs < d_rhs; + + actual_lhs(GT) = h_lhs > h_rhs; + expected_lhs(GT) = d_lhs > d_rhs; + + actual_lhs(LE) = h_lhs <= h_rhs; + expected_lhs(LE) = d_lhs <= d_rhs; + + actual_lhs(GE) = h_lhs >= h_rhs; + expected_lhs(GE) = d_lhs >= d_rhs; + + // actual_lhs(TW) = h_lhs <=> h_rhs; // Need C++20? + // expected_lhs(TW) = d_lhs <=> d_rhs; // Need C++20? 
+ + actual_lhs(PASS_BY_REF) = cast_from_half<double>(accept_ref(h_lhs)); + expected_lhs(PASS_BY_REF) = d_lhs; + + half_tmp = cast_from_half<float>(h_lhs); + tmp_ptr = &(tmp_lhs = half_tmp); + if (tmp_ptr != &tmp_lhs) + Kokkos::abort("Error in half_t address-of operator"); + actual_lhs(AO_IMPL_HALF) = cast_from_half<double>(*tmp_ptr); + expected_lhs(AO_IMPL_HALF) = d_lhs; + + tmp2_lhs = h_lhs; + tmp_ptr = &(tmp_lhs = tmp2_lhs); + if (tmp_ptr != &tmp_lhs) + Kokkos::abort("Error in half_t address-of operator"); + actual_lhs(AO_HALF_T) = cast_from_half<double>(tmp_ptr[0]); + expected_lhs(AO_HALF_T) = d_lhs; + + // TODO: Check upcasting and downcasting in large expressions involving + // integral and floating point types + } +}; + +void __test_half_operators(half_t h_lhs, half_t h_rhs) { + double epsilon = KOKKOS_HALF_T_IS_FLOAT ? FLT_EPSILON : FP16_EPSILON; + Functor_TestHalfOperators<ViewType> f_device(h_lhs, h_rhs); // Run on device + Functor_TestHalfOperators<ViewTypeHost> f_host(h_lhs, h_rhs); // Run on host + typename ViewType::HostMirror f_device_actual_lhs = + Kokkos::create_mirror_view(f_device.actual_lhs); + typename ViewType::HostMirror f_device_expected_lhs = + Kokkos::create_mirror_view(f_device.expected_lhs); + + ExecutionSpace().fence(); + Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs); + Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs); + for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { + // printf("op_test = %d\n", op_test); + ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), + epsilon); + ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), + epsilon); + } + + // Check whether half_t is trivially copyable + ASSERT_TRUE(std::is_trivially_copyable<half_t>::value); + constexpr size_t n = 2; + constexpr size_t n_bytes = sizeof(half_t) * n; + const half_t h_arr0 = half_t(0x89ab), h_arr1 = half_t(0xcdef); + half_t h_arr[n]; + char c_arr[n_bytes], *h_arr_ptr = nullptr; + size_t i; + 
+ h_arr[0] = h_arr0; + h_arr[1] = h_arr1; + h_arr_ptr = reinterpret_cast<char*>(h_arr); + + std::memcpy(c_arr, h_arr, n_bytes); + for (i = 0; i < n_bytes; i++) ASSERT_TRUE(c_arr[i] == h_arr_ptr[i]); + + std::memcpy(h_arr, c_arr, n_bytes); + ASSERT_TRUE(h_arr[0] == h_arr0); + ASSERT_TRUE(h_arr[1] == h_arr1); +} + +void test_half_operators() { + half_t h_lhs = half_t(0.23458), h_rhs = half_t(0.67898); + for (int i = -3; i < 2; i++) { + // printf("%f OP %f\n", float(h_lhs + cast_to_half(i + 1)), float(h_rhs + + // cast_to_half(i))); + __test_half_operators(h_lhs + cast_to_half(i + 1), h_rhs + cast_to_half(i)); + // TODO: __test_half_operators(h_lhs + cast_to_half(i + 1), half_t(0)); + // TODO: __test_half_operators(half_t(0), h_rhs + cast_to_half(i)); + } + // TODO: __test_half_operators(0, 0); +} + +TEST(TEST_CATEGORY, half_operators) { test_half_operators(); } +} // namespace Test +#endif // KOKKOS_ENABLE_SYCL +#endif // TESTHALFOPERATOR_HPP_ diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp new file mode 100644 index 0000000000000000000000000000000000000000..731e9fc36d9bf17aa93fc1e458d3058bf7a37994 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestHostSharedPtr.hpp @@ -0,0 +1,155 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <impl/Kokkos_HostSharedPtr.hpp> + +#include <gtest/gtest.h> + +using Kokkos::Impl::HostSharedPtr; + +TEST(TEST_CATEGORY, host_shared_ptr_use_count) { + using T = int; + { + HostSharedPtr<T> p1; + EXPECT_EQ(p1.use_count(), 0); + } + { + HostSharedPtr<T> p1(nullptr); + EXPECT_EQ(p1.use_count(), 0); + } + { + HostSharedPtr<T> p1(new T()); + EXPECT_EQ(p1.use_count(), 1); + } + { + HostSharedPtr<T> p1(new T(), [](T* p) { delete p; }); + EXPECT_EQ(p1.use_count(), 1); + } + { + T i; + HostSharedPtr<T> p1(&i, [](T*) {}); + EXPECT_EQ(p1.use_count(), 1); + } + { + HostSharedPtr<T> p1(new T()); + HostSharedPtr<T> p2(p1); // copy construction + EXPECT_EQ(p1.use_count(), 2); + EXPECT_EQ(p2.use_count(), 2); + } + { + HostSharedPtr<T> p1(new T()); + HostSharedPtr<T> p2(std::move(p1)); // move construction + EXPECT_EQ(p2.use_count(), 1); + } + { + HostSharedPtr<T> p1(new T()); + HostSharedPtr<T> p2; + p2 = p1; // copy assignment + EXPECT_EQ(p1.use_count(), 2); + EXPECT_EQ(p2.use_count(), 2); + } + { + HostSharedPtr<T> p1(new T()); + HostSharedPtr<T> p2; + p2 = std::move(p1); // move assignment + EXPECT_EQ(p2.use_count(), 1); + } +} + +TEST(TEST_CATEGORY, host_shared_ptr_get) { + using T = int; + { + HostSharedPtr<T> p1; + EXPECT_EQ(p1.get(), nullptr); + } + { + HostSharedPtr<T> p1(nullptr); + EXPECT_EQ(p1.get(), nullptr); + } + { + T* p_i = new T(); + HostSharedPtr<T> p1(p_i); + EXPECT_EQ(p1.get(), p_i); + } + { + T* p_i = new T(); + HostSharedPtr<T> p1(p_i, [](T* p) { delete p; }); + EXPECT_EQ(p1.get(), p_i); + } + { + T i; + HostSharedPtr<T> p1(&i, [](T*) {}); + EXPECT_EQ(p1.get(), &i); + } + { + T i; + HostSharedPtr<T> p1(&i, [](T*) {}); + HostSharedPtr<T> p2(p1); // copy construction + EXPECT_EQ(p1.get(), &i); + EXPECT_EQ(p1.get(), &i); + } + { + T i; + HostSharedPtr<T> p1(&i, [](T*) {}); + HostSharedPtr<T> p2(std::move(p1)); // move 
construction + EXPECT_EQ(p1.get(), nullptr); + EXPECT_EQ(p2.get(), &i); + } + { + T i; + HostSharedPtr<T> p1(&i, [](T*) {}); + HostSharedPtr<T> p2; + p2 = p1; // copy assignment + EXPECT_EQ(p1.get(), &i); + EXPECT_EQ(p1.get(), &i); + } + { + T i; + HostSharedPtr<T> p1(&i, [](T*) {}); + HostSharedPtr<T> p2; + p2 = std::move(p1); // move assignment + EXPECT_EQ(p1.get(), nullptr); + EXPECT_EQ(p2.get(), &i); + } +} diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp new file mode 100644 index 0000000000000000000000000000000000000000..18d1ac85188ca17cd7d127d3187103f42402be18 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp @@ -0,0 +1,156 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <impl/Kokkos_HostSharedPtr.hpp> +#include <Kokkos_Core.hpp> + +#include <gtest/gtest.h> + +using Kokkos::Impl::HostSharedPtr; + +namespace { + +class Data { + Kokkos::Array<char, 64> d; + + public: + KOKKOS_FUNCTION void write(char const* c) { + for (int i = 0; i < 64 && c; ++i, ++c) { + d[i] = *c; + } + } +}; + +template <class SmartPtr> +struct CheckAccessStoredPointerAndDereferenceOnDevice { + SmartPtr m_device_ptr; + using ElementType = typename SmartPtr::element_type; + static_assert(std::is_same<ElementType, Data>::value, ""); + + CheckAccessStoredPointerAndDereferenceOnDevice(SmartPtr device_ptr) + : m_device_ptr(device_ptr) { + int errors; + Kokkos::parallel_reduce(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), *this, + errors); + EXPECT_EQ(errors, 0); + } + + KOKKOS_FUNCTION void operator()(int, int& e) const { + auto raw_ptr = m_device_ptr.get(); // get + + auto tmp = new (raw_ptr) ElementType(); + + auto& obj = *m_device_ptr; // operator* + if (&obj != raw_ptr) ++e; + + m_device_ptr->write("hello world"); // operator-> + + 
tmp->~ElementType(); + } +}; + +template <class Ptr> +CheckAccessStoredPointerAndDereferenceOnDevice<Ptr> +check_access_stored_pointer_and_dereference_on_device(Ptr p) { + return {p}; +} + +template <class SmartPtr> +struct CheckSpecialMembersOnDevice { + SmartPtr m_device_ptr; + + KOKKOS_FUNCTION void operator()(int, int& e) const { + SmartPtr p1 = m_device_ptr; // copy construction + SmartPtr p2 = std::move(p1); // move construction + + p1 = p2; // copy assignment + p2 = std::move(p1); // move assignment + + SmartPtr p3; // default constructor + if (p3) ++e; + SmartPtr p4{nullptr}; + if (p4) ++e; + } + + CheckSpecialMembersOnDevice(SmartPtr device_ptr) : m_device_ptr(device_ptr) { + int errors; + Kokkos::parallel_reduce(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), *this, + errors); + EXPECT_EQ(errors, 0); + } +}; + +template <class Ptr> +CheckSpecialMembersOnDevice<Ptr> check_special_members_on_device(Ptr p) { + return {p}; +} + +} // namespace + +TEST(TEST_CATEGORY, host_shared_ptr_dereference_on_device) { + using T = Data; + + using MemorySpace = TEST_EXECSPACE::memory_space; + + HostSharedPtr<T> device_ptr( + static_cast<T*>(Kokkos::kokkos_malloc<MemorySpace>(sizeof(T))), + [](T* p) { Kokkos::kokkos_free<MemorySpace>(p); }); + + check_access_stored_pointer_and_dereference_on_device(device_ptr); +} + +// FIXME_OPENMPTARGET +#ifndef KOKKOS_ENABLE_OPENMPTARGET +TEST(TEST_CATEGORY, host_shared_ptr_special_members_on_device) { + using T = Data; + + using MemorySpace = TEST_EXECSPACE::memory_space; + + HostSharedPtr<T> device_ptr( + static_cast<T*>(Kokkos::kokkos_malloc<MemorySpace>(sizeof(T))), + [](T* p) { Kokkos::kokkos_free<MemorySpace>(p); }); + + check_special_members_on_device(device_ptr); +} +#endif diff --git a/packages/kokkos/core/unit_test/TestInit.hpp b/packages/kokkos/core/unit_test/TestInit.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f124c6202c5675a28c0b539b7d86e260b62c5874 --- /dev/null +++ 
b/packages/kokkos/core/unit_test/TestInit.hpp @@ -0,0 +1,74 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { +TEST(TEST_CATEGORY, init) { ; } + +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + +template <class ExecSpace> +void test_dispatch() { + const int repeat = 100; + for (int i = 0; i < repeat; ++i) { + for (int j = 0; j < repeat; ++j) { + Kokkos::parallel_for(Kokkos::RangePolicy<TEST_EXECSPACE>(0, j), + KOKKOS_LAMBDA(int){}); + } + } +} + +TEST(TEST_CATEGORY, dispatch) { test_dispatch<TEST_EXECSPACE>(); } +#endif + +} // namespace Test + +#include <TestCompilerMacros.hpp> +#include <TestPolicyConstruction.hpp> diff --git a/packages/kokkos/core/unit_test/TestIrregularLayout.hpp b/packages/kokkos/core/unit_test/TestIrregularLayout.hpp new file mode 100644 index 0000000000000000000000000000000000000000..86f9353e2d539ec931e0d373c3f6746497222ebc --- /dev/null +++ b/packages/kokkos/core/unit_test/TestIrregularLayout.hpp @@ -0,0 +1,264 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> +#define OFFSET_LIST_MAX_SIZE 100 + +namespace Kokkos { + +struct LayoutSelective { + //! 
Tag this class as a kokkos array layout + using array_layout = LayoutSelective; + + size_t offset_list[OFFSET_LIST_MAX_SIZE]; + size_t list_size; + + enum : bool { is_extent_constructible = false }; + + KOKKOS_INLINE_FUNCTION + LayoutSelective() { + for (int i = 0; i < OFFSET_LIST_MAX_SIZE; i++) { + offset_list[i] = i; + } + } + + KOKKOS_INLINE_FUNCTION + void assign(const size_t ol_[], const size_t size_) { + list_size = size_; + for (int i = 0; i < (int)list_size; i++) { + offset_list[i] = ol_[i]; + } + } + + KOKKOS_INLINE_FUNCTION + LayoutSelective(LayoutSelective const& rhs) { + assign(rhs.offset_list, rhs.list_size); + } + + KOKKOS_INLINE_FUNCTION + LayoutSelective(LayoutSelective&& rhs) { + assign(rhs.offset_list, rhs.list_size); + } + KOKKOS_INLINE_FUNCTION + LayoutSelective& operator=(LayoutSelective const& rhs) { + assign(rhs.offset_list, rhs.list_size); + return *this; + } + KOKKOS_INLINE_FUNCTION + LayoutSelective& operator=(LayoutSelective&& rhs) { + assign(rhs.offset_list, rhs.list_size); + return *this; + } + + KOKKOS_INLINE_FUNCTION + explicit LayoutSelective(const size_t ol_[], const size_t size_) { + assign(ol_, size_); + } + + KOKKOS_INLINE_FUNCTION + size_t offset(size_t ndx) const { + KOKKOS_ASSERT(ndx < list_size); + return offset_list[ndx]; + } +}; + +namespace Impl { +template <class Dimension> +struct ViewOffset<Dimension, Kokkos::LayoutSelective, void> { + public: + using is_mapping_plugin = std::true_type; + using is_regular = std::false_type; + + using size_type = size_t; + using dimension_type = Dimension; + using array_layout = Kokkos::LayoutSelective; + + //---------------------------------------- + dimension_type m_dim; + array_layout m_selective; + + // rank 1 + template <typename I0> + KOKKOS_INLINE_FUNCTION size_type operator()(I0 const& i0) const { + return m_selective.offset(i0); + } + + // This ViewOffset and the underlying layout only supports rank 1 Views + + //---------------------------------------- + + 
KOKKOS_INLINE_FUNCTION + array_layout layout() const { return array_layout(); } + + KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { + return m_dim.N0; + } + + /* Cardinality of the domain index space */ + KOKKOS_INLINE_FUNCTION + constexpr size_type size() const { return m_dim.N0; } + + public: + /* Span of the range space, largest stride * dimension */ + KOKKOS_INLINE_FUNCTION + constexpr size_type span() const { return m_dim.N0; } + + KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { + return false; + } + + /* Strides of dimensions */ + KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1; } + + // Stride with [ rank ] value is the total length + template <typename iType> + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + if (0 < dimension_type::rank) { + s[0] = 1; + } + for (int i = 1; i < 8; i++) s[i] = 0; + s[dimension_type::rank] = span(); + } + + //---------------------------------------- + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; + ViewOffset& operator=(const ViewOffset&) = default; + + KOKKOS_INLINE_FUNCTION + ViewOffset(std::integral_constant<unsigned, 0> const&, + Kokkos::LayoutSelective const& rhs) + : m_dim(rhs.list_size, 0, 0, 0, 0, 0, 0, 0), m_selective(rhs) {} +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Test { + +class InnerClass { + public: + long data[100]; + + KOKKOS_INLINE_FUNCTION + InnerClass() { + for (int i = 0; i < 100; i++) { + data[i] = (long)i; + } + } + + KOKKOS_INLINE_FUNCTION + void update(long d) { + for (int i = 0; i < 100; i++) { + data[i] += d; + } + } + + KOKKOS_INLINE_FUNCTION + void set(long d) { + for (int i = 0; i < 100; i++) { + data[i] = d; + } + } +}; + +template <class ExecutionSpace> +struct TestLayout { + const int N = 100; + size_t offsets[2] = {20, 40}; + using Layout = Kokkos::LayoutRight; + using SubLayout = Kokkos::LayoutSelective; + + // Allocate y, x vectors and Matrix A on device. 
+ using ViewVectorType = + Kokkos::View<InnerClass*, Layout, typename ExecutionSpace::memory_space>; + using SubViewVectorType = Kokkos::View<InnerClass*, SubLayout, + typename ExecutionSpace::memory_space, + Kokkos::MemoryUnmanaged>; + struct InitTag {}; + struct UpdateTag {}; + + ViewVectorType a; + SubLayout sl; + SubViewVectorType b; + TestLayout() : a("a", N), sl(offsets, 2), b(a.data(), sl) {} + + void run_test() { + Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace, InitTag>(0, N), + *this); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace, UpdateTag>(0, 2), + *this); + + validate_results(); + } + + // set all values + KOKKOS_INLINE_FUNCTION + void operator()(const InitTag&, const int i) const { a(i).update(i); } + + // update selective values + KOKKOS_INLINE_FUNCTION + void operator()(const UpdateTag&, const int i) const { + b(i).set(200 * (i + 1)); + } + + void validate_results() { + auto a_h = Kokkos::create_mirror_view(a); + Kokkos::deep_copy(a_h, a); + ASSERT_EQ(a_h(20).data[0], 200); + ASSERT_EQ(a_h(40).data[0], 400); + } +}; + +TEST(TEST_CATEGORY, view_irregular_layout) { + TestLayout<TEST_EXECSPACE> tl; + tl.run_test(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp b/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..80feb11f9b711bdbe2816d45d5df4c313e4c0865 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestLocalDeepCopy.hpp @@ -0,0 +1,1114 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> +#include <time.h> + +#include <Kokkos_Core.hpp> + +namespace Test { + +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_teampolicy_rank_1(const int N) { + // Allocate matrices on device. 
+ ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. + auto subA = + Kokkos::subview(A, 1, 1, 1, 1, 1, 1, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + // Deep Copy + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subSrc = Kokkos::subview(A, 1, 1, 1, 1, 1, 1, lid, Kokkos::ALL()); + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, 1, 1, lid, Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, 1, 1, lid, Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_teampolicy_rank_2(const int N) { + // Allocate matrices on device. 
+ ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. + auto subA = Kokkos::subview(A, 1, 1, 1, 1, 1, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + // Deep Copy + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subSrc = Kokkos::subview(A, 1, 1, 1, 1, 1, lid, Kokkos::ALL(), + Kokkos::ALL()); + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, 1, lid, Kokkos::ALL(), + Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, 1, lid, Kokkos::ALL(), + Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void 
impl_test_local_deepcopy_teampolicy_rank_3(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. + auto subA = Kokkos::subview(A, 1, 1, 1, 1, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + // Deep Copy + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subSrc = Kokkos::subview(A, 1, 1, 1, 1, lid, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, lid, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, lid, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N); +} 
+//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_teampolicy_rank_4(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. + auto subA = Kokkos::subview(A, 1, 1, 1, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + // Deep Copy + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subSrc = + Kokkos::subview(A, 1, 1, 1, lid, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + auto subDst = + Kokkos::subview(B, 1, 1, 1, lid, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subDst = + Kokkos::subview(B, 1, 1, 1, lid, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, 
B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_teampolicy_rank_5(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. + auto subA = + Kokkos::subview(A, 1, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + // Deep Copy + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subSrc = + Kokkos::subview(A, 1, 1, lid, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto subDst = + Kokkos::subview(B, 1, 1, lid, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subDst = + 
Kokkos::subview(B, 1, 1, lid, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_teampolicy_rank_6(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. + auto subA = Kokkos::subview(A, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + // Deep Copy + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subSrc = Kokkos::subview(A, 1, lid, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + auto subDst = Kokkos::subview(B, 1, lid, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, 
true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subDst = Kokkos::subview(B, 1, lid, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_teampolicy_rank_7(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. 
+ Kokkos::deep_copy(A, 10.0); + + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + // Deep Copy + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subSrc = Kokkos::subview( + A, lid, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto subDst = Kokkos::subview( + B, lid, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type& teamMember) { + int lid = teamMember.league_rank(); // returns a number between 0 and N + auto subDst = Kokkos::subview( + B, lid, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(teamMember, subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N * N * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_rangepolicy_rank_1(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. 
+ typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. + auto subA = + Kokkos::subview(A, 1, 1, 1, 1, 1, 1, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + // Deep Copy + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subSrc = Kokkos::subview(A, 1, 1, 1, 1, 1, 1, i, Kokkos::ALL()); + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, 1, 1, i, Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, 1, 1, i, Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_rangepolicy_rank_2(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. 
+ auto subA = Kokkos::subview(A, 1, 1, 1, 1, 1, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + // Deep Copy + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subSrc = + Kokkos::subview(A, 1, 1, 1, 1, 1, i, Kokkos::ALL(), Kokkos::ALL()); + auto subDst = + Kokkos::subview(B, 1, 1, 1, 1, 1, i, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subDst = + Kokkos::subview(B, 1, 1, 1, 1, 1, i, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_rangepolicy_rank_3(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. 
+ auto subA = Kokkos::subview(A, 1, 1, 1, 1, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + // Deep Copy + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subSrc = Kokkos::subview(A, 1, 1, 1, 1, i, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, i, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subDst = Kokkos::subview(B, 1, 1, 1, 1, i, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_rangepolicy_rank_4(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. 
+ auto subA = Kokkos::subview(A, 1, 1, 1, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + // Deep Copy + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subSrc = + Kokkos::subview(A, 1, 1, 1, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + auto subDst = + Kokkos::subview(B, 1, 1, 1, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subDst = + Kokkos::subview(B, 1, 1, 1, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_rangepolicy_rank_5(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. 
+ auto subA = + Kokkos::subview(A, 1, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + // Deep Copy + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subSrc = + Kokkos::subview(A, 1, 1, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto subDst = + Kokkos::subview(B, 1, 1, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subDst = + Kokkos::subview(B, 1, 1, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_rangepolicy_rank_6(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. 
+ auto subA = Kokkos::subview(A, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL()); + Kokkos::deep_copy(subA, 10.0); + + // Deep Copy + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subSrc = Kokkos::subview(A, 1, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + auto subDst = Kokkos::subview(B, 1, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subDst = Kokkos::subview(B, 1, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- +template <typename ExecSpace, typename ViewType> +void impl_test_local_deepcopy_rangepolicy_rank_7(const int N) { + // Allocate matrices on device. + ViewType A("A", N, N, N, N, N, N, N, N); + ViewType B("B", N, N, N, N, N, N, N, N); + + // Create host mirrors of device views. + typename ViewType::HostMirror h_A = Kokkos::create_mirror_view(A); + typename ViewType::HostMirror h_B = Kokkos::create_mirror_view(B); + + // Initialize A matrix. 
+ Kokkos::deep_copy(A, 10.0); + + // Deep Copy + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subSrc = Kokkos::subview( + A, i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto subDst = Kokkos::subview( + B, i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, subSrc); + }); + + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(h_B, B); + + bool test = true; + for (size_t i = 0; i < A.span(); i++) { + if (h_A.data()[i] != h_B.data()[i]) { + test = false; + break; + } + } + + ASSERT_EQ(test, true); + + // Fill + Kokkos::deep_copy(B, 0.0); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int& i) { + auto subDst = Kokkos::subview( + B, i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(subDst, 20.0); + }); + + Kokkos::deep_copy(h_B, B); + + double sum_all = 0.0; + for (size_t i = 0; i < B.span(); i++) { + sum_all += h_B.data()[i]; + } + + ASSERT_EQ(sum_all, 20.0 * N * N * N * N * N * N * N * N); +} +//------------------------------------------------------------------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutleft) { + using ExecSpace = TEST_EXECSPACE; + using ViewType = Kokkos::View<double********, Kokkos::LayoutLeft, ExecSpace>; + + { // Rank-1 + impl_test_local_deepcopy_teampolicy_rank_1<ExecSpace, ViewType>(8); + } + { // Rank-2 + impl_test_local_deepcopy_teampolicy_rank_2<ExecSpace, ViewType>(8); + } + { // Rank-3 + impl_test_local_deepcopy_teampolicy_rank_3<ExecSpace, ViewType>(8); + } + { // Rank-4 + impl_test_local_deepcopy_teampolicy_rank_4<ExecSpace, ViewType>(8); + } + { // Rank-5 + 
impl_test_local_deepcopy_teampolicy_rank_5<ExecSpace, ViewType>(8); + } + { // Rank-6 + impl_test_local_deepcopy_teampolicy_rank_6<ExecSpace, ViewType>(8); + } + { // Rank-7 + impl_test_local_deepcopy_teampolicy_rank_7<ExecSpace, ViewType>(8); + } +} +//------------------------------------------------------------------------------------------------------------- +TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutleft) { + using ExecSpace = TEST_EXECSPACE; + using ViewType = Kokkos::View<double********, Kokkos::LayoutLeft, ExecSpace>; + + { // Rank-1 + impl_test_local_deepcopy_rangepolicy_rank_1<ExecSpace, ViewType>(8); + } + { // Rank-2 + impl_test_local_deepcopy_rangepolicy_rank_2<ExecSpace, ViewType>(8); + } + { // Rank-3 + impl_test_local_deepcopy_rangepolicy_rank_3<ExecSpace, ViewType>(8); + } + { // Rank-4 + impl_test_local_deepcopy_rangepolicy_rank_4<ExecSpace, ViewType>(8); + } + { // Rank-5 + impl_test_local_deepcopy_rangepolicy_rank_5<ExecSpace, ViewType>(8); + } + { // Rank-6 + impl_test_local_deepcopy_rangepolicy_rank_6<ExecSpace, ViewType>(8); + } + { // Rank-7 + impl_test_local_deepcopy_rangepolicy_rank_7<ExecSpace, ViewType>(8); + } +} +//------------------------------------------------------------------------------------------------------------- +TEST(TEST_CATEGORY, local_deepcopy_teampolicy_layoutright) { + using ExecSpace = TEST_EXECSPACE; + using ViewType = Kokkos::View<double********, Kokkos::LayoutRight, ExecSpace>; + + { // Rank-1 + impl_test_local_deepcopy_teampolicy_rank_1<ExecSpace, ViewType>(8); + } + { // Rank-2 + impl_test_local_deepcopy_teampolicy_rank_2<ExecSpace, ViewType>(8); + } + { // Rank-3 + impl_test_local_deepcopy_teampolicy_rank_3<ExecSpace, ViewType>(8); + } + { // Rank-4 + impl_test_local_deepcopy_teampolicy_rank_4<ExecSpace, ViewType>(8); + } + { // Rank-5 + impl_test_local_deepcopy_teampolicy_rank_5<ExecSpace, ViewType>(8); + } + { // Rank-6 + impl_test_local_deepcopy_teampolicy_rank_6<ExecSpace, ViewType>(8); + } + { // 
Rank-7 + impl_test_local_deepcopy_teampolicy_rank_7<ExecSpace, ViewType>(8); + } +} +//------------------------------------------------------------------------------------------------------------- +TEST(TEST_CATEGORY, local_deepcopy_rangepolicy_layoutright) { + using ExecSpace = TEST_EXECSPACE; + using ViewType = Kokkos::View<double********, Kokkos::LayoutRight, ExecSpace>; + + { // Rank-1 + impl_test_local_deepcopy_rangepolicy_rank_1<ExecSpace, ViewType>(8); + } + { // Rank-2 + impl_test_local_deepcopy_rangepolicy_rank_2<ExecSpace, ViewType>(8); + } + { // Rank-3 + impl_test_local_deepcopy_rangepolicy_rank_3<ExecSpace, ViewType>(8); + } + { // Rank-4 + impl_test_local_deepcopy_rangepolicy_rank_4<ExecSpace, ViewType>(8); + } + { // Rank-5 + impl_test_local_deepcopy_rangepolicy_rank_5<ExecSpace, ViewType>(8); + } + { // Rank-6 + impl_test_local_deepcopy_rangepolicy_rank_6<ExecSpace, ViewType>(8); + } + { // Rank-7 + impl_test_local_deepcopy_rangepolicy_rank_7<ExecSpace, ViewType>(8); + } +} +#endif + +namespace Impl { +template <typename T, typename SHMEMTYPE> +using ShMemView = + Kokkos::View<T, Kokkos::LayoutRight, SHMEMTYPE, Kokkos::MemoryUnmanaged>; + +struct DeepCopyScratchFunctor { + DeepCopyScratchFunctor( + Kokkos::View<double*, TEST_EXECSPACE::memory_space> check_view_1, + Kokkos::View<double*, TEST_EXECSPACE::memory_space> check_view_2) + : check_view_1_(check_view_1), + check_view_2_(check_view_2), + N_(check_view_1.extent(0)) {} + + KOKKOS_INLINE_FUNCTION void operator()( + Kokkos::TeamPolicy<TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Dynamic>>::member_type team) + const { + using ShmemType = TEST_EXECSPACE::scratch_memory_space; + auto shview = + Impl::ShMemView<double**, ShmemType>(team.team_scratch(1), N_, 1); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N_), KOKKOS_LAMBDA(const size_t& index) { + auto thread_shview = Kokkos::subview(shview, index, Kokkos::ALL()); + Kokkos::Experimental::local_deep_copy(thread_shview, index); + }); + 
Kokkos::Experimental::local_deep_copy( + team, check_view_1_, Kokkos::subview(shview, Kokkos::ALL(), 0)); + + Kokkos::Experimental::local_deep_copy(team, shview, 6.); + Kokkos::Experimental::local_deep_copy( + team, check_view_2_, Kokkos::subview(shview, Kokkos::ALL(), 0)); + } + + Kokkos::View<double*, TEST_EXECSPACE::memory_space> check_view_1_; + Kokkos::View<double*, TEST_EXECSPACE::memory_space> check_view_2_; + int const N_; +}; +} // namespace Impl + +TEST(TEST_CATEGORY, deep_copy_scratch) { + using TestDeviceTeamPolicy = Kokkos::TeamPolicy<TEST_EXECSPACE>; + + const int N = 8; + const int bytes_per_team = + Impl::ShMemView<double**, + TEST_EXECSPACE::scratch_memory_space>::shmem_size(N, 1); + + TestDeviceTeamPolicy policy(1, Kokkos::AUTO); + auto team_exec = policy.set_scratch_size(1, Kokkos::PerTeam(bytes_per_team)); + + Kokkos::View<double*, TEST_EXECSPACE::memory_space> check_view_1("check_1", + N); + Kokkos::View<double*, TEST_EXECSPACE::memory_space> check_view_2("check_2", + N); + + Kokkos::parallel_for( + team_exec, Impl::DeepCopyScratchFunctor{check_view_1, check_view_2}); + auto host_copy_1 = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), check_view_1); + auto host_copy_2 = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), check_view_2); + + for (unsigned int i = 0; i < N; ++i) { + ASSERT_EQ(host_copy_1(i), i); + ASSERT_EQ(host_copy_2(i), 6.0); + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestMDRange.hpp b/packages/kokkos/core/unit_test/TestMDRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5618e40989b185a0233de2b20d6dec6636c9fe51 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDRange.hpp @@ -0,0 +1,3777 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace { + +using namespace Kokkos; + +template <typename ExecSpace> +struct TestMDRange_ReduceArray_2D { + using DataType = int; + using ViewType_2 = typename Kokkos::View<DataType **, ExecSpace>; + using HostViewType_2 = typename ViewType_2::HostMirror; + + ViewType_2 input_view; + + using scalar_type = double; + using value_type = scalar_type[]; + const unsigned value_count; + + TestMDRange_ReduceArray_2D(const int N0, const int N1, + const unsigned array_size) + : input_view("input_view", N0, N1), value_count(array_size) {} + + KOKKOS_INLINE_FUNCTION + void init(scalar_type dst[]) const { + for (unsigned i = 0; i < value_count; ++i) { + dst[i] = 0.0; + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile scalar_type dst[], + const volatile scalar_type src[]) const { + for (unsigned i = 0; i < value_count; ++i) { + dst[i] += src[i]; + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j) const { input_view(i, j) = 1; } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, value_type lsum) const { + lsum[0] += input_view(i, j) * 2; //+=6 each time if InitTag => N0*N1*6 + lsum[1] += input_view(i, j); //+=3 each time if InitTag => N0*N1*3 + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()(const InitTag &, const int i, const int j) const { + input_view(i, j) = 3; + } + + static void test_arrayreduce2(const int N0, const int N1) { + { + using range_type_init = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>, InitTag>; + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename 
range_type::point_type; + + range_type_init range_init(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + + const unsigned array_size = 2; + + TestMDRange_ReduceArray_2D functor(N0, N1, array_size); + + parallel_for(range_init, functor); // Init the view to 3's + + double sums[array_size]; + parallel_reduce(range, functor, sums); + + // Check output + // printf("Array Reduce result. N0 = %d N1 = %d N0*N1 = %d sums[0] = + // %lf sums[1] = %lf \n", N0, N1, N0*N1, sums[0], sums[1]); + + ASSERT_EQ(sums[0], 6 * N0 * N1); + ASSERT_EQ(sums[1], 3 * N0 * N1); + } + } +}; + +template <typename ExecSpace> +struct TestMDRange_ReduceArray_3D { + using DataType = int; + using ViewType_3 = typename Kokkos::View<DataType ***, ExecSpace>; + using HostViewType_3 = typename ViewType_3::HostMirror; + + ViewType_3 input_view; + + using scalar_type = double; + using value_type = scalar_type[]; + const unsigned value_count; + + TestMDRange_ReduceArray_3D(const int N0, const int N1, const int N2, + const unsigned array_size) + : input_view("input_view", N0, N1, N2), value_count(array_size) {} + + KOKKOS_INLINE_FUNCTION + void init(scalar_type dst[]) const { + for (unsigned i = 0; i < value_count; ++i) { + dst[i] = 0.0; + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile scalar_type dst[], + const volatile scalar_type src[]) const { + for (unsigned i = 0; i < value_count; ++i) { + dst[i] += src[i]; + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k) const { + input_view(i, j, k) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, + value_type lsum) const { + lsum[0] += + input_view(i, j, k) * 2; //+=6 each time if InitTag => N0*N1*N2*6 + lsum[1] += input_view(i, j, k); //+=3 each time if InitTag => N0*N1*N2*3 + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()(const 
InitTag &, const int i, const int j, + const int k) const { + input_view(i, j, k) = 3; + } + + static void test_arrayreduce3(const int N0, const int N1, const int N2) { + { + using range_type_init = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>, InitTag>; + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type_init range_init(point_type{{0, 0, 0}}, + point_type{{N0, N1, N2}}, + tile_type{{3, 3, 3}}); + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{3, 3, 3}}); + + const unsigned array_size = 2; + + TestMDRange_ReduceArray_3D functor(N0, N1, N2, array_size); + + parallel_for(range_init, functor); // Init the view to 3's + + double sums[array_size]; + parallel_reduce(range, functor, sums); + + ASSERT_EQ(sums[0], 6 * N0 * N1 * N2); + ASSERT_EQ(sums[1], 3 * N0 * N1 * N2); + } + } +}; + +template <typename ExecSpace> +struct TestMDRange_2D { + using DataType = int; + using ViewType = typename Kokkos::View<DataType **, ExecSpace>; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + using value_type = double; + + TestMDRange_2D(const DataType N0, const DataType N1) + : input_view("input_view", N0, N1) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j) const { input_view(i, j) = 1; } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, value_type &lsum) const { + lsum += input_view(i, j) * 2; + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()(const InitTag &, const int i, const int j) const { + input_view(i, j) = 3; + } + + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()(const InitTag &, const int i, const int j, + value_type &lsum) const { + lsum += input_view(i, j) * 3; + } + + static void 
test_reduce2(const int N0, const int N1) { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + double sum = 0.0; + parallel_reduce( + range, + KOKKOS_LAMBDA(const int /*i*/, const int /*j*/, double &lsum) { + lsum += 1.0; + }, + sum); + ASSERT_EQ(sum, N0 * N1); + } +#endif + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1); + } + + // Test with reducers - scalar + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + int s0 = 1; + int s1 = 1; + range_type range({{s0, s1}}, {{N0, N1}}, {{3, 3}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + value_type sum = 0.0; + Kokkos::Sum<value_type> reducer_scalar(sum); + + parallel_reduce(range, functor, reducer_scalar); + + ASSERT_EQ(sum, 2 * (N0 - s0) * (N1 - s1)); + } + // Test with reducers - scalar + label + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + int s0 = 1; + int s1 = 1; + range_type range({{s0, s1}}, {{N0, N1}}, {{3, 3}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for("rank2-parfor-label", range, functor); + + value_type sum = 0.0; + Kokkos::Sum<value_type> reducer_scalar(sum); + + parallel_reduce("rank2-reducer-label", range, functor, 
reducer_scalar); + + ASSERT_EQ(sum, 2 * (N0 - s0) * (N1 - s1)); + } + // Test with reducers - scalar view + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + range_type range({{0, 0}}, {{N0, N1}}, {{3, 3}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + value_type sum = 0.0; + Kokkos::View<value_type, Kokkos::HostSpace> sum_view("sum_view"); + sum_view() = sum; + Kokkos::Sum<value_type> reducer_view(sum_view); + + parallel_reduce(range, functor, reducer_view); + Kokkos::fence(); + sum = sum_view(); + + ASSERT_EQ(sum, 2 * N0 * N1); + } + // Test Min reducer with lambda +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + range_type range({{1, 1}}, {{N0, N1}}, {{3, 3}}); + + Kokkos::View<double **, ExecSpace> v_in("v_in", N0, N1); + + parallel_for( + "rank2-init-lambda", range, KOKKOS_LAMBDA(const int i, const int j) { + v_in(i, j) = (i + 1) * (j + 1); + }); + + double min; + Kokkos::Min<double> reducer_scalar(min); + + parallel_reduce( + "rank2-min-reducer", range, + KOKKOS_LAMBDA(const int i, const int j, double &min_val) { + min_val = Kokkos::Experimental::fmin(v_in(i, j), min_val); + }, + reducer_scalar); + + ASSERT_EQ(min, 4.0); + } +#endif + // Tagged operator test + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Default, Iterate::Default>, + Kokkos::IndexType<int>, InitTag>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{2, 4}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + int 
counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) { + if (h_view(i, j) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", + counter); + } + ASSERT_EQ(counter, 0); + + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 9 * N0 * N1); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Default, Iterate::Default>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{2, 6}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Left, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{2, 6}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Left, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{2, 6}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Right, 
Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{2, 6}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Right, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{2, 6}}); + + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1); + } + } // end test_reduce2 + + static void test_for2(const int N0, const int N1) { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + const int s0 = 1; + const int s1 = 1; + + range_type range(point_type{{s0, s1}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + + TestMDRange_2D::ViewType v("v", N0, N1); + + parallel_for( + range, KOKKOS_LAMBDA(const int i, const int j) { v(i, j) = 3; }); + + TestMDRange_2D::HostViewType h_view = Kokkos::create_mirror_view(v); + Kokkos::deep_copy(h_view, v); + + int counter = 0; + for (int i = s0; i < N0; ++i) + for (int j = s1; j < N1; ++j) { + if (h_view(i, j) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Offset Start + Default Layouts + InitTag op(): Errors in " + "test_for2; mismatches = %d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } +#endif + + { + using range_type = + typename 
Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>, InitTag>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + const int s0 = 1; + const int s1 = 1; + range_type range(point_type{{s0, s1}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = s0; i < N0; ++i) + for (int j = s1; j < N1; ++j) { + if (h_view(i, j) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Offset Start + Default Layouts + InitTag op(): Errors in " + "test_for2; mismatches = %d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, InitTag>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) { + if (h_view(i, j) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Default Layouts + InitTag op(): Errors in test_for2; mismatches = " + "%d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, InitTag>; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}); + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, 
functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) { + if (h_view(i, j) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Default Layouts + InitTag op() + Default Tile: Errors in " + "test_for2; mismatches = %d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) { + if (h_view(i, j) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf("No info: Errors in test_for2; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Default, Iterate::Default>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{4, 4}}); + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) { + if (h_view(i, j) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf("D D: Errors in test_for2; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Left, 
Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) { + if (h_view(i, j) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf("L L: Errors in test_for2; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Left, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{7, 7}}); + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) { + if (h_view(i, j) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf("L R: Errors in test_for2; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Right, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{16, 16}}); + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, 
functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) { + if (h_view(i, j) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf("R L: Errors in test_for2; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<2, Iterate::Right, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{5, 16}}); + TestMDRange_2D functor(N0, N1); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) { + if (h_view(i, j) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf("R R: Errors in test_for2; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + } // end test_for2 +}; // MDRange_2D + +template <typename ExecSpace> +struct TestMDRange_3D { + using DataType = int; + using ViewType = typename Kokkos::View<DataType ***, ExecSpace>; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + using value_type = double; + + TestMDRange_3D(const DataType N0, const DataType N1, const DataType N2) + : input_view("input_view", N0, N1, N2) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k) const { + input_view(i, j, k) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, double &lsum) const { + lsum += input_view(i, j, k) * 2; + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()(const InitTag &, const int i, const int j, + const int k) const { + input_view(i, j, k) = 3; + } + + // reduction tagged operators + 
KOKKOS_INLINE_FUNCTION + void operator()(const InitTag &, const int i, const int j, const int k, + value_type &lsum) const { + lsum += input_view(i, j, k) * 3; + } + + static void test_reduce3(const int N0, const int N1, const int N2) { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{3, 3, 3}}); + double sum = 0.0; + parallel_reduce( + range, + KOKKOS_LAMBDA(const int /*i*/, const int /*j*/, const int /*k*/, + double &lsum) { lsum += 1.0; }, + sum); + ASSERT_EQ(sum, N0 * N1 * N2); + } +#endif + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range(point_type{{s0, s1, s2}}, point_type{{N0, N1, N2}}, + tile_type{{3, 3, 3}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2)); + } + + // Test with reducers - scalar + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + range_type range({{0, 0, 0}}, {{N0, N1, N2}}, {{3, 3, 3}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + value_type sum = 0.0; + Kokkos::Sum<value_type> reducer_scalar(sum); + + parallel_reduce(range, functor, reducer_scalar); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2); + } + // Test with reducers - scalar + label + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + range_type range({{0, 0, 0}}, 
{{N0, N1, N2}}, {{3, 3, 3}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for("rank3-parfor-label", range, functor); + + value_type sum = 0.0; + Kokkos::Sum<value_type> reducer_scalar(sum); + + parallel_reduce("rank3-reducer-label", range, functor, reducer_scalar); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2); + } + // Test with reducers - scalar view + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + range_type range({{0, 0, 0}}, {{N0, N1, N2}}, {{3, 3, 3}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + value_type sum = 0.0; + Kokkos::View<value_type, Kokkos::HostSpace> sum_view("sum_view"); + sum_view() = sum; + Kokkos::Sum<value_type> reducer_view(sum_view); + + parallel_reduce(range, functor, reducer_view); + Kokkos::fence(); + sum = sum_view(); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2); + } + // Test Min reducer with lambda +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + + range_type range({{1, 1, 1}}, {{N0, N1, N2}}, {{3, 3, 3}}); + + Kokkos::View<double ***, ExecSpace> v_in("v_in", N0, N1, N2); + + parallel_for( + "rank3-init-lambda", range, + KOKKOS_LAMBDA(const int i, const int j, const int k) { + v_in(i, j, k) = (i + 1) * (j + 1) * (k + 1); + }); + + double min; + + parallel_reduce( + "rank3-min-reducer", range, + KOKKOS_LAMBDA(const int i, const int j, const int k, + double &min_val) { + min_val = (v_in(i, j, k) < min_val) ? 
v_in(i, j, k) : min_val; + }, + Kokkos::Min<double>(min)); + + if ((N0 - 1) * (N1 - 1) * (N2 - 1) > 0) + ASSERT_EQ(min, 8.0); + else { + double min_identity = Kokkos::reduction_identity<double>::min(); + ASSERT_EQ(min, min_identity); + } + } +#endif + + // Tagged operator test + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Default, Iterate::Default>, + Kokkos::IndexType<int>, InitTag>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{2, 4, 6}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) { + if (h_view(i, j, k) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", + counter); + } + ASSERT_EQ(counter, 0); + + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 9 * N0 * N1 * N2); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Default, Iterate::Default>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{2, 4, 6}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Left, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = 
typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{2, 4, 6}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Left, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{2, 4, 6}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Right, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{2, 4, 6}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Right, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{2, 4, 6}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2); + } + } // end test_reduce3 + + static void test_for3(const int N0, const 
int N1, const int N2) { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + const int s0 = 1; + const int s1 = 1; + const int s2 = 1; + + range_type range(point_type{{s0, s1, s2}}, point_type{{N0, N1, N2}}, + tile_type{{3, 3, 3}}); + + TestMDRange_3D::ViewType v("v", N0, N1, N2); + + parallel_for( + range, KOKKOS_LAMBDA(const int i, const int j, const int k) { + v(i, j, k) = 3; + }); + + TestMDRange_3D::HostViewType h_view = Kokkos::create_mirror_view(v); + Kokkos::deep_copy(h_view, v); + + int counter = 0; + for (int i = s0; i < N0; ++i) + for (int j = s1; j < N1; ++j) + for (int k = s2; k < N2; ++k) { + if (h_view(i, j, k) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Offset Start + Default Layouts + InitTag op(): Errors in " + "test_for3; mismatches = %d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } +#endif + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>>; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}); + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) { + if (h_view(i, j, k) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf("Defaults + No Tile: Errors in test_for3; mismatches = %d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>, InitTag>; + using tile_type = typename range_type::tile_type; + using point_type = typename 
range_type::point_type; + + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range(point_type{{s0, s1, s2}}, point_type{{N0, N1, N2}}, + tile_type{{3, 3, 3}}); + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = s0; i < N0; ++i) + for (int j = s1; j < N1; ++j) + for (int k = s2; k < N2; ++k) { + if (h_view(i, j, k) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Offset Start + Defaults + InitTag op(): Errors in test_for3; " + "mismatches = %d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{3, 3, 3}}); + + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) { + if (h_view(i, j, k) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for3; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Default, Iterate::Default>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{3, 3, 3}}); + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + HostViewType h_view = 
Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) { + if (h_view(i, j, k) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for3; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Left, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{2, 4, 2}}); + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) { + if (h_view(i, j, k) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for3; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Left, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{3, 5, 7}}); + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) { + if (h_view(i, j, k) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for3; mismatches = 
%d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Right, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + +#ifdef KOKKOS_ENABLE_SYCL + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{8, 8, 4}}); +#else + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{8, 8, 8}}); +#endif + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) { + if (h_view(i, j, k) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for3; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<3, Iterate::Right, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0}}, point_type{{N0, N1, N2}}, + tile_type{{2, 4, 2}}); + TestMDRange_3D functor(N0, N1, N2); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) { + if (h_view(i, j, k) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for3; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + } // end test_for3 +}; + +template <typename ExecSpace> +struct TestMDRange_4D { + using DataType = int; + using ViewType = typename 
Kokkos::View<DataType ****, ExecSpace>; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + using value_type = double; + + TestMDRange_4D(const DataType N0, const DataType N1, const DataType N2, + const DataType N3) + : input_view("input_view", N0, N1, N2, N3) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, const int l) const { + input_view(i, j, k, l) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, const int l, + double &lsum) const { + lsum += input_view(i, j, k, l) * 2; + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()(const InitTag &, const int i, const int j, const int k, + const int l) const { + input_view(i, j, k, l) = 3; + } + + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()(const InitTag &, const int i, const int j, const int k, + const int l, value_type &lsum) const { + lsum += input_view(i, j, k, l) * 3; + } + + static void test_reduce4(const int N0, const int N1, const int N2, + const int N3) { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{3, 3, 3, 3}}); + double sum = 0.0; + parallel_reduce( + range, + KOKKOS_LAMBDA(const int /*i*/, const int /*j*/, const int /*k*/, + const int /*l*/, double &lsum) { lsum += 1.0; }, + sum); + ASSERT_EQ(sum, N0 * N1 * N2 * N3); + } +#endif + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type 
range(point_type{{s0, s1, s2, s3}}, + point_type{{N0, N1, N2, N3}}, tile_type{{3, 3, 3, 3}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3)); + } + + // Test with reducers - scalar + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>>; + range_type range({{0, 0, 0, 0}}, {{N0, N1, N2, N3}}, {{3, 3, 3, 3}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + value_type sum = 0.0; + Kokkos::Sum<value_type> reducer_scalar(sum); + + parallel_reduce(range, functor, reducer_scalar); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3); + } + + // Test with reducers - scalar + label + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>>; + range_type range({{0, 0, 0, 0}}, {{N0, N1, N2, N3}}, {{3, 3, 3, 3}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for("rank4-parfor-label", range, functor); + + value_type sum = 0.0; + Kokkos::Sum<value_type> reducer_scalar(sum); + + parallel_reduce("rank4-reducer-label", range, functor, reducer_scalar); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3); + } + + // Test with reducers - scalar view + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>>; + range_type range({{0, 0, 0, 0}}, {{N0, N1, N2, N3}}, {{3, 3, 3, 3}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + value_type sum = 0.0; + Kokkos::View<value_type, Kokkos::HostSpace> sum_view("sum_view"); + sum_view() = sum; + Kokkos::Sum<value_type> reducer_view(sum_view); + + parallel_reduce(range, functor, reducer_view); + Kokkos::fence(); + sum = sum_view(); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3); + } + + // Test Min reducer with lambda +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + { + 
using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>>; + + range_type range({{1, 1, 1, 1}}, {{N0, N1, N2, N3}}, {{3, 3, 3, 3}}); + + Kokkos::View<double ****, ExecSpace> v_in("v_in", N0, N1, N2, N3); + + parallel_for( + "rank4-init-lambda", range, + KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { + v_in(i, j, k, l) = (i + 1) * (j + 1) * (k + 1) * (l + 1); + }); + + double min; + + parallel_reduce( + "rank4-min-reducer", range, + KOKKOS_LAMBDA(const int i, const int j, const int k, const int l, + double &min_val) { + min_val = (v_in(i, j, k, l) < min_val) ? v_in(i, j, k, l) : min_val; + }, + Kokkos::Min<double>(min)); + + ASSERT_EQ(min, 16.0); + } +#endif + + // Tagged operator test + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Default, Iterate::Default>, + Kokkos::IndexType<int>, InitTag>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{2, 4, 6, 2}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) + for (int l = 0; l < N3; ++l) { + if (h_view(i, j, k, l) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Defaults + InitTag op(): Errors in test_reduce4 parallel_for " + "init; mismatches = %d\n\n", + counter); + } + ASSERT_EQ(counter, 0); + + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 9 * N0 * N1 * N2 * N3); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Default, Iterate::Default>, + 
Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{2, 4, 6, 2}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Left, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{2, 4, 6, 2}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Left, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{2, 4, 6, 2}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Right, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{2, 4, 6, 2}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, 
sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Right, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{2, 4, 6, 2}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3); + } + } // end test_reduce + + static void test_for4(const int N0, const int N1, const int N2, + const int N3) { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + const int s0 = 1; + const int s1 = 1; + const int s2 = 1; + const int s3 = 1; + + range_type range(point_type{{s0, s1, s2, s3}}, + point_type{{N0, N1, N2, N3}}, tile_type{{3, 3, 3, 3}}); + + TestMDRange_4D::ViewType v("v", N0, N1, N2, N3); + + parallel_for( + range, KOKKOS_LAMBDA(const int i, const int j, const int k, + const int l) { v(i, j, k, l) = 3; }); + + TestMDRange_4D::HostViewType h_view = Kokkos::create_mirror_view(v); + Kokkos::deep_copy(h_view, v); + + int counter = 0; + for (int i = s0; i < N0; ++i) + for (int j = s1; j < N1; ++j) + for (int k = s2; k < N2; ++k) + for (int l = s3; l < N3; ++l) { + if (h_view(i, j, k, l) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Offset Start + Default Layouts + InitTag op(): Errors in " + "test_for4; mismatches = %d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } +#endif + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>>; + using point_type = typename range_type::point_type; + + range_type 
range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}); + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) + for (int l = 0; l < N3; ++l) { + if (h_view(i, j, k, l) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf("Defaults + No Tile: Errors in test_for4; mismatches = %d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>, InitTag>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; +#ifdef KOKKOS_ENABLE_SYCL + range_type range(point_type{{s0, s1, s2, s3}}, + point_type{{N0, N1, N2, N3}}, tile_type{{3, 11, 3, 2}}); +#else + range_type range(point_type{{s0, s1, s2, s3}}, + point_type{{N0, N1, N2, N3}}, tile_type{{3, 11, 3, 3}}); +#endif + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = s0; i < N0; ++i) + for (int j = s1; j < N1; ++j) + for (int k = s2; k < N2; ++k) + for (int l = s3; l < N3; ++l) { + if (h_view(i, j, k, l) != 3) { + ++counter; + } + } + + if (counter != 0) { + printf( + "Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): " + "Errors in test_for4; mismatches = %d\n\n", + counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type 
range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{4, 4, 4, 4}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) + for (int l = 0; l < N3; ++l) { + if (h_view(i, j, k, l) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for4; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Default, Iterate::Default>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{4, 4, 4, 4}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) + for (int l = 0; l < N3; ++l) { + if (h_view(i, j, k, l) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for4; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Left, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{4, 4, 4, 4}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + HostViewType h_view = 
Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) + for (int l = 0; l < N3; ++l) { + if (h_view(i, j, k, l) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for4; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Left, Iterate::Right>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{4, 4, 4, 4}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) + for (int l = 0; l < N3; ++l) { + if (h_view(i, j, k, l) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for4; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + + { + using range_type = typename Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank<4, Iterate::Right, Iterate::Left>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0, 0, 0}}, point_type{{N0, N1, N2, N3}}, + tile_type{{4, 4, 4, 4}}); + + TestMDRange_4D functor(N0, N1, N2, N3); + + parallel_for(range, functor); + + HostViewType h_view = Kokkos::create_mirror_view(functor.input_view); + Kokkos::deep_copy(h_view, functor.input_view); + + int counter = 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) + for (int l = 0; l < 
// Functor + static test drivers exercising rank-5 Kokkos::MDRangePolicy.
// The functor owns a rank-5 view; parallel_for operators stamp a marker
// value into every entry and parallel_reduce operators accumulate a
// multiple of the stored values, so each ASSERT below can check coverage
// of the full (possibly offset) index space in closed form.
template <typename ExecSpace>
struct TestMDRange_5D {
  using DataType     = int;
  using ViewType     = typename Kokkos::View<DataType *****, ExecSpace>;
  using HostViewType = typename ViewType::HostMirror;

  ViewType input_view;
  using value_type = double;

  TestMDRange_5D(const DataType N0, const DataType N1, const DataType N2,
                 const DataType N3, const DataType N4)
      : input_view("input_view", N0, N1, N2, N3, N4) {}

  // untagged parallel_for operator: stamp every visited entry with 1
  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, const int j, const int k, const int l,
                  const int m) const {
    input_view(i, j, k, l, m) = 1;
  }

  // untagged reduction operator: accumulate 2x each visited entry
  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, const int j, const int k, const int l,
                  const int m, value_type &lsum) const {
    lsum += input_view(i, j, k, l, m) * 2;
  }

  // tagged operators
  struct InitTag {};
  // InitTag parallel_for operator: stamp every visited entry with 3
  KOKKOS_INLINE_FUNCTION
  void operator()(const InitTag &, const int i, const int j, const int k,
                  const int l, const int m) const {
    input_view(i, j, k, l, m) = 3;
  }

  // reduction tagged operators
  KOKKOS_INLINE_FUNCTION
  void operator()(const InitTag &, const int i, const int j, const int k,
                  const int l, const int m, value_type &lsum) const {
    lsum += input_view(i, j, k, l, m) * 3;
  }

  // Exercises rank-5 parallel_reduce: lambda reductions, functor
  // reductions over an offset range, reducer objects (scalar, labeled,
  // view-backed), a Min reducer, and a tagged-operator reduction.
  static void test_reduce5(const int N0, const int N1, const int N2,
                           const int N3, const int N4) {
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
    // Plain lambda reduction: each iterate contributes 1.0, so the sum
    // must equal the extent of the full index space.
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>,
                                         Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{3, 3, 3, 3, 1}});
      double sum = 0.0;
      parallel_reduce(
          range,
          KOKKOS_LAMBDA(const int /*i*/, const int /*j*/, const int /*k*/,
                        const int /*l*/, const int /*m*/,
                        double &lsum) { lsum += 1.0; },
          sum);
      ASSERT_EQ(sum, N0 * N1 * N2 * N3 * N4);
    }
#endif

    // Functor reduction over an offset (non-zero lower bound) range:
    // entries are first set to 1, reduce sums 2x each visited entry.
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>,
                                         Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      int s0 = 1;
      int s1 = 1;
      int s2 = 1;
      int s3 = 1;
      int s4 = 1;
      range_type range(point_type{{s0, s1, s2, s3, s4}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{3, 3, 3, 3, 3}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);
      double sum = 0.0;
      parallel_reduce(range, functor, sum);

      ASSERT_EQ(sum,
                2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4));
    }

    // Test with reducers - scalar
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>,
                                         Kokkos::IndexType<int>>;
      range_type range({{0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4}},
                       {{3, 3, 3, 3, 3}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      value_type sum = 0.0;
      Kokkos::Sum<value_type> reducer_scalar(sum);

      parallel_reduce(range, functor, reducer_scalar);

      ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3 * N4);
    }

    // Test with reducers - scalar + label
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>,
                                         Kokkos::IndexType<int>>;
      range_type range({{0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4}},
                       {{3, 3, 3, 3, 3}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for("rank5-parfor-label", range, functor);

      value_type sum = 0.0;
      Kokkos::Sum<value_type> reducer_scalar(sum);

      parallel_reduce("rank5-reducer-label", range, functor, reducer_scalar);

      ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3 * N4);
    }

    // Test with reducers - scalar view
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>,
                                         Kokkos::IndexType<int>>;
      range_type range({{0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4}},
                       {{3, 3, 3, 3, 3}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      value_type sum = 0.0;
      Kokkos::View<value_type, Kokkos::HostSpace> sum_view("sum_view");
      sum_view() = sum;
      Kokkos::Sum<value_type> reducer_view(sum_view);

      parallel_reduce(range, functor, reducer_view);
      // fence before reading the result back from the view
      Kokkos::fence();
      sum = sum_view();

      ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3 * N4);
    }

    // Test Min reducer with lambda
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>,
                                         Kokkos::IndexType<int>>;

      range_type range({{1, 1, 1, 1, 1}}, {{N0, N1, N2, N3, N4}},
                       {{3, 3, 3, 2, 2}});

      Kokkos::View<double *****, ExecSpace> v_in("v_in", N0, N1, N2, N3, N4);

      parallel_for(
          "rank5-init-lambda", range,
          KOKKOS_LAMBDA(const int i, const int j, const int k, const int l,
                        const int m) {
            v_in(i, j, k, l, m) =
                (i + 1) * (j + 1) * (k + 1) * (l + 1) * (m + 1);
          });

      double min;

      parallel_reduce(
          "rank5-min-reducer", range,
          KOKKOS_LAMBDA(const int i, const int j, const int k, const int l,
                        const int m, double &min_val) {
            min_val =
                (v_in(i, j, k, l, m) < min_val) ? v_in(i, j, k, l, m) : min_val;
          },
          Kokkos::Min<double>(min));

      // lower bounds are all 1, so the minimum product is 2^5 = 32
      ASSERT_EQ(min, 32.0);
    }
#endif

    // Tagged operator test
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<5, Iterate::Default, Iterate::Default>,
          Kokkos::IndexType<int>, InitTag>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{2, 4, 6, 2, 2}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      // check parallel_for results correct with InitTag
      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);
      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 3) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf(
            "Defaults + InitTag op(): Errors in test_reduce5 parallel_for "
            "init; mismatches = %d\n\n",
            counter);
      }
      ASSERT_EQ(counter, 0);

      // tagged reduce accumulates 3 * (value 3) per entry
      double sum = 0.0;
      parallel_reduce(range, functor, sum);

      ASSERT_EQ(sum, 9 * N0 * N1 * N2 * N3 * N4);
    }
  }

  // Exercises rank-5 parallel_for over every iteration-order combination
  // (Default/Left/Right inner and outer), with and without tiles, with
  // offset lower bounds, and with the tagged operator; each variant is
  // verified element-by-element on a host mirror.
  static void test_for5(const int N0, const int N1, const int N2, const int N3,
                        const int N4) {
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
    // Lambda fill over an offset range
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>,
                                         Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      const int s0 = 1;
      const int s1 = 1;
      const int s2 = 1;
      const int s3 = 1;
      const int s4 = 1;

      range_type range(point_type{{s0, s1, s2, s3, s4}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{3, 3, 3, 3, 1}});

      TestMDRange_5D::ViewType v("v", N0, N1, N2, N3, N4);

      parallel_for(
          range,
          KOKKOS_LAMBDA(const int i, const int j, const int k, const int l,
                        const int m) { v(i, j, k, l, m) = 3; });

      TestMDRange_5D::HostViewType h_view = Kokkos::create_mirror_view(v);
      Kokkos::deep_copy(h_view, v);

      int counter = 0;
      for (int i = s0; i < N0; ++i)
        for (int j = s1; j < N1; ++j)
          for (int k = s2; k < N2; ++k)
            for (int l = s3; l < N3; ++l)
              for (int m = s4; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 3) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf(
            "Offset Start + Default Layouts + InitTag op(): Errors in "
            "test_for5; mismatches = %d\n\n",
            counter);
      }

      ASSERT_EQ(counter, 0);
    }
#endif

    // Default policy, no explicit tile sizes
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>>;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4}});
      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 1) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf("Defaults + No Tile: Errors in test_for5; mismatches = %d\n\n",
               counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // InitTag operator over an offset range
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>,
                                         Kokkos::IndexType<int>, InitTag>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      int s0 = 1;
      int s1 = 1;
      int s2 = 1;
      int s3 = 1;
      int s4 = 1;
#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{s0, s1, s2, s3, s4}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{3, 3, 3, 3, 3}});
#else
      range_type range(point_type{{s0, s1, s2, s3, s4}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{3, 3, 3, 3, 5}});
#endif

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = s0; i < N0; ++i)
        for (int j = s1; j < N1; ++j)
          for (int k = s2; k < N2; ++k)
            for (int l = s3; l < N3; ++l)
              for (int m = s4; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 3) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf(
            "Offset Start + Defaults + InitTag op(): Errors in test_for5; "
            "mismatches = %d\n\n",
            counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Explicit tile sizes with default iteration order
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>,
                                         Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{4, 4, 4, 2, 2}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 1) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf(" Errors in test_for5; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Default / Iterate::Default
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<5, Iterate::Default, Iterate::Default>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{4, 4, 4, 2, 2}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 1) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf(" Errors in test_for5; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Left / Iterate::Left
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<5, Iterate::Left, Iterate::Left>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{4, 4, 4, 2, 2}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 1) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf(" Errors in test_for5; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Left / Iterate::Right
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<5, Iterate::Left, Iterate::Right>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{4, 4, 4, 2, 2}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 1) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf(" Errors in test_for5; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Right / Iterate::Left
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<5, Iterate::Right, Iterate::Left>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{4, 4, 4, 2, 2}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 1) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf(" Errors in test_for5; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Right / Iterate::Right
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<5, Iterate::Right, Iterate::Right>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4}},
                       tile_type{{4, 4, 4, 2, 2}});

      TestMDRange_5D functor(N0, N1, N2, N3, N4);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m) {
                if (h_view(i, j, k, l, m) != 1) {
                  ++counter;
                }
              }

      if (counter != 0) {
        printf(" Errors in test_for5; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }
  }
};
// Functor + static test drivers exercising rank-6 Kokkos::MDRangePolicy.
// Mirrors TestMDRange_5D with one extra dimension; several tile choices
// are reduced under KOKKOS_ENABLE_SYCL (and one under CUDA debug, see
// the inline note) to stay within backend per-team limits.
template <typename ExecSpace>
struct TestMDRange_6D {
  using DataType     = int;
  using ViewType     = typename Kokkos::View<DataType ******, ExecSpace>;
  using HostViewType = typename ViewType::HostMirror;

  ViewType input_view;
  using value_type = double;

  TestMDRange_6D(const DataType N0, const DataType N1, const DataType N2,
                 const DataType N3, const DataType N4, const DataType N5)
      : input_view("input_view", N0, N1, N2, N3, N4, N5) {}

  // untagged parallel_for operator: stamp every visited entry with 1
  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, const int j, const int k, const int l,
                  const int m, const int n) const {
    input_view(i, j, k, l, m, n) = 1;
  }

  // untagged reduction operator: accumulate 2x each visited entry
  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, const int j, const int k, const int l,
                  const int m, const int n, value_type &lsum) const {
    lsum += input_view(i, j, k, l, m, n) * 2;
  }

  // tagged operators
  struct InitTag {};
  // InitTag parallel_for operator: stamp every visited entry with 3
  KOKKOS_INLINE_FUNCTION
  void operator()(const InitTag &, const int i, const int j, const int k,
                  const int l, const int m, const int n) const {
    input_view(i, j, k, l, m, n) = 3;
  }

  // reduction tagged operators
  KOKKOS_INLINE_FUNCTION
  void operator()(const InitTag &, const int i, const int j, const int k,
                  const int l, const int m, const int n,
                  value_type &lsum) const {
    lsum += input_view(i, j, k, l, m, n) * 3;
  }

  // Exercises rank-6 parallel_reduce: lambda reductions, functor
  // reductions over an offset range, reducer objects (scalar, labeled,
  // view-backed), a Min reducer, and a tagged-operator reduction.
  static void test_reduce6(const int N0, const int N1, const int N2,
                           const int N3, const int N4, const int N5) {
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
    // Plain lambda reduction: each iterate contributes 1.0, so the sum
    // must equal the extent of the full index space.
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                         Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{3, 3, 3, 3, 1, 1}});
      double sum = 0.0;
      parallel_reduce(
          range,
          KOKKOS_LAMBDA(const int /*i*/, const int /*j*/, const int /*k*/,
                        const int /*l*/, const int /*m*/, const int /*n*/,
                        double &lsum) { lsum += 1.0; },
          sum);
      ASSERT_EQ(sum, N0 * N1 * N2 * N3 * N4 * N5);
    }
#endif

    // Functor reduction over an offset (non-zero lower bound) range
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                         Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      int s0 = 1;
      int s1 = 1;
      int s2 = 1;
      int s3 = 1;
      int s4 = 1;
      int s5 = 1;

#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{s0, s1, s2, s3, s4, s5}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{3, 3, 3, 2, 2, 2}});
#else
      range_type range(point_type{{s0, s1, s2, s3, s4, s5}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{3, 3, 3, 3, 3, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);
      double sum = 0.0;
      parallel_reduce(range, functor, sum);

      ASSERT_EQ(sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) *
                         (N4 - s4) * (N5 - s5));
    }

    // Test with reducers - scalar
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                         Kokkos::IndexType<int>>;
#ifdef KOKKOS_ENABLE_SYCL
      range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                       {{3, 3, 3, 2, 2, 2}});
#else
      range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                       {{3, 3, 3, 3, 3, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      value_type sum = 0.0;
      Kokkos::Sum<value_type> reducer_scalar(sum);

      parallel_reduce(range, functor, reducer_scalar);

      ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3 * N4 * N5);
    }

    // Test with reducers - scalar + label
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                         Kokkos::IndexType<int>>;

#ifdef KOKKOS_ENABLE_SYCL
      range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                       {{3, 3, 3, 2, 2, 2}});
#else
      range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                       {{3, 3, 3, 3, 3, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for("rank6-parfor-label", range, functor);

      value_type sum = 0.0;
      Kokkos::Sum<value_type> reducer_scalar(sum);

      parallel_reduce("rank6-reducer-label", range, functor, reducer_scalar);

      ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3 * N4 * N5);
    }

    // Test with reducers - scalar view
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                         Kokkos::IndexType<int>>;
#ifdef KOKKOS_ENABLE_SYCL
      range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                       {{3, 3, 3, 2, 2, 2}});
#else
      range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                       {{3, 3, 3, 3, 3, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      value_type sum = 0.0;
      Kokkos::View<value_type, Kokkos::HostSpace> sum_view("sum_view");
      sum_view() = sum;
      Kokkos::Sum<value_type> reducer_view(sum_view);

      parallel_reduce(range, functor, reducer_view);
      // fence before reading the result back from the view
      Kokkos::fence();
      sum = sum_view();

      ASSERT_EQ(sum, 2 * N0 * N1 * N2 * N3 * N4 * N5);
    }

    // Test Min reducer with lambda
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                         Kokkos::IndexType<int>>;
      range_type range({{1, 1, 1, 1, 1, 1}}, {{N0, N1, N2, N3, N4, N5}},
                       {{3, 3, 3, 2, 2, 1}});

      Kokkos::View<double ******, ExecSpace> v_in("v_in", N0, N1, N2, N3, N4,
                                                  N5);

      parallel_for(
          "rank6-init-lambda", range,
          KOKKOS_LAMBDA(const int i, const int j, const int k, const int l,
                        const int m, const int n) {
            v_in(i, j, k, l, m, n) =
                (i + 1) * (j + 1) * (k + 1) * (l + 1) * (m + 1) * (n + 1);
          });

      double min;

      parallel_reduce(
          "rank6-min-reducer", range,
          KOKKOS_LAMBDA(const int i, const int j, const int k, const int l,
                        const int m, const int n, double &min_val) {
            min_val = (v_in(i, j, k, l, m, n) < min_val)
                          ? v_in(i, j, k, l, m, n)
                          : min_val;
          },
          Kokkos::Min<double>(min));

      // lower bounds are all 1, so the minimum product is 2^6 = 64
      ASSERT_EQ(min, 64.0);
    }
#endif

    // Tagged operator test
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
          Kokkos::IndexType<int>, InitTag>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{2, 4, 4, 2, 2, 2}});
#else
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{2, 4, 6, 2, 2, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      // check parallel_for results correct with InitTag
      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);
      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m)
                for (int n = 0; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 3) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf(
            "Defaults + InitTag op(): Errors in test_reduce6 parallel_for "
            "init; mismatches = %d\n\n",
            counter);
      }
      ASSERT_EQ(counter, 0);

      // tagged reduce accumulates 3 * (value 3) per entry
      double sum = 0.0;
      parallel_reduce(range, functor, sum);

      ASSERT_EQ(sum, 9 * N0 * N1 * N2 * N3 * N4 * N5);
    }
  }

  // Exercises rank-6 parallel_for over every iteration-order combination
  // (Default/Left/Right inner and outer), with and without tiles, with
  // offset lower bounds, and with the tagged operator; each variant is
  // verified element-by-element on a host mirror.
  static void test_for6(const int N0, const int N1, const int N2, const int N3,
                        const int N4, const int N5) {
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
    // Lambda fill over an offset range
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                         Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      const int s0 = 1;
      const int s1 = 1;
      const int s2 = 1;
      const int s3 = 1;
      const int s4 = 1;
      const int s5 = 1;

      range_type range(point_type{{s0, s1, s2, s3, s4, s5}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{3, 3, 3, 3, 1, 1}});

      TestMDRange_6D::ViewType v("v", N0, N1, N2, N3, N4, N5);

      parallel_for(
          range,
          KOKKOS_LAMBDA(const int i, const int j, const int k, const int l,
                        const int m, const int n) { v(i, j, k, l, m, n) = 3; });

      TestMDRange_6D::HostViewType h_view = Kokkos::create_mirror_view(v);
      Kokkos::deep_copy(h_view, v);

      int counter = 0;
      for (int i = s0; i < N0; ++i)
        for (int j = s1; j < N1; ++j)
          for (int k = s2; k < N2; ++k)
            for (int l = s3; l < N3; ++l)
              for (int m = s4; m < N4; ++m)
                for (int n = s5; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 3) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf(
            "Offset Start + Default Layouts + InitTag op(): Errors in "
            "test_for6; mismatches = %d\n\n",
            counter);
      }

      ASSERT_EQ(counter, 0);
    }
#endif

    // Default policy, no explicit tile sizes
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>>;
      using point_type = typename range_type::point_type;

      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}});
      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m)
                for (int n = 0; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 1) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf("Defaults + No Tile: Errors in test_for6; mismatches = %d\n\n",
               counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // InitTag operator over an offset range
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                         Kokkos::IndexType<int>, InitTag>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

      int s0 = 1;
      int s1 = 1;
      int s2 = 1;
      int s3 = 1;
      int s4 = 1;
      int s5 = 1;
#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{s0, s1, s2, s3, s4, s5}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{3, 3, 3, 2, 2, 2}});
#else
      // tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
      range_type range(point_type{{s0, s1, s2, s3, s4, s5}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{3, 3, 3, 3, 2, 3}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = s0; i < N0; ++i)
        for (int j = s1; j < N1; ++j)
          for (int k = s2; k < N2; ++k)
            for (int l = s3; l < N3; ++l)
              for (int m = s4; m < N4; ++m)
                for (int n = s5; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 3) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf(
            "Offset Start + Defaults + InitTag op(): Errors in test_for6; "
            "mismatches = %d\n\n",
            counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Explicit tile sizes with default iteration order
    {
      using range_type =
          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                         Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 2, 2, 2, 2}});
#else
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 4, 2, 2, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m)
                for (int n = 0; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 1) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf(" Errors in test_for6; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Default / Iterate::Default
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 2, 2, 2, 2}});
#else
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 4, 2, 2, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m)
                for (int n = 0; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 1) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf(" Errors in test_for6; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Left / Iterate::Left
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Left>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 2, 2, 2, 2}});
#else
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 4, 2, 2, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m)
                for (int n = 0; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 1) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf(" Errors in test_for6; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Left / Iterate::Right
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Right>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 2, 2, 2, 2}});
#else
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 4, 2, 2, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m)
                for (int n = 0; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 1) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf(" Errors in test_for6; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Right / Iterate::Left
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Left>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 2, 2, 2, 2}});
#else
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 4, 2, 2, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m)
                for (int n = 0; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 1) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf(" Errors in test_for6; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }

    // Iterate::Right / Iterate::Right
    {
      using range_type = typename Kokkos::MDRangePolicy<
          ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Right>,
          Kokkos::IndexType<int>>;
      using tile_type  = typename range_type::tile_type;
      using point_type = typename range_type::point_type;

#ifdef KOKKOS_ENABLE_SYCL
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 2, 2, 2, 2}});
#else
      range_type range(point_type{{0, 0, 0, 0, 0, 0}},
                       point_type{{N0, N1, N2, N3, N4, N5}},
                       tile_type{{4, 4, 4, 2, 2, 2}});
#endif

      TestMDRange_6D functor(N0, N1, N2, N3, N4, N5);

      parallel_for(range, functor);

      HostViewType h_view = Kokkos::create_mirror_view(functor.input_view);
      Kokkos::deep_copy(h_view, functor.input_view);

      int counter = 0;
      for (int i = 0; i < N0; ++i)
        for (int j = 0; j < N1; ++j)
          for (int k = 0; k < N2; ++k)
            for (int l = 0; l < N3; ++l)
              for (int m = 0; m < N4; ++m)
                for (int n = 0; n < N5; ++n) {
                  if (h_view(i, j, k, l, m, n) != 1) {
                    ++counter;
                  }
                }

      if (counter != 0) {
        printf(" Errors in test_for6; mismatches = %d\n\n", counter);
      }

      ASSERT_EQ(counter, 0);
    }
  }
};
= 0; + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < N2; ++k) + for (int l = 0; l < N3; ++l) + for (int m = 0; m < N4; ++m) + for (int n = 0; n < N5; ++n) { + if (h_view(i, j, k, l, m, n) != 1) { + ++counter; + } + } + + if (counter != 0) { + printf(" Errors in test_for6; mismatches = %d\n\n", counter); + } + + ASSERT_EQ(counter, 0); + } + } +}; + +template <typename ExecSpace> +struct TestMDRange_2D_NegIdx { + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View<DataType **, ExecSpace>; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[2]; + + TestMDRange_2D_NegIdx(const DataType L0, const DataType L1, const DataType N0, + const DataType N1) + : input_view("input_view", N0 - L0, N1 - L1) { + lower_offset[0] = L0; + lower_offset[1] = L1; + } + + // When using negative indices, must offset View appropriately as views cannot + // take a negative index + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j) const { + input_view(i - lower_offset[0], j - lower_offset[1]) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, value_type &lsum) const { + lsum += input_view(i - lower_offset[0], j - lower_offset[1]) * 2; + } + + static void test_2D_negidx(const int N0, const int N1) { + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + const point_type lower{{-1, -1}}; + const point_type upper{{N0, N1}}; + const tile_type tile{{8, 8}}; + + range_type range(point_type{{lower[0], lower[1]}}, + point_type{{upper[0], upper[1]}}, + tile_type{{tile[0], tile[1]}}); + + TestMDRange_2D_NegIdx functor(lower[0], lower[1], upper[0], upper[1]); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + 
ASSERT_EQ(sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1])); + } + } +}; + +template <typename ExecSpace> +struct TestMDRange_3D_NegIdx { + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View<DataType ***, ExecSpace>; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[3]; + + TestMDRange_3D_NegIdx(const DataType L0, const DataType L1, const DataType L2, + const DataType N0, const DataType N1, const DataType N2) + : input_view("input_view", N0 - L0, N1 - L1, N2 - L2) { + lower_offset[0] = L0; + lower_offset[1] = L1; + lower_offset[2] = L2; + } + + // When using negative indices, must offset View appropriately as views cannot + // take a negative index + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k) const { + input_view(i - lower_offset[0], j - lower_offset[1], k - lower_offset[2]) = + 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, + value_type &lsum) const { + lsum += input_view(i - lower_offset[0], j - lower_offset[1], + k - lower_offset[2]) * + 2; + } + + static void test_3D_negidx(const int N0, const int N1, const int N2) { + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + const point_type lower{{-1, -1, -1}}; + const point_type upper{{N0, N1, N2}}; + const tile_type tile{{8, 8, 2}}; + + range_type range(point_type{{lower[0], lower[1], lower[2]}}, + point_type{{upper[0], upper[1], upper[2]}}, + tile_type{{tile[0], tile[1], tile[2]}}); + + TestMDRange_3D_NegIdx functor(lower[0], lower[1], lower[2], upper[0], + upper[1], upper[2]); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * + (upper[2] - lower[2])); + 
} + } +}; + +template <typename ExecSpace> +struct TestMDRange_4D_NegIdx { + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View<DataType ****, ExecSpace>; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[4]; + + TestMDRange_4D_NegIdx(const DataType L0, const DataType L1, const DataType L2, + const DataType L3, const DataType N0, const DataType N1, + const DataType N2, const DataType N3) + : input_view("input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3) { + lower_offset[0] = L0; + lower_offset[1] = L1; + lower_offset[2] = L2; + lower_offset[3] = L3; + } + + // When using negative indices, must offset View appropriately as views cannot + // take a negative index + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, const int l) const { + input_view(i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], + l - lower_offset[3]) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, const int l, + value_type &lsum) const { + lsum += input_view(i - lower_offset[0], j - lower_offset[1], + k - lower_offset[2], l - lower_offset[3]) * + 2; + } + + static void test_4D_negidx(const int N0, const int N1, const int N2, + const int N3) { + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + const point_type lower{{-1, -1, -1, -1}}; + const point_type upper{{N0, N1, N2, N3}}; + const tile_type tile{{8, 8, 2, 2}}; + + range_type range(point_type{{lower[0], lower[1], lower[2], lower[3]}}, + point_type{{upper[0], upper[1], upper[2], upper[3]}}, + tile_type{{tile[0], tile[1], tile[2], tile[3]}}); + + TestMDRange_4D_NegIdx functor(lower[0], lower[1], lower[2], lower[3], + upper[0], upper[1], upper[2], upper[3]); + + parallel_for(range, functor); + double 
sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * + (upper[2] - lower[2]) * (upper[3] - lower[3])); + } + } +}; + +template <typename ExecSpace> +struct TestMDRange_5D_NegIdx { + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View<DataType *****, ExecSpace>; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[5]; + + TestMDRange_5D_NegIdx(const DataType L0, const DataType L1, const DataType L2, + const DataType L3, const DataType L4, const DataType N0, + const DataType N1, const DataType N2, const DataType N3, + const DataType N4) + : input_view("input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4) { + lower_offset[0] = L0; + lower_offset[1] = L1; + lower_offset[2] = L2; + lower_offset[3] = L3; + lower_offset[4] = L4; + } + + // When using negative indices, must offset View appropriately as views cannot + // take a negative index + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, const int l, + const int m) const { + input_view(i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], + l - lower_offset[3], m - lower_offset[4]) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, const int l, + const int m, value_type &lsum) const { + lsum += input_view(i - lower_offset[0], j - lower_offset[1], + k - lower_offset[2], l - lower_offset[3], + m - lower_offset[4]) * + 2; + } + + static void test_5D_negidx(const int N0, const int N1, const int N2, + const int N3, const int N4) { + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<5>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + const point_type lower{{-1, -1, -1, -1, -1}}; + const point_type upper{{N0, N1, N2, N3, N4}}; + const tile_type tile{{8, 4, 2, 2, 
2}}; + + range_type range( + point_type{{lower[0], lower[1], lower[2], lower[3], lower[4]}}, + point_type{{upper[0], upper[1], upper[2], upper[3], upper[4]}}, + tile_type{{tile[0], tile[1], tile[2], tile[3], tile[4]}}); + + TestMDRange_5D_NegIdx functor(lower[0], lower[1], lower[2], lower[3], + lower[4], upper[0], upper[1], upper[2], + upper[3], upper[4]); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * + (upper[2] - lower[2]) * (upper[3] - lower[3]) * + (upper[4] - lower[4])); + } + } +}; + +template <typename ExecSpace> +struct TestMDRange_6D_NegIdx { + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View<DataType ******, ExecSpace>; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[6]; + + TestMDRange_6D_NegIdx(const DataType L0, const DataType L1, const DataType L2, + const DataType L3, const DataType L4, const DataType L5, + const DataType N0, const DataType N1, const DataType N2, + const DataType N3, const DataType N4, const DataType N5) + : input_view("input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4, + N5 - L5) { + lower_offset[0] = L0; + lower_offset[1] = L1; + lower_offset[2] = L2; + lower_offset[3] = L3; + lower_offset[4] = L4; + lower_offset[5] = L5; + } + + // When using negative indices, must offset View appropriately as views cannot + // take a negative index + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, const int l, + const int m, const int n) const { + input_view(i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], + l - lower_offset[3], m - lower_offset[4], n - lower_offset[5]) = + 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, const int l, + const int m, const int n, value_type &lsum) const { + lsum += input_view(i - lower_offset[0], j - 
lower_offset[1], + k - lower_offset[2], l - lower_offset[3], + m - lower_offset[4], n - lower_offset[5]) * + 2; + } + + static void test_6D_negidx(const int N0, const int N1, const int N2, + const int N3, const int N4, const int N5) { + { + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + const point_type lower{{-1, -1, -1, -1, -1, -1}}; + const point_type upper{{N0, N1, N2, N3, N4, N5}}; + const tile_type tile{{8, 4, 2, 2, 2, 1}}; + + range_type range( + point_type{ + {lower[0], lower[1], lower[2], lower[3], lower[4], lower[5]}}, + point_type{ + {upper[0], upper[1], upper[2], upper[3], upper[4], upper[5]}}, + tile_type{{tile[0], tile[1], tile[2], tile[3], tile[4], tile[5]}}); + + TestMDRange_6D_NegIdx functor(lower[0], lower[1], lower[2], lower[3], + lower[4], lower[5], upper[0], upper[1], + upper[2], upper[3], upper[4], upper[5]); + + parallel_for(range, functor); + double sum = 0.0; + parallel_reduce(range, functor, sum); + + ASSERT_EQ(sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * + (upper[2] - lower[2]) * (upper[3] - lower[3]) * + (upper[4] - lower[4]) * (upper[5] - lower[5])); + } + } +}; + +template <typename ExecSpace> +struct TestMDRange_ReduceScalar { + struct Scalar { + double v[4]; + KOKKOS_INLINE_FUNCTION + Scalar() { + for (int i = 0; i < 4; i++) v[i] = 0; + } + + KOKKOS_INLINE_FUNCTION + Scalar(const Scalar &src) { + for (int i = 0; i < 4; i++) v[i] = src.v[i]; + } + KOKKOS_INLINE_FUNCTION + void operator=(const Scalar &src) { + for (int i = 0; i < 4; i++) v[i] = src.v[i]; + } + KOKKOS_INLINE_FUNCTION + void operator+=(const Scalar &src) { + for (int i = 0; i < 4; i++) v[i] += src.v[i]; + } + KOKKOS_INLINE_FUNCTION + void operator=(const volatile Scalar &src) volatile { + for (int i = 0; i < 4; i++) v[i] = src.v[i]; + } + KOKKOS_INLINE_FUNCTION + void operator+=(const 
volatile Scalar &src) volatile { + for (int i = 0; i < 4; i++) v[i] += src.v[i]; + } + }; + + static void test_scalar_reduce(const int N0, const int N1) { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Scalar sum; + using range_type = + typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, + Kokkos::IndexType<int>>; + using tile_type = typename range_type::tile_type; + using point_type = typename range_type::point_type; + + range_type range(point_type{{0, 0}}, point_type{{N0, N1}}, + tile_type{{3, 3}}); + + parallel_reduce( + range, + KOKKOS_LAMBDA(int, int, Scalar &lsum) { + for (int i = 0; i < 4; i++) lsum.v[i]++; + }, + sum); + for (int i = 0; i < 4; i++) ASSERT_EQ(sum.v[i], N0 * N1); +#else + std::ignore = N0; + std::ignore = N1; +#endif + } +}; + +} // namespace + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestMDRange_a.hpp b/packages/kokkos/core/unit_test/TestMDRange_a.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0f2abd6d65e921bf07b512984b17ac3d5f5fe67c --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDRange_a.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestMDRange.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, mdrange_5d) { +// FIXME_OPENMPTARGET requires MDRange parallel_reduce +#ifndef KOKKOS_ENABLE_OPENMPTARGET + TestMDRange_5D<TEST_EXECSPACE>::test_reduce5(100, 10, 10, 10, 5); +#endif + TestMDRange_5D<TEST_EXECSPACE>::test_for5(100, 10, 10, 10, 5); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestMDRange_b.hpp b/packages/kokkos/core/unit_test/TestMDRange_b.hpp new file mode 100644 index 0000000000000000000000000000000000000000..85410d5c27fa6ba60c5d8034efa0d30bb1f6db7a --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDRange_b.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestMDRange.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, mdrange_6d) { + TestMDRange_6D<TEST_EXECSPACE>::test_for6(10, 10, 10, 10, 5, 5); +#ifndef KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET requires MDRange parallel_reduce + TestMDRange_6D<TEST_EXECSPACE>::test_reduce6(100, 10, 10, 10, 5, 5); +#endif +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestMDRange_c.hpp b/packages/kokkos/core/unit_test/TestMDRange_c.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9f597ec54b5777fe1df4f7e831c20e9eb1eab38d --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDRange_c.hpp @@ -0,0 +1,64 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestMDRange.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, mdrange_2d) { +// FIXME_OPENMPTARGET requires MDRange parallel_reduce +#ifndef KOKKOS_ENABLE_OPENMPTARGET + TestMDRange_2D<TEST_EXECSPACE>::test_reduce2(100, 100); +#endif + TestMDRange_2D<TEST_EXECSPACE>::test_for2(100, 100); +} + +#ifndef KOKKOS_ENABLE_OPENMPTARGET +TEST(TEST_CATEGORY, mdrange_array_reduce) { + TestMDRange_ReduceArray_2D<TEST_EXECSPACE>::test_arrayreduce2(4, 5); + TestMDRange_ReduceArray_3D<TEST_EXECSPACE>::test_arrayreduce3(4, 5, 10); +} +#endif + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestMDRange_d.hpp b/packages/kokkos/core/unit_test/TestMDRange_d.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5ca57ccf483710bdfb7907bcd4e10d03d13ecc39 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDRange_d.hpp @@ -0,0 +1,69 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestMDRange.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, mdrange_3d) { + TestMDRange_3D<TEST_EXECSPACE>::test_for3(1, 10, 100); + TestMDRange_3D<TEST_EXECSPACE>::test_for3(100, 10, 100); +#ifndef KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET requires MDRange parallel_reduce + TestMDRange_3D<TEST_EXECSPACE>::test_reduce3(1, 10, 100); + TestMDRange_3D<TEST_EXECSPACE>::test_reduce3(100, 10, 100); +#endif +} + +#ifndef KOKKOS_ENABLE_OPENMPTARGET +TEST(TEST_CATEGORY, mdrange_neg_idx) { + TestMDRange_2D_NegIdx<TEST_EXECSPACE>::test_2D_negidx(128, 32); + TestMDRange_3D_NegIdx<TEST_EXECSPACE>::test_3D_negidx(128, 32, 8); + TestMDRange_4D_NegIdx<TEST_EXECSPACE>::test_4D_negidx(128, 32, 8, 8); + TestMDRange_5D_NegIdx<TEST_EXECSPACE>::test_5D_negidx(128, 32, 8, 8, 4); + TestMDRange_6D_NegIdx<TEST_EXECSPACE>::test_6D_negidx(128, 32, 8, 8, 4, 2); +} +#endif + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestMDRange_e.hpp b/packages/kokkos/core/unit_test/TestMDRange_e.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b9754e63d56bacb497fec4f932eb348c38f6c79f --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDRange_e.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestMDRange.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, mdrange_4d) { +// FIXME_OPENMPTARGET requires MDRange parallel_reduce +#ifndef KOKKOS_ENABLE_OPENMPTARGET + TestMDRange_4D<TEST_EXECSPACE>::test_reduce4(100, 10, 10, 10); +#endif + TestMDRange_4D<TEST_EXECSPACE>::test_for4(100, 10, 10, 10); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestMDRange_f.hpp b/packages/kokkos/core/unit_test/TestMDRange_f.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2cef1324d7c75059dfa50417d940bd7bf40a9763 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDRange_f.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestMDRange.hpp> + +namespace Test { + +// FIXME_OPENMPTARGET requires MDRange parallel_reduce +#ifndef KOKKOS_ENABLE_OPENMPTARGET +TEST(TEST_CATEGORY, mdrange_scalar) { + TestMDRange_ReduceScalar<TEST_EXECSPACE>::test_scalar_reduce(12, 11); +} +#endif + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..777f91aea3e560981d5dde05767f1726d8a1542f --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp @@ -0,0 +1,871 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <algorithm> +#include <initializer_list> +#include <type_traits> +#include "Kokkos_ExecPolicy.hpp" +#include "Kokkos_Parallel_Reduce.hpp" + +#include <cfloat> + +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) +#else +#define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS +#endif + +// clang-format off +template <class> +struct math_unary_function_return_type; +// Floating-point types +template <> struct math_unary_function_return_type< float> { using type = float; }; +template <> struct math_unary_function_return_type< double> { using type = double; }; +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS +template <> struct math_unary_function_return_type<long double> { using type = long double; }; +#endif +// Integral types +template <> struct math_unary_function_return_type< bool> { using type = double; }; +template <> struct math_unary_function_return_type< short> { using type = double; }; +template <> struct math_unary_function_return_type< unsigned short> { using type = double; }; +template <> struct math_unary_function_return_type< int> { using type = double; }; +template <> struct math_unary_function_return_type< unsigned int> { using type = double; }; +template <> struct math_unary_function_return_type< long> { using type = double; }; +template <> struct math_unary_function_return_type< unsigned long> { using type = double; }; +template <> struct math_unary_function_return_type< long long> { using type = double; }; +template <> struct math_unary_function_return_type<unsigned long long> { using type = double; }; +template <class T> +using math_unary_function_return_type_t = typename math_unary_function_return_type<T>::type; +template <class, class> +struct 
math_binary_function_return_type; +template <> struct math_binary_function_return_type< float, float> { using type = float; }; +template <> struct math_binary_function_return_type< float, double> { using type = double; }; +template <> struct math_binary_function_return_type< float, bool> { using type = double; }; +template <> struct math_binary_function_return_type< float, short> { using type = double; }; +template <> struct math_binary_function_return_type< float, int> { using type = double; }; +template <> struct math_binary_function_return_type< float, long> { using type = double; }; +template <> struct math_binary_function_return_type< float, long long> { using type = double; }; +template <> struct math_binary_function_return_type< float, unsigned short> { using type = double; }; +template <> struct math_binary_function_return_type< float, unsigned int> { using type = double; }; +template <> struct math_binary_function_return_type< float, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type< float, unsigned long long> { using type = double; }; +template <> struct math_binary_function_return_type< double, float> { using type = double; }; +template <> struct math_binary_function_return_type< double, double> { using type = double; }; +template <> struct math_binary_function_return_type< double, bool> { using type = double; }; +template <> struct math_binary_function_return_type< double, short> { using type = double; }; +template <> struct math_binary_function_return_type< double, int> { using type = double; }; +template <> struct math_binary_function_return_type< double, long> { using type = double; }; +template <> struct math_binary_function_return_type< double, long long> { using type = double; }; +template <> struct math_binary_function_return_type< double, unsigned short> { using type = double; }; +template <> struct math_binary_function_return_type< double, unsigned int> { using type = double; }; +template <> struct 
math_binary_function_return_type< double, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type< double, unsigned long long> { using type = double; }; +template <> struct math_binary_function_return_type< short, float> { using type = double; }; +template <> struct math_binary_function_return_type< short, double> { using type = double; }; +template <> struct math_binary_function_return_type< short, bool> { using type = double; }; +template <> struct math_binary_function_return_type< short, short> { using type = double; }; +template <> struct math_binary_function_return_type< short, int> { using type = double; }; +template <> struct math_binary_function_return_type< short, long> { using type = double; }; +template <> struct math_binary_function_return_type< short, long long> { using type = double; }; +template <> struct math_binary_function_return_type< short, unsigned short> { using type = double; }; +template <> struct math_binary_function_return_type< short, unsigned int> { using type = double; }; +template <> struct math_binary_function_return_type< short, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type< short, unsigned long long> { using type = double; }; +template <> struct math_binary_function_return_type< int, float> { using type = double; }; +template <> struct math_binary_function_return_type< int, double> { using type = double; }; +template <> struct math_binary_function_return_type< int, bool> { using type = double; }; +template <> struct math_binary_function_return_type< int, short> { using type = double; }; +template <> struct math_binary_function_return_type< int, int> { using type = double; }; +template <> struct math_binary_function_return_type< int, long> { using type = double; }; +template <> struct math_binary_function_return_type< int, long long> { using type = double; }; +template <> struct math_binary_function_return_type< int, unsigned short> { using type = 
double; }; +template <> struct math_binary_function_return_type< int, unsigned int> { using type = double; }; +template <> struct math_binary_function_return_type< int, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type< int, unsigned long long> { using type = double; }; +template <> struct math_binary_function_return_type< long, float> { using type = double; }; +template <> struct math_binary_function_return_type< long, double> { using type = double; }; +template <> struct math_binary_function_return_type< long, bool> { using type = double; }; +template <> struct math_binary_function_return_type< long, short> { using type = double; }; +template <> struct math_binary_function_return_type< long, int> { using type = double; }; +template <> struct math_binary_function_return_type< long, long> { using type = double; }; +template <> struct math_binary_function_return_type< long, long long> { using type = double; }; +template <> struct math_binary_function_return_type< long, unsigned short> { using type = double; }; +template <> struct math_binary_function_return_type< long, unsigned int> { using type = double; }; +template <> struct math_binary_function_return_type< long, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type< long, unsigned long long> { using type = double; }; +template <> struct math_binary_function_return_type< long long, float> { using type = double; }; +template <> struct math_binary_function_return_type< long long, double> { using type = double; }; +template <> struct math_binary_function_return_type< long long, bool> { using type = double; }; +template <> struct math_binary_function_return_type< long long, short> { using type = double; }; +template <> struct math_binary_function_return_type< long long, int> { using type = double; }; +template <> struct math_binary_function_return_type< long long, long> { using type = double; }; +template <> struct 
math_binary_function_return_type< long long, long long> { using type = double; }; +template <> struct math_binary_function_return_type< long long, unsigned short> { using type = double; }; +template <> struct math_binary_function_return_type< long long, unsigned int> { using type = double; }; +template <> struct math_binary_function_return_type< long long, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type< long long, unsigned long long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, float> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, double> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, bool> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, short> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, int> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, long long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, unsigned short> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, unsigned int> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned short, unsigned long long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, float> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, double> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, bool> { using type = double; }; +template <> 
struct math_binary_function_return_type< unsigned int, short> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, int> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, long long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, unsigned short> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, unsigned int> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned int, unsigned long long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, float> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, double> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, bool> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, short> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, int> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, long long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, unsigned short> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, unsigned int> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type< unsigned long, unsigned long long> { using type = double; }; 
+template <> struct math_binary_function_return_type<unsigned long long, float> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, double> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, bool> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, short> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, int> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, long> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, long long> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, unsigned short> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, unsigned int> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, unsigned long> { using type = double; }; +template <> struct math_binary_function_return_type<unsigned long long, unsigned long long> { using type = double; }; +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS +template <> struct math_binary_function_return_type< float, long double> { using type = long double; }; +template <> struct math_binary_function_return_type< double, long double> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, float> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, double> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, long double> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, bool> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, short> { using type = long 
double; }; +template <> struct math_binary_function_return_type< long double, int> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, long> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, long long> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, unsigned short> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, unsigned int> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, unsigned long> { using type = long double; }; +template <> struct math_binary_function_return_type< long double, unsigned long long> { using type = long double; }; +template <> struct math_binary_function_return_type< short, long double> { using type = long double; }; +template <> struct math_binary_function_return_type< int, long double> { using type = long double; }; +template <> struct math_binary_function_return_type< long, long double> { using type = long double; }; +template <> struct math_binary_function_return_type< long long, long double> { using type = long double; }; +template <> struct math_binary_function_return_type< unsigned short, long double> { using type = long double; }; +template <> struct math_binary_function_return_type< unsigned int, long double> { using type = long double; }; +template <> struct math_binary_function_return_type< unsigned long, long double> { using type = long double; }; +template <> struct math_binary_function_return_type<unsigned long long, long double> { using type = long double; }; +#endif +template <class T, class U> +using math_binary_function_return_type_t = typename math_binary_function_return_type<T, U>::type; +// clang-format on + +struct FloatingPointComparison { + private: + template <class T> + KOKKOS_FUNCTION double eps(T) const { + return DBL_EPSILON; + } + KOKKOS_FUNCTION + double eps(float) const { 
return FLT_EPSILON; }
  KOKKOS_FUNCTION
  double eps(long double) const { return LDBL_EPSILON; }

  // Using absolute here instead of abs, since we actually test abs ...
  template <class T>
  KOKKOS_FUNCTION typename std::enable_if<std::is_signed<T>::value, T>::type
  absolute(T val) const {
    return val < T(0) ? -val : val;
  }

  template <class T>
  KOKKOS_FUNCTION typename std::enable_if<!std::is_signed<T>::value, T>::type
  absolute(T val) const {
    return val;
  }

 public:
  // Checks that |fpv| is below an absolute tolerance of eps * ulp.  Used when
  // one of the compared values is exactly zero, where a relative comparison
  // would be meaningless.
  template <class FPT>
  KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, double ulp) const {
    auto abs_tol = eps(fpv) * ulp;

    bool ar = absolute(fpv) < abs_tol;
    if (!ar) {
      // printf is not available on these device backends
#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
      printf("absolute value exceeds tolerance [|%e| > %e]\n", (double)fpv,
             abs_tol);
#endif
    }

    return ar;
  }

  // Relative comparison of lhs and rhs within ulp units of the smaller of the
  // two epsilons; falls back to compare_near_zero when either side is zero.
  // The relative difference is measured against the smaller magnitude.
  template <class Lhs, class Rhs>
  KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs,
                               double ulp) const {
    if (lhs == 0) {
      return compare_near_zero(rhs, ulp);
    } else if (rhs == 0) {
      return compare_near_zero(lhs, ulp);
    } else {
      auto rel_tol = (eps(lhs) < eps(rhs) ? eps(lhs) : eps(rhs)) * ulp;
      double abs_diff = static_cast<double>(rhs > lhs ? rhs - lhs : lhs - rhs);
      double min_denom = static_cast<double>(
          absolute(rhs) < absolute(lhs) ? absolute(rhs) : absolute(lhs));
      double rel_diff = abs_diff / min_denom;
      bool ar = rel_diff < rel_tol;
      if (!ar) {
#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
        printf("relative difference exceeds tolerance [%e > %e]\n",
               (double)rel_diff, rel_tol);
#endif
      }

      return ar;
    }
  }
};

// Maps a Math{Unary,Binary}Function_* tag type to the printable name of the
// math function it wraps; specialized by the DEFINE_*_FUNCTION_EVAL macros.
template <class>
struct math_function_name;

// Generates a tag type wrapping math function FUNC with:
//  - eval:     calls the Kokkos::Experimental device-callable overload,
//  - eval_std: calls the std:: reference implementation on the host,
// both statically checked against the expected promoted return type, plus the
// per-function ULP tolerance used by the comparison above.
#define DEFINE_UNARY_FUNCTION_EVAL(FUNC, ULP_FACTOR)                           \
  struct MathUnaryFunction_##FUNC {                                            \
    template <typename T>                                                      \
    static KOKKOS_FUNCTION auto eval(T x) {                                    \
      static_assert(std::is_same<decltype(Kokkos::Experimental::FUNC((T)0)),   \
                                 math_unary_function_return_type_t<T>>::value, \
                    "");                                                       \
      return Kokkos::Experimental::FUNC(x);                                    \
    }                                                                          \
    template <typename T>                                                      \
    static auto eval_std(T x) {                                                \
      static_assert(std::is_same<decltype(std::FUNC((T)0)),                    \
                                 math_unary_function_return_type_t<T>>::value, \
                    "");                                                       \
      return std::FUNC(x);                                                     \
    }                                                                          \
    static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; }          \
  };                                                                           \
  using kk_##FUNC = MathUnaryFunction_##FUNC;                                  \
  template <>                                                                  \
  struct math_function_name<MathUnaryFunction_##FUNC> {                        \
    static constexpr char name[] = #FUNC;                                      \
  };                                                                           \
  constexpr char math_function_name<MathUnaryFunction_##FUNC>::name[]

// Generally the expected ULP error should come from here:
// https://www.gnu.org/software/libc/manual/html_node/Errors-in-Math-Functions.html
// For now 1s largely seem to work ...
DEFINE_UNARY_FUNCTION_EVAL(exp, 2);
DEFINE_UNARY_FUNCTION_EVAL(exp2, 2);
DEFINE_UNARY_FUNCTION_EVAL(expm1, 2);
DEFINE_UNARY_FUNCTION_EVAL(log, 2);
DEFINE_UNARY_FUNCTION_EVAL(log10, 2);
DEFINE_UNARY_FUNCTION_EVAL(log2, 2);
DEFINE_UNARY_FUNCTION_EVAL(log1p, 2);

DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2);
DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2);

DEFINE_UNARY_FUNCTION_EVAL(sin, 2);
DEFINE_UNARY_FUNCTION_EVAL(cos, 2);
DEFINE_UNARY_FUNCTION_EVAL(tan, 2);
DEFINE_UNARY_FUNCTION_EVAL(asin, 2);
DEFINE_UNARY_FUNCTION_EVAL(acos, 2);
DEFINE_UNARY_FUNCTION_EVAL(atan, 2);

DEFINE_UNARY_FUNCTION_EVAL(sinh, 2);
DEFINE_UNARY_FUNCTION_EVAL(cosh, 2);
DEFINE_UNARY_FUNCTION_EVAL(tanh, 2);
DEFINE_UNARY_FUNCTION_EVAL(asinh, 4);
DEFINE_UNARY_FUNCTION_EVAL(acosh, 2);
DEFINE_UNARY_FUNCTION_EVAL(atanh, 2);

DEFINE_UNARY_FUNCTION_EVAL(erf, 2);
DEFINE_UNARY_FUNCTION_EVAL(erfc, 5);
// has a larger error due to some impls doing integer exact.
// We cast always to double leading to larger difference when comparing our
// tgamma to std::tgamma on the host.
DEFINE_UNARY_FUNCTION_EVAL(tgamma, 200);
DEFINE_UNARY_FUNCTION_EVAL(lgamma, 2);

DEFINE_UNARY_FUNCTION_EVAL(ceil, 2);
DEFINE_UNARY_FUNCTION_EVAL(floor, 2);
DEFINE_UNARY_FUNCTION_EVAL(trunc, 2);
// nearbyint is skipped for the SYCL backend (same guard appears in the test
// bodies below)
#ifndef KOKKOS_ENABLE_SYCL
DEFINE_UNARY_FUNCTION_EVAL(nearbyint, 2);
#endif

#undef DEFINE_UNARY_FUNCTION_EVAL

// Two-argument counterpart of DEFINE_UNARY_FUNCTION_EVAL: wraps FUNC(x, y)
// with the device (Kokkos::Experimental) and host-reference (std) overloads,
// statically checked against the expected promoted return type.
#define DEFINE_BINARY_FUNCTION_EVAL(FUNC, ULP_FACTOR)                 \
  struct MathBinaryFunction_##FUNC {                                  \
    template <typename T, typename U>                                 \
    static KOKKOS_FUNCTION auto eval(T x, U y) {                      \
      static_assert(                                                  \
          std::is_same<decltype(Kokkos::Experimental::FUNC((T)0, (U)0)), \
                       math_binary_function_return_type_t<T, U>>::value, \
          "");                                                        \
      return Kokkos::Experimental::FUNC(x, y);                        \
    }                                                                 \
    template <typename T, typename U>                                 \
    static auto eval_std(T x, U y) {                                  \
      static_assert(                                                  \
          std::is_same<decltype(std::FUNC((T)0, (U)0)),               \
                       math_binary_function_return_type_t<T, U>>::value, \
          "");                                                        \
      return std::FUNC(x, y);                                         \
    }                                                                 \
    static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \
  };                                                                  \
  using kk_##FUNC = MathBinaryFunction_##FUNC;                        \
  template <>                                                         \
  struct math_function_name<MathBinaryFunction_##FUNC> {              \
    static constexpr char name[] = #FUNC;                             \
  };                                                                  \
  constexpr char math_function_name<MathBinaryFunction_##FUNC>::name[]

DEFINE_BINARY_FUNCTION_EVAL(pow, 2);
DEFINE_BINARY_FUNCTION_EVAL(hypot, 2);

#undef DEFINE_BINARY_FUNCTION_EVAL

// clang-format off
// Printable name for each tested argument type, used in the progress output.
template <class>
struct type_helper;
#define DEFINE_TYPE_NAME(T) \
template <> struct type_helper<T> { static char const * name() { return #T; } };
DEFINE_TYPE_NAME(bool)
DEFINE_TYPE_NAME(int)
DEFINE_TYPE_NAME(long)
DEFINE_TYPE_NAME(long long)
DEFINE_TYPE_NAME(unsigned int)
DEFINE_TYPE_NAME(unsigned long)
DEFINE_TYPE_NAME(unsigned long long)
DEFINE_TYPE_NAME(float)
DEFINE_TYPE_NAME(double)
DEFINE_TYPE_NAME(long double)
#undef DEFINE_TYPE_NAME
// clang-format on

template <class Space, class Func, class Arg, std::size_t N,
          class Ret =
math_unary_function_return_type_t<Arg>>
// Functor that evaluates Func on N sample values in execution space Space and
// compares the device results against the std:: reference results computed on
// the host.  The comparison runs inside a parallel_reduce counting mismatches.
struct TestMathUnaryFunction : FloatingPointComparison {
  Arg val_[N];  // sample inputs, copied by value into the device functor
  Ret res_[N];  // host reference results from Func::eval_std
  TestMathUnaryFunction(const Arg (&val)[N]) {
    std::cout << math_function_name<Func>::name << "("
              << type_helper<Arg>::name() << ")\n";
    std::copy(val, val + N, val_);
    std::transform(val, val + N, res_,
                   [](auto x) { return Func::eval_std(x); });
    run();
  }
  void run() {
    int errors = 0;
    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, N), *this, errors);
    ASSERT_EQ(errors, 0);
  }
  KOKKOS_FUNCTION void operator()(int i, int& e) const {
    bool ar = compare(Func::eval(val_[i]), res_[i], Func::ulp_factor());
    if (!ar) {
      ++e;
#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
      printf("value at %f which is %f was expected to be %f\n", (double)val_[i],
             (double)Func::eval(val_[i]), (double)res_[i]);
#endif
    }
  }
};

// Runs TestMathUnaryFunction once per function in the Func pack over the same
// input array.
template <class Space, class... Func, class Arg, std::size_t N>
void do_test_math_unary_function(const Arg (&x)[N]) {
  (void)std::initializer_list<int>{
      (TestMathUnaryFunction<Space, Func, Arg, N>(x), 0)...};
}

#define TEST_MATH_FUNCTION(FUNC) \
  do_test_math_unary_function<TEST_EXECSPACE, MathUnaryFunction_##FUNC>

// Binary counterpart of TestMathUnaryFunction: a single (val1, val2) pair is
// evaluated on the device and compared to the host std:: reference.
template <class Space, class Func, class Arg1, class Arg2,
          class Ret = math_binary_function_return_type_t<Arg1, Arg2>>
struct TestMathBinaryFunction : FloatingPointComparison {
  Arg1 val1_;
  Arg2 val2_;
  Ret res_;  // host reference result
  TestMathBinaryFunction(Arg1 val1, Arg2 val2)
      : val1_(val1), val2_(val2), res_(Func::eval_std(val1, val2)) {
    std::cout << math_function_name<Func>::name << "("
              << type_helper<Arg1>::name() << ", " << type_helper<Arg2>::name()
              << ")\n";
    run();
  }
  void run() {
    int errors = 0;
    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
    ASSERT_EQ(errors, 0);
  }
  KOKKOS_FUNCTION void operator()(int, int& e) const {
    bool ar = compare(Func::eval(val1_, val2_), res_, Func::ulp_factor());
    if (!ar) {
      ++e;
#if !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_HIP)
      printf("value at %f, %f which is %f was expected to be %f\n",
             (double)val1_, (double)val2_, (double)Func::eval(val1_, val2_),
             (double)res_);
#endif
    }
  }
};

// Runs TestMathBinaryFunction once per function in the Func pack on the same
// argument pair.
template <class Space, class... Func, class Arg1, class Arg2>
void do_test_math_binary_function(Arg1 arg1, Arg2 arg2) {
  (void)std::initializer_list<int>{
      (TestMathBinaryFunction<Space, Func, Arg1, Arg2>(arg1, arg2), 0)...};
}

TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
  TEST_MATH_FUNCTION(sin)({true, false});
  TEST_MATH_FUNCTION(sin)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(sin)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(sin)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(sin)({2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(sin)({2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(sin)({2ull, 3ull, 4ull, 5ull, 6ull});
  TEST_MATH_FUNCTION(sin)({.1f, .2f, .3f});
  TEST_MATH_FUNCTION(sin)({.4, .5, .6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(sin)({.7l, .8l, .9l});
#endif

  TEST_MATH_FUNCTION(cos)({true, false});
  TEST_MATH_FUNCTION(cos)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(cos)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(cos)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(cos)({2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(cos)({2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(cos)({2ull, 3ull, 4ull, 5ull, 6ull});
  TEST_MATH_FUNCTION(cos)({.1f, .2f, .3f});
  TEST_MATH_FUNCTION(cos)({.4, .5, .6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(cos)({.7l, .8l, .9l});
#endif

  TEST_MATH_FUNCTION(tan)({true, false});
  TEST_MATH_FUNCTION(tan)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(tan)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(tan)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(tan)({2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(tan)({2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(tan)({2ull, 3ull, 4ull,
5ull, 6ull});
  TEST_MATH_FUNCTION(tan)({.1f, .2f, .3f});
  TEST_MATH_FUNCTION(tan)({.4, .5, .6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(tan)({.7l, .8l, .9l});
#endif

  // inverse trigonometric functions are only sampled inside their domains
  TEST_MATH_FUNCTION(asin)({true, false});
  TEST_MATH_FUNCTION(asin)({-1, 0, 1});
  TEST_MATH_FUNCTION(asin)({-1l, 0l, 1l});
  TEST_MATH_FUNCTION(asin)({-1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(asin)({0u, 1u});
  TEST_MATH_FUNCTION(asin)({0ul, 1ul});
  TEST_MATH_FUNCTION(asin)({0ull, 1ull});
  TEST_MATH_FUNCTION(asin)({-1.f, .9f, -.8f, .7f, -.6f});
  TEST_MATH_FUNCTION(asin)({-.5, .4, -.3, .2, -.1, 0.});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(asin)({-.5l, .3l, 0.l, .2l, .4l, .6l});
#endif

  TEST_MATH_FUNCTION(acos)({true, false});
  TEST_MATH_FUNCTION(acos)({-1, 0, 1});
  TEST_MATH_FUNCTION(acos)({-1l, 0l, 1l});
  TEST_MATH_FUNCTION(acos)({-1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(acos)({0u, 1u});
  TEST_MATH_FUNCTION(acos)({0ul, 1ul});
  TEST_MATH_FUNCTION(acos)({0ull, 1ull});
  TEST_MATH_FUNCTION(acos)({-1.f, .9f, -.8f, .7f, -.6f});
  TEST_MATH_FUNCTION(acos)({-.5, .4, -.3, .2, -.1, 0.});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(acos)({-.5l, .3l, 0.l, .2l, .4l, .6l});
#endif

  TEST_MATH_FUNCTION(atan)({true, false});
  TEST_MATH_FUNCTION(atan)({-1, 0, 1});
  TEST_MATH_FUNCTION(atan)({-1l, 0l, 1l});
  TEST_MATH_FUNCTION(atan)({-1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(atan)({0u, 1u});
  TEST_MATH_FUNCTION(atan)({0ul, 1ul});
  TEST_MATH_FUNCTION(atan)({0ull, 1ull});
  TEST_MATH_FUNCTION(atan)({-1.5f, 1.3f, -1.1f, .9f, -.7f, .5f});
  TEST_MATH_FUNCTION(atan)({1.4, -1.2, 1., -.8, .6, -.4, .2, -0.});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(atan)({-.98l, .67l, -54.l, .34l, -.21l});
#endif

  // TODO atan2
}

TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
  TEST_MATH_FUNCTION(sqrt)({0, 1, 2, 3, 5, 7, 11});
  TEST_MATH_FUNCTION(sqrt)({0l, 1l, 2l, 3l, 5l, 7l, 11l});
  TEST_MATH_FUNCTION(sqrt)({0ll, 1ll, 2ll, 3ll, 5ll, 7ll, 11ll});
  TEST_MATH_FUNCTION(sqrt)({0u, 1u, 2u, 3u, 5u, 7u});
  TEST_MATH_FUNCTION(sqrt)({0ul, 1ul, 2ul, 3ul, 5ul, 7ul});
  TEST_MATH_FUNCTION(sqrt)({0ull, 1ull, 2ull, 3ull, 5ull, 7ull});
  TEST_MATH_FUNCTION(sqrt)({10.f, 20.f, 30.f, 40.f});
  TEST_MATH_FUNCTION(sqrt)({11.1, 22.2, 33.3, 44.4});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(sqrt)({10.l, 20.l, 30.l, 40.l});
#endif

  // cbrt accepts negative arguments (unlike sqrt)
  TEST_MATH_FUNCTION(cbrt)({-5, -3, -1, 2, 4, 6});
  TEST_MATH_FUNCTION(cbrt)({-5l, -3l, -1l, 2l, 4l, 6l});
  TEST_MATH_FUNCTION(cbrt)({-5ll, -3ll, -1ll, 2ll, 4ll, 6ll});
  TEST_MATH_FUNCTION(cbrt)({0u, 1u, 2u, 3u, 4u, 5u});
  TEST_MATH_FUNCTION(cbrt)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
  TEST_MATH_FUNCTION(cbrt)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
  TEST_MATH_FUNCTION(cbrt)({-1.f, .2f, -3.f, .4f, -5.f});
  TEST_MATH_FUNCTION(cbrt)({11.1, -2.2, 33.3, -4.4, 55.5});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(cbrt)({-10.l, 20.l, -30.l, 40.l, -50.l});
#endif

  do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2.f, 3.f);
  do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2., 3.);
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  do_test_math_binary_function<TEST_EXECSPACE, kk_pow>(2.l, 3.l);
#endif

  do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.f, 3.f);
  do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2., 3.);
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
#if !(defined(KOKKOS_ARCH_POWER8) || defined(KOKKOS_ARCH_POWER9))  // FIXME
  do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.l, 3.l);
#endif
#endif
}

TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
  TEST_MATH_FUNCTION(exp)({-9, -8, -7, -6, -5, 4, 3, 2, 1, 0});
  TEST_MATH_FUNCTION(exp)({-9l, -8l, -7l, -6l, -5l, 4l, 3l, 2l, 1l, 0l});
  // NOTE(review): the long long lists below stop at 1ll where the int/long
  // lists include 0 -- presumably an oversight, but harmless; confirm before
  // changing since it alters the tested sample set.
  TEST_MATH_FUNCTION(exp)({-9ll, -8ll, -7ll, -6ll, -5ll, 4ll, 3ll, 2ll, 1ll});
  TEST_MATH_FUNCTION(exp)({0u, 1u, 2u, 3u, 4u, 5u});
  TEST_MATH_FUNCTION(exp)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
  TEST_MATH_FUNCTION(exp)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
  TEST_MATH_FUNCTION(exp)({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
  TEST_MATH_FUNCTION(exp)({-98., -7.6, -.54, 3.2, 1., -0.});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(exp)({-98.l, -7.6l, -.54l, 3.2l, 1.l, -0.l});
#endif

  TEST_MATH_FUNCTION(exp2)({-9, -8, -7, -6, -5, 4, 3, 2, 1, 0});
  TEST_MATH_FUNCTION(exp2)({-9l, -8l, -7l, -6l, -5l, 4l, 3l, 2l, 1l, 0l});
  TEST_MATH_FUNCTION(exp2)({-9ll, -8ll, -7ll, -6ll, -5ll, 4ll, 3ll, 2ll, 1ll});
  TEST_MATH_FUNCTION(exp2)({0u, 1u, 2u, 3u, 4u, 5u});
  TEST_MATH_FUNCTION(exp2)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
  TEST_MATH_FUNCTION(exp2)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
  TEST_MATH_FUNCTION(exp2)({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
  TEST_MATH_FUNCTION(exp2)({-98., -7.6, -.54, 3.2, 1., -0.});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(exp2)({-98.l, -7.6l, -.54l, 3.2l, 1.l, -0.l});
#endif

  TEST_MATH_FUNCTION(expm1)({-9, -8, -7, -6, -5, 4, 3, 2, 1, 0});
  TEST_MATH_FUNCTION(expm1)({-9l, -8l, -7l, -6l, -5l, 4l, 3l, 2l, 1l, 0l});
  TEST_MATH_FUNCTION(expm1)({-9ll, -8ll, -7ll, -6ll, -5ll, 4ll, 3ll, 2ll, 1ll});
  TEST_MATH_FUNCTION(expm1)({0u, 1u, 2u, 3u, 4u, 5u});
  TEST_MATH_FUNCTION(expm1)({0ul, 1ul, 2ul, 3ul, 4ul, 5ul});
  TEST_MATH_FUNCTION(expm1)({0ull, 1ull, 2ull, 3ull, 4ull, 5ull});
  TEST_MATH_FUNCTION(expm1)({-98.f, -7.6f, -.54f, 3.2f, 1.f, -0.f});
  TEST_MATH_FUNCTION(expm1)({-98., -7.6, -.54, 3.2, 1., -0.});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(expm1)({-98.l, -7.6l, -.54l, 3.2l, 1.l, -0.l});
#endif

  TEST_MATH_FUNCTION(log)({1, 23, 456, 7890});
  TEST_MATH_FUNCTION(log)({1l, 23l, 456l, 7890l});
  TEST_MATH_FUNCTION(log)({1ll, 23ll, 456ll, 7890ll});
  TEST_MATH_FUNCTION(log)({1u, 23u, 456u, 7890u});
  TEST_MATH_FUNCTION(log)({1ul, 23ul, 456ul, 7890ul});
  TEST_MATH_FUNCTION(log)({1ull, 23ull, 456ull, 7890ull});
  TEST_MATH_FUNCTION(log)({1234.f, 567.f, 89.f, .1f});
  TEST_MATH_FUNCTION(log)({1234., 567., 89., .02});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(log)({1234.l, 567.l, 89.l, .003l});
#endif

  TEST_MATH_FUNCTION(log10)({1, 23, 456, 7890});
  TEST_MATH_FUNCTION(log10)({1l, 23l, 456l, 7890l});
  TEST_MATH_FUNCTION(log10)({1ll, 23ll, 456ll, 7890ll});
  TEST_MATH_FUNCTION(log10)({1u, 23u, 456u, 7890u});
  TEST_MATH_FUNCTION(log10)({1ul, 23ul, 456ul, 7890ul});
  TEST_MATH_FUNCTION(log10)({1ull, 23ull, 456ull, 7890ull});
  TEST_MATH_FUNCTION(log10)({1234.f, 567.f, 89.f, .1f});
  TEST_MATH_FUNCTION(log10)({1234., 567., 89., .02});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(log10)({1234.l, 567.l, 89.l, .003l});
#endif

  TEST_MATH_FUNCTION(log2)({1, 23, 456, 7890});
  TEST_MATH_FUNCTION(log2)({1l, 23l, 456l, 7890l});
  TEST_MATH_FUNCTION(log2)({1ll, 23ll, 456ll, 7890ll});
  TEST_MATH_FUNCTION(log2)({1u, 23u, 456u, 7890u});
  TEST_MATH_FUNCTION(log2)({1ul, 23ul, 456ul, 7890ul});
  TEST_MATH_FUNCTION(log2)({1ull, 23ull, 456ull, 7890ull});
  TEST_MATH_FUNCTION(log2)({1234.f, 567.f, 89.f, .1f});
  TEST_MATH_FUNCTION(log2)({1234., 567., 89., .02});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(log2)({1234.l, 567.l, 89.l, .003l});
#endif

  // log1p domain extends down to -1 (exclusive), hence the zero and negative
  // fractional samples
  TEST_MATH_FUNCTION(log1p)({1, 23, 456, 7890, 0});
  TEST_MATH_FUNCTION(log1p)({1l, 23l, 456l, 7890l, 0l});
  TEST_MATH_FUNCTION(log1p)({1ll, 23ll, 456ll, 7890ll, 0ll});
  TEST_MATH_FUNCTION(log1p)({1u, 23u, 456u, 7890u, 0u});
  TEST_MATH_FUNCTION(log1p)({1ul, 23ul, 456ul, 7890ul, 0ul});
  TEST_MATH_FUNCTION(log1p)({1ull, 23ull, 456ull, 7890ull, 0ull});
  TEST_MATH_FUNCTION(log1p)({1234.f, 567.f, 89.f, -.9f});
  TEST_MATH_FUNCTION(log1p)({1234., 567., 89., -.08});
#ifdef
MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(log1p)({1234.l, 567.l, 89.l, -.007l});
#endif
}

TEST(TEST_CATEGORY, mathematical_functions_hyperbolic_functions) {
  TEST_MATH_FUNCTION(sinh)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(sinh)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(sinh)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(sinh)({2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(sinh)({2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(sinh)({2ull, 3ull, 4ull, 5ull, 6ull});
  TEST_MATH_FUNCTION(sinh)({.1f, -2.f, 3.f});
  TEST_MATH_FUNCTION(sinh)({-4., .5, -.6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(sinh)({.7l, .8l, .9l});
#endif

  TEST_MATH_FUNCTION(cosh)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(cosh)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(cosh)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(cosh)({2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(cosh)({2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(cosh)({2ull, 3ull, 4ull, 5ull, 6ull});
  TEST_MATH_FUNCTION(cosh)({.1f, -2.f, 3.f});
  TEST_MATH_FUNCTION(cosh)({-4., .5, -.6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(cosh)({.7l, .8l, .9l});
#endif

  TEST_MATH_FUNCTION(tanh)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(tanh)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(tanh)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(tanh)({2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(tanh)({2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(tanh)({2ull, 3ull, 4ull, 5ull, 6ull});
  TEST_MATH_FUNCTION(tanh)({.1f, -2.f, 3.f});
  TEST_MATH_FUNCTION(tanh)({-4., .5, -.6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(tanh)({.7l, .8l, .9l});
#endif

  TEST_MATH_FUNCTION(asinh)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(asinh)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(asinh)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(asinh)({2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(asinh)({2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(asinh)({2ull, 3ull, 4ull, 5ull, 6ull});
  TEST_MATH_FUNCTION(asinh)({.1f, -2.f, 3.f});
  TEST_MATH_FUNCTION(asinh)({-4., .5, -.6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(asinh)({.7l, .8l, .9l});
#endif

  // acosh domain is [1, inf)
  TEST_MATH_FUNCTION(acosh)({1, 2, 3, 4, 5, 6});
  TEST_MATH_FUNCTION(acosh)({1l, 2l, 3l, 4l, 5l, 6l});
  TEST_MATH_FUNCTION(acosh)({1ll, 2ll, 3ll, 4ll, 5ll, 6ll});
  TEST_MATH_FUNCTION(acosh)({1u, 2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(acosh)({1ul, 2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(acosh)({1ull, 2ull, 3ull, 4ull, 5ull, 6ull});
  TEST_MATH_FUNCTION(acosh)({1.2f, 34.f, 56.f, 789.f});
  TEST_MATH_FUNCTION(acosh)({1.2, 34., 56., 789.});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(acosh)({1.2l, 34.l, 56.l, 789.l});
#endif

  // atanh domain is (-1, 1), so 0 is the only valid integral sample
  TEST_MATH_FUNCTION(atanh)({0});
  TEST_MATH_FUNCTION(atanh)({0l});
  TEST_MATH_FUNCTION(atanh)({0ll});
  TEST_MATH_FUNCTION(atanh)({0u});
  TEST_MATH_FUNCTION(atanh)({0ul});
  TEST_MATH_FUNCTION(atanh)({0ull});
  TEST_MATH_FUNCTION(atanh)({-.97f, .86f, -.53f, .42f, -.1f, 0.f});
  TEST_MATH_FUNCTION(atanh)({-.97, .86, -.53, .42, -.1, 0.});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(atanh)({-.97l, .86l, -.53l, .42l, -.1l, 0.l});
#endif
}

TEST(TEST_CATEGORY, mathematical_functions_error_and_gamma_functions) {
  TEST_MATH_FUNCTION(erf)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(erf)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(erf)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(erf)({2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(erf)({2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(erf)({2ull, 3ull, 4ull, 5ull, 6ull});
  TEST_MATH_FUNCTION(erf)({.1f, -2.f, 3.f});
  TEST_MATH_FUNCTION(erf)({-4., .5, -.6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(erf)({.7l, .8l, .9l});
#endif

  TEST_MATH_FUNCTION(erfc)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(erfc)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(erfc)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(erfc)({2u, 3u, 4u, 5u, 6u});
  TEST_MATH_FUNCTION(erfc)({2ul, 3ul, 4ul, 5ul, 6ul});
  TEST_MATH_FUNCTION(erfc)({2ull, 3ull, 4ull, 5ull, 6ull});
  TEST_MATH_FUNCTION(erfc)({.1f, -2.f, 3.f});
  TEST_MATH_FUNCTION(erfc)({-4., .5, -.6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(erfc)({.7l, .8l, .9l});
#endif

  // tgamma uses the large 200 ULP tolerance defined above
  TEST_MATH_FUNCTION(tgamma)({1, 2, 3, 4, 56, 78});
  TEST_MATH_FUNCTION(tgamma)({1l, 2l, 3l, 4l, 56l, 78l});
  TEST_MATH_FUNCTION(tgamma)({1ll, 2ll, 3ll, 4ll, 56ll, 78ll});
  TEST_MATH_FUNCTION(tgamma)({1u, 2u, 3u, 4u, 56u, 78u});
  TEST_MATH_FUNCTION(tgamma)({1ul, 2ul, 3ul, 4ul, 56ul, 78ul});
  TEST_MATH_FUNCTION(tgamma)({1ull, 2ull, 3ull, 4ull, 56ull, 78ull});
  TEST_MATH_FUNCTION(tgamma)({.1f, -2.2f, 3.f});
  TEST_MATH_FUNCTION(tgamma)({-4.4, .5, -.6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(tgamma)({.7l, .8l, .9l});
#endif

  TEST_MATH_FUNCTION(lgamma)({1, 2, 3, 4, 56, 78});
  TEST_MATH_FUNCTION(lgamma)({1l, 2l, 3l, 4l, 56l, 78l});
  TEST_MATH_FUNCTION(lgamma)({1ll, 2ll, 3ll, 4ll, 56ll, 78ll});
  TEST_MATH_FUNCTION(lgamma)({1u, 2u, 3u, 4u, 56u, 78u});
  TEST_MATH_FUNCTION(lgamma)({1ul, 2ul, 3ul, 4ul, 56ul, 78ul});
  TEST_MATH_FUNCTION(lgamma)({1ull, 2ull, 3ull, 4ull, 56ull, 78ull});
  TEST_MATH_FUNCTION(lgamma)({.1f, -2.2f, 3.f});
  TEST_MATH_FUNCTION(lgamma)({-4.4, .5, -.6});
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
  TEST_MATH_FUNCTION(lgamma)({.7l, .8l, .9l});
#endif
}

// NOTE(review): "interger" is a typo, but it is part of the registered test
// name; renaming would change the gtest test identifier.
TEST(TEST_CATEGORY,
     mathematical_functions_nearest_interger_floating_point_operations) {
  TEST_MATH_FUNCTION(ceil)({-3, -2, -1, 0, 1});
  TEST_MATH_FUNCTION(ceil)({-3l, -2l, -1l, 0l, 1l});
  TEST_MATH_FUNCTION(ceil)({-3ll, -2ll, -1ll, 0ll, 1ll});
  TEST_MATH_FUNCTION(ceil)({2u, 3u, 4u, 5u, 6u});
TEST_MATH_FUNCTION(ceil)({2ul, 3ul, 4ul, 5ul, 6ul}); + TEST_MATH_FUNCTION(ceil)({2ull, 3ull, 4ull, 5ull, 6ull}); + TEST_MATH_FUNCTION(ceil)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f}); + TEST_MATH_FUNCTION(ceil)({-6.6, 7.7, -8.8, 9.9}); +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + TEST_MATH_FUNCTION(ceil)({12.3l, 4.56l, 789.l}); +#endif + + TEST_MATH_FUNCTION(floor)({-3, -2, -1, 0, 1}); + TEST_MATH_FUNCTION(floor)({-3l, -2l, -1l, 0l, 1l}); + TEST_MATH_FUNCTION(floor)({-3ll, -2ll, -1ll, 0ll, 1ll}); + TEST_MATH_FUNCTION(floor)({2u, 3u, 4u, 5u, 6u}); + TEST_MATH_FUNCTION(floor)({2ul, 3ul, 4ul, 5ul, 6ul}); + TEST_MATH_FUNCTION(floor)({2ull, 3ull, 4ull, 5ull, 6ull}); + TEST_MATH_FUNCTION(floor)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f}); + TEST_MATH_FUNCTION(floor)({-6.6, 7.7, -8.8, 9.9}); +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + TEST_MATH_FUNCTION(floor)({12.3l, 4.56l, 789.l}); +#endif + + TEST_MATH_FUNCTION(trunc)({-3, -2, -1, 0, 1}); + TEST_MATH_FUNCTION(trunc)({-3l, -2l, -1l, 0l, 1l}); + TEST_MATH_FUNCTION(trunc)({-3ll, -2ll, -1ll, 0ll, 1ll}); + TEST_MATH_FUNCTION(trunc)({2u, 3u, 4u, 5u, 6u}); + TEST_MATH_FUNCTION(trunc)({2ul, 3ul, 4ul, 5ul, 6ul}); + TEST_MATH_FUNCTION(trunc)({2ull, 3ull, 4ull, 5ull, 6ull}); + TEST_MATH_FUNCTION(trunc)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f}); + TEST_MATH_FUNCTION(trunc)({-6.6, 7.7, -8.8, 9.9}); +#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + TEST_MATH_FUNCTION(trunc)({12.3l, 4.56l, 789.l}); +#endif + +#ifndef KOKKOS_ENABLE_SYCL + TEST_MATH_FUNCTION(nearbyint)({-3, -2, -1, 0, 1}); + TEST_MATH_FUNCTION(nearbyint)({-3l, -2l, -1l, 0l, 1l}); + TEST_MATH_FUNCTION(nearbyint)({-3ll, -2ll, -1ll, 0ll, 1ll}); + TEST_MATH_FUNCTION(nearbyint)({2u, 3u, 4u, 5u, 6u}); + TEST_MATH_FUNCTION(nearbyint)({2ul, 3ul, 4ul, 5ul, 6ul}); + TEST_MATH_FUNCTION(nearbyint)({2ull, 3ull, 4ull, 5ull, 6ull}); + TEST_MATH_FUNCTION(nearbyint)({-1.1f, 2.2f, -3.3f, 4.4f, -5.5f}); + TEST_MATH_FUNCTION(nearbyint)({-6.6, 7.7, -8.8, 9.9}); +#ifdef 
MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS + TEST_MATH_FUNCTION(nearbyint)({12.3l, 4.56l, 789.l}); +#endif +#endif +} diff --git a/packages/kokkos/core/unit_test/TestMemoryPool.hpp b/packages/kokkos/core/unit_test/TestMemoryPool.hpp new file mode 100644 index 0000000000000000000000000000000000000000..63895ad47dc435c98201a2b46d8b439d2a50ad51 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMemoryPool.hpp @@ -0,0 +1,576 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_UNITTEST_MEMPOOL_HPP +#define KOKKOS_UNITTEST_MEMPOOL_HPP + +#include <cstdio> +#include <iostream> +#include <cmath> +#include <algorithm> + +#include <impl/Kokkos_Timer.hpp> + +namespace TestMemoryPool { + +template <typename MemSpace = Kokkos::HostSpace> +void test_host_memory_pool_defaults() { + using Space = typename MemSpace::execution_space; + using MemPool = typename Kokkos::MemoryPool<Space>; + + { + const size_t MemoryCapacity = 32000; + const size_t MinBlockSize = 64; + const size_t MaxBlockSize = 1024; + const size_t SuperBlockSize = 4096; + + MemPool pool(MemSpace(), MemoryCapacity, MinBlockSize, MaxBlockSize, + SuperBlockSize); + + typename MemPool::usage_statistics stats; + + pool.get_usage_statistics(stats); + + ASSERT_LE(MemoryCapacity, stats.capacity_bytes); + ASSERT_LE(MinBlockSize, stats.min_block_bytes); + ASSERT_LE(MaxBlockSize, stats.max_block_bytes); + ASSERT_LE(SuperBlockSize, stats.superblock_bytes); + } + + { + const size_t MemoryCapacity = 10000; + + MemPool pool(MemSpace(), MemoryCapacity); + + typename MemPool::usage_statistics stats; + + pool.get_usage_statistics(stats); + + ASSERT_LE(MemoryCapacity, stats.capacity_bytes); + ASSERT_LE(64u /* default */, stats.min_block_bytes); + ASSERT_LE(stats.min_block_bytes, stats.max_block_bytes); 
+ ASSERT_LE(stats.max_block_bytes, stats.superblock_bytes); + ASSERT_LE(stats.superblock_bytes, stats.capacity_bytes); + } + + { + const size_t MemoryCapacity = 10000; + const size_t MinBlockSize = 32; // power of two is exact + + MemPool pool(MemSpace(), MemoryCapacity, MinBlockSize); + + typename MemPool::usage_statistics stats; + + pool.get_usage_statistics(stats); + + ASSERT_LE(MemoryCapacity, stats.capacity_bytes); + ASSERT_EQ(MinBlockSize, stats.min_block_bytes); + ASSERT_LE(stats.min_block_bytes, stats.max_block_bytes); + ASSERT_LE(stats.max_block_bytes, stats.superblock_bytes); + ASSERT_LE(stats.superblock_bytes, stats.capacity_bytes); + } + + { + const size_t MemoryCapacity = 32000; + const size_t MinBlockSize = 32; // power of two is exact + const size_t MaxBlockSize = 1024; // power of two is exact + + MemPool pool(MemSpace(), MemoryCapacity, MinBlockSize, MaxBlockSize); + + typename MemPool::usage_statistics stats; + + pool.get_usage_statistics(stats); + + ASSERT_LE(MemoryCapacity, stats.capacity_bytes); + ASSERT_EQ(MinBlockSize, stats.min_block_bytes); + ASSERT_EQ(MaxBlockSize, stats.max_block_bytes); + ASSERT_LE(stats.max_block_bytes, stats.superblock_bytes); + ASSERT_LE(stats.superblock_bytes, stats.capacity_bytes); + } +} + +template <typename MemSpace = Kokkos::HostSpace> +void test_host_memory_pool_stats() { + using Space = typename MemSpace::execution_space; + using MemPool = typename Kokkos::MemoryPool<Space>; + + const size_t MemoryCapacity = 32000; + const size_t MinBlockSize = 64; + const size_t MaxBlockSize = 1024; + const size_t SuperBlockSize = 4096; + + MemPool pool(MemSpace(), MemoryCapacity, MinBlockSize, MaxBlockSize, + SuperBlockSize); + + { + typename MemPool::usage_statistics stats; + + pool.get_usage_statistics(stats); + + ASSERT_LE(MemoryCapacity, stats.capacity_bytes); + ASSERT_LE(MinBlockSize, stats.min_block_bytes); + ASSERT_LE(MaxBlockSize, stats.max_block_bytes); + ASSERT_LE(SuperBlockSize, stats.superblock_bytes); + } + + 
void* p0064 = pool.allocate(64); + void* p0128 = pool.allocate(128); + void* p0256 = pool.allocate(256); + void* p1024 = pool.allocate(1024); + + // Aborts because exceeds max block size: + // void * p2048 = pool.allocate(2048); + + ASSERT_NE(p0064, nullptr); + ASSERT_NE(p0128, nullptr); + ASSERT_NE(p0256, nullptr); + ASSERT_NE(p1024, nullptr); + + pool.deallocate(p0064, 64); + pool.deallocate(p0128, 128); + pool.deallocate(p0256, 256); + pool.deallocate(p1024, 1024); +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class DeviceType> +struct TestMemoryPool_Functor { + using ptrs_type = Kokkos::View<uintptr_t*, DeviceType>; + using pool_type = Kokkos::MemoryPool<DeviceType>; + + pool_type pool; + ptrs_type ptrs; + + TestMemoryPool_Functor(const pool_type& arg_pool, size_t n) + : pool(arg_pool), ptrs("ptrs", n) {} + + // Specify reduction argument value_type to avoid + // confusion with tag-dispatch. 
+ + using value_type = long; + + struct TagAlloc {}; + + KOKKOS_INLINE_FUNCTION + void operator()(TagAlloc, int i, long& update) const noexcept { + unsigned alloc_size = 32 * (1 + (i % 5)); + ptrs(i) = (uintptr_t)pool.allocate(alloc_size); + if (ptrs(i)) { + ++update; + } + } + + struct TagDealloc {}; + + KOKKOS_INLINE_FUNCTION + void operator()(TagDealloc, int i, long& update) const noexcept { + if (ptrs(i) && (0 == i % 3)) { + unsigned alloc_size = 32 * (1 + (i % 5)); + pool.deallocate((void*)ptrs(i), alloc_size); + ptrs(i) = 0; + ++update; + } + } + + struct TagRealloc {}; + + KOKKOS_INLINE_FUNCTION + void operator()(TagRealloc, int i, long& update) const noexcept { + if (0 == ptrs(i)) { + unsigned alloc_size = 32 * (1 + (i % 5)); + ptrs(i) = (uintptr_t)pool.allocate(alloc_size); + if (ptrs(i)) { + ++update; + } + } + } + + struct TagMixItUp {}; + + KOKKOS_INLINE_FUNCTION + void operator()(TagMixItUp, int i, long& update) const noexcept { + if (ptrs(i) && (0 == i % 3)) { + unsigned alloc_size = 32 * (1 + (i % 5)); + + pool.deallocate((void*)ptrs(i), alloc_size); + + ptrs(i) = (uintptr_t)pool.allocate(alloc_size); + + if (ptrs(i)) { + ++update; + } + } + } +}; + +template <class PoolType> +void print_memory_pool_stats(typename PoolType::usage_statistics const& stats) { + std::cout << "MemoryPool {" << std::endl + << " bytes capacity = " << stats.capacity_bytes << std::endl + << " bytes used = " << stats.consumed_bytes << std::endl + << " bytes reserved = " << stats.reserved_bytes << std::endl + << " bytes free = " + << (stats.capacity_bytes - + (stats.consumed_bytes + stats.reserved_bytes)) + << std::endl + << " block used = " << stats.consumed_blocks << std::endl + << " block reserved = " << stats.reserved_blocks << std::endl + << " super used = " << stats.consumed_superblocks << std::endl + << " super reserved = " + << (stats.capacity_superblocks - stats.consumed_superblocks) + << std::endl + << "}" << std::endl; +} + +template <class DeviceType> +void 
test_memory_pool_v2(const bool print_statistics, + const bool print_superblocks) { + using memory_space = typename DeviceType::memory_space; + using execution_space = typename DeviceType::execution_space; + using pool_type = Kokkos::MemoryPool<DeviceType>; + using functor_type = TestMemoryPool_Functor<DeviceType>; + + using TagAlloc = typename functor_type::TagAlloc; + using TagDealloc = typename functor_type::TagDealloc; + using TagRealloc = typename functor_type::TagRealloc; + using TagMixItUp = typename functor_type::TagMixItUp; + + const size_t total_alloc_size = 10000000; + const unsigned min_block_size = 64; + const unsigned max_block_size = 256; + const long nfill = 70000; + + for (uint32_t k = 0, min_superblock_size = 10000; k < 3; + ++k, min_superblock_size *= 10) { + typename pool_type::usage_statistics stats; + + pool_type pool(memory_space(), total_alloc_size, min_block_size, + max_block_size, min_superblock_size); + + functor_type functor(pool, nfill); + + long result = 0; + long ndel = 0; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space, TagAlloc>(0, nfill), functor, + result); + + pool.get_usage_statistics(stats); + + const int fill_error = + (nfill != result) || (nfill != long(stats.consumed_blocks)); + + if (fill_error || print_statistics) + print_memory_pool_stats<pool_type>(stats); + if (fill_error || print_superblocks) pool.print_state(std::cout); + + ASSERT_EQ(nfill, result); + ASSERT_EQ(nfill, long(stats.consumed_blocks)); + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space, TagDealloc>(0, nfill), functor, + ndel); + + pool.get_usage_statistics(stats); + + const int del_error = (nfill - ndel) != long(stats.consumed_blocks); + + if (del_error || print_statistics) + print_memory_pool_stats<pool_type>(stats); + if (del_error || print_superblocks) pool.print_state(std::cout); + + ASSERT_EQ((nfill - ndel), long(stats.consumed_blocks)); + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space, 
TagRealloc>(0, nfill), functor, + result); + + pool.get_usage_statistics(stats); + + const int refill_error = + (ndel != result) || (nfill != long(stats.consumed_blocks)); + + if (refill_error || print_statistics) + print_memory_pool_stats<pool_type>(stats); + if (refill_error || print_superblocks) pool.print_state(std::cout); + + ASSERT_EQ(ndel, result); + ASSERT_EQ(nfill, long(stats.consumed_blocks)); + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space, TagMixItUp>(0, nfill), functor, + result); + + pool.get_usage_statistics(stats); + + const int mix_error = + (ndel != result) || (nfill != long(stats.consumed_blocks)); + + if (mix_error || print_statistics) + print_memory_pool_stats<pool_type>(stats); + if (mix_error || print_superblocks) pool.print_state(std::cout); + + ASSERT_EQ(ndel, result); + ASSERT_EQ(nfill, long(stats.consumed_blocks)); + } +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class DeviceType> +struct TestMemoryPoolCorners { + using ptrs_type = Kokkos::View<uintptr_t*, DeviceType>; + using pool_type = Kokkos::MemoryPool<DeviceType>; + + pool_type pool; + ptrs_type ptrs; + uint32_t size; + uint32_t stride; + + TestMemoryPoolCorners(const pool_type& arg_pool, const ptrs_type& arg_ptrs, + const uint32_t arg_base, const uint32_t arg_stride) + : pool(arg_pool), ptrs(arg_ptrs), size(arg_base), stride(arg_stride) {} + + // Specify reduction argument value_type to + // avoid confusion with tag-dispatch. 
+ + using value_type = long; + + KOKKOS_INLINE_FUNCTION + void operator()(int i, long& err) const noexcept { + unsigned alloc_size = size << (i % stride); + if (0 == ptrs(i)) { + ptrs(i) = (uintptr_t)pool.allocate(alloc_size); + if (ptrs(i) && !alloc_size) { + ++err; + } + } + } + + struct TagDealloc {}; + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const noexcept { + unsigned alloc_size = size << (i % stride); + if (ptrs(i)) { + pool.deallocate((void*)ptrs(i), alloc_size); + } + ptrs(i) = 0; + } +}; + +template <class DeviceType> +void test_memory_pool_corners(const bool print_statistics, + const bool print_superblocks) { + using memory_space = typename DeviceType::memory_space; + using execution_space = typename DeviceType::execution_space; + using pool_type = Kokkos::MemoryPool<DeviceType>; + using functor_type = TestMemoryPoolCorners<DeviceType>; + using ptrs_type = typename functor_type::ptrs_type; + + { + // superblock size 1 << 14 + const size_t min_superblock_size = 1u << 14; + + // four superblocks + const size_t total_alloc_size = min_superblock_size * 4; + + // block sizes { 64 , 128 , 256 , 512 } + // block counts { 256 , 128 , 64 , 32 } + const unsigned min_block_size = 64; + const unsigned max_block_size = 512; + const unsigned num_blocks = 480; + + pool_type pool(memory_space(), total_alloc_size, min_block_size, + max_block_size, min_superblock_size); + + // Allocate one block from each superblock to lock that + // superblock into the block size. 
+ + ptrs_type ptrs("ptrs", num_blocks); + + long err = 0; + + Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(0, 4), + functor_type(pool, ptrs, 64, 4), err); + + if (print_statistics || err) { + typename pool_type::usage_statistics stats; + + pool.get_usage_statistics(stats); + + print_memory_pool_stats<pool_type>(stats); + } + + if (print_superblocks || err) { + pool.print_state(std::cout); + } + + // Now fill remaining allocations with small size + + Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(0, num_blocks), + functor_type(pool, ptrs, 64, 1), err); + + if (print_statistics || err) { + typename pool_type::usage_statistics stats; + + pool.get_usage_statistics(stats); + + print_memory_pool_stats<pool_type>(stats); + } + + if (print_superblocks || err) { + pool.print_state(std::cout); + } + } +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template <class DeviceType, class Enable = void> +struct TestMemoryPoolHuge { + enum : size_t { num_superblock = 0 }; + + using value_type = long; + + KOKKOS_INLINE_FUNCTION + void operator()(int /*i*/, long& /*err*/) const noexcept {} + + KOKKOS_INLINE_FUNCTION + void operator()(int /*i*/) const noexcept {} +}; + +template <class DeviceType> +struct TestMemoryPoolHuge< + DeviceType, + typename std::enable_if<std::is_same< + Kokkos::HostSpace, typename DeviceType::memory_space>::value>::type> { + using ptrs_type = Kokkos::View<uintptr_t*, DeviceType>; + using pool_type = Kokkos::MemoryPool<DeviceType>; + using memory_space = typename DeviceType::memory_space; + + pool_type pool; + ptrs_type ptrs; + + enum : size_t { + min_block_size = 512, + max_block_size = 1lu << 31, + min_superblock_size = max_block_size, + num_superblock = 4, + total_alloc_size = num_superblock * max_block_size + }; + + TestMemoryPoolHuge() + : pool(memory_space(), total_alloc_size, min_block_size, 
max_block_size, + min_superblock_size), + ptrs("ptrs", num_superblock) {} + + // Specify reduction argument value_type to + // avoid confusion with tag-dispatch. + + using value_type = long; + + void operator()(int i, long& err) const noexcept { + if (i < int(num_superblock)) { + ptrs(i) = (uintptr_t)pool.allocate(max_block_size); +#if 0 + printf("TestMemoryPoolHuge size(0x%lx) ptr(0x%lx)\n" + , max_block_size + , ptrs(i) ); +#endif + if (!ptrs(i)) { + Kokkos::abort("TestMemoryPoolHuge"); + ++err; + } + } + } + + void operator()(int i) const noexcept { + if (i < int(num_superblock)) { + pool.deallocate((void*)ptrs(i), max_block_size); + ptrs(i) = 0; + } + } +}; + +template <class DeviceType> +void test_memory_pool_huge() { + using execution_space = typename DeviceType::execution_space; + using functor_type = TestMemoryPoolHuge<DeviceType>; + using policy_type = Kokkos::RangePolicy<execution_space>; + + functor_type f; + policy_type policy(0, functor_type::num_superblock); + + long err = 0; + + Kokkos::parallel_reduce(policy, f, err); + Kokkos::parallel_for(policy, f); +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +} // namespace TestMemoryPool + +namespace Test { + +TEST(TEST_CATEGORY, memory_pool) { + TestMemoryPool::test_host_memory_pool_defaults<>(); + TestMemoryPool::test_host_memory_pool_stats<>(); + TestMemoryPool::test_memory_pool_v2<TEST_EXECSPACE>(false, false); + TestMemoryPool::test_memory_pool_corners<TEST_EXECSPACE>(false, false); +#ifdef KOKKOS_ENABLE_LARGE_MEM_TESTS + TestMemoryPool::test_memory_pool_huge<TEST_EXECSPACE>(); +#endif +} + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6c8a47a5861dd361364a94551abcfd50d0e85153 --- /dev/null +++ 
b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp @@ -0,0 +1,338 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TESTNONTRIVIALSCALARTYPES_HPP_ +#define TESTNONTRIVIALSCALARTYPES_HPP_ + +#include <Kokkos_Core.hpp> + +#include <impl/Kokkos_Timer.hpp> +#include <iostream> +#include <cstdlib> +#include <cstdint> +#include <cinttypes> + +namespace Test { + +struct my_complex { + double re, im; + int dummy; + + KOKKOS_INLINE_FUNCTION + my_complex() { + re = 0.0; + im = 0.0; + dummy = 0; + } + + KOKKOS_INLINE_FUNCTION + my_complex(const my_complex &src) { + re = src.re; + im = src.im; + dummy = src.dummy; + } + + KOKKOS_INLINE_FUNCTION + my_complex &operator=(const my_complex &src) { + re = src.re; + im = src.im; + dummy = src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + my_complex &operator=(const volatile my_complex &src) { + re = src.re; + im = src.im; + dummy = src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + volatile my_complex &operator=(const my_complex &src) volatile { + re = src.re; + im = src.im; + dummy = src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + volatile my_complex &operator=(const volatile my_complex &src) volatile { + re = src.re; + im = src.im; + dummy = src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + my_complex(const volatile my_complex &src) { + re = src.re; + im = src.im; + dummy = src.dummy; + } + + KOKKOS_INLINE_FUNCTION + my_complex(const double &val) { + re = val; + im = 0.0; + dummy = 0; + } + + KOKKOS_INLINE_FUNCTION + my_complex &operator+=(const my_complex &src) { + re += src.re; + im += src.im; + dummy += src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator+=(const volatile my_complex &src) volatile { + re += src.re; + im += src.im; + dummy += src.dummy; + } + + KOKKOS_INLINE_FUNCTION + my_complex operator+(const my_complex &src) { + my_complex tmp = *this; + tmp.re += src.re; + tmp.im += src.im; + tmp.dummy += src.dummy; + return tmp; + } + + 
KOKKOS_INLINE_FUNCTION + my_complex operator+(const volatile my_complex &src) volatile { + my_complex tmp = *this; + tmp.re += src.re; + tmp.im += src.im; + tmp.dummy += src.dummy; + return tmp; + } + + KOKKOS_INLINE_FUNCTION + my_complex &operator*=(const my_complex &src) { + double re_tmp = re * src.re - im * src.im; + double im_tmp = re * src.im + im * src.re; + re = re_tmp; + im = im_tmp; + dummy *= src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator*=(const volatile my_complex &src) volatile { + double re_tmp = re * src.re - im * src.im; + double im_tmp = re * src.im + im * src.re; + re = re_tmp; + im = im_tmp; + dummy *= src.dummy; + } + + KOKKOS_INLINE_FUNCTION + bool operator==(const my_complex &src) const { + return (re == src.re) && (im == src.im) && (dummy == src.dummy); + } + + KOKKOS_INLINE_FUNCTION + bool operator!=(const my_complex &src) const { + return (re != src.re) || (im != src.im) || (dummy != src.dummy); + } + + KOKKOS_INLINE_FUNCTION + bool operator!=(const double &val) const { + return (re != val) || (im != 0) || (dummy != 0); + } + + KOKKOS_INLINE_FUNCTION + my_complex &operator=(const int &val) { + re = val; + im = 0.0; + dummy = 0; + return *this; + } + + KOKKOS_INLINE_FUNCTION + my_complex &operator=(const double &val) { + re = val; + im = 0.0; + dummy = 0; + return *this; + } + + KOKKOS_INLINE_FUNCTION + operator double() { return re; } +}; + +template <class scalar_t, int N> +struct array_reduce { + scalar_t data[N]; + KOKKOS_INLINE_FUNCTION + array_reduce() { + for (int i = 0; i < N; i++) data[i] = scalar_t(); + } + KOKKOS_INLINE_FUNCTION + array_reduce(const array_reduce &rhs) { + for (int i = 0; i < N; i++) data[i] = rhs.data[i]; + } + KOKKOS_INLINE_FUNCTION + array_reduce(const scalar_t value) { + for (int i = 0; i < N; i++) data[i] = scalar_t(value); + } + + KOKKOS_INLINE_FUNCTION + array_reduce &operator=(const array_reduce &src) { + for (int i = 0; i < N; i++) data[i] = src.data[i]; + return *this; + } + + 
KOKKOS_INLINE_FUNCTION + array_reduce &operator=(const volatile array_reduce &src) { + for (int i = 0; i < N; i++) data[i] = src.data[i]; + return *this; + } + + KOKKOS_INLINE_FUNCTION // add operator + array_reduce & + operator=(const scalar_t val) { + for (int i = 0; i < N; i++) data[i] = val; + return *this; + } + KOKKOS_INLINE_FUNCTION // add operator + array_reduce & + operator=(const int val) { + for (int i = 0; i < N; i++) data[i] = val; + return *this; + } + + KOKKOS_INLINE_FUNCTION // add operator + array_reduce & + operator+=(const array_reduce &src) { + for (int i = 0; i < N; i++) data[i] += src.data[i]; + return *this; + } + KOKKOS_INLINE_FUNCTION // volatile add operator + void + operator+=(const volatile array_reduce &src) volatile { + for (int i = 0; i < N; i++) data[i] += src.data[i]; + } + KOKKOS_INLINE_FUNCTION // add operator + array_reduce + operator+(const array_reduce &src) const { + array_reduce result(*this); + for (int i = 0; i < N; i++) result.data[i] += src.data[i]; + return result; + } + KOKKOS_INLINE_FUNCTION // add operator + array_reduce + operator-(const array_reduce &src) const { + array_reduce result(*this); + for (int i = 0; i < N; i++) result.data[i] -= src.data[i]; + return result; + } + KOKKOS_INLINE_FUNCTION // add operator + array_reduce & + operator*=(const array_reduce &src) { + for (int i = 0; i < N; i++) data[i] *= src.data[i]; + return *this; + } + KOKKOS_INLINE_FUNCTION // volatile add operator + void + operator*=(const volatile array_reduce &src) volatile { + for (int i = 0; i < N; i++) data[i] *= src.data[i]; + } + KOKKOS_INLINE_FUNCTION // add operator + array_reduce + operator*(const array_reduce &src) const { + array_reduce result(*this); + for (int i = 0; i < N; i++) result.data[i] *= src.data[i]; + return result; + } + KOKKOS_INLINE_FUNCTION + bool operator==(const array_reduce &src) const { + bool equal = true; + for (int i = 0; i < N; i++) equal = equal && (data[i] == src.data[i]); + return equal; + } + 
KOKKOS_INLINE_FUNCTION + bool operator!=(const array_reduce &src) const { + bool equal = true; + for (int i = 0; i < N; i++) equal = equal && (data[i] == src.data[i]); + return !equal; + } + KOKKOS_INLINE_FUNCTION + explicit operator double() const { + double lsum = 0.0; + for (int i = 0; i < N; i++) lsum += data[i]; + return lsum; + } +}; +} // namespace Test + +namespace Kokkos { +template <> +struct reduction_identity<Test::my_complex> { + using t_red_ident = reduction_identity<double>; + KOKKOS_FORCEINLINE_FUNCTION static Test::my_complex sum() { + return Test::my_complex(t_red_ident::sum()); + } + KOKKOS_FORCEINLINE_FUNCTION static Test::my_complex prod() { + return Test::my_complex(t_red_ident::prod()); + } +}; + +template <class scalar_t, int N> +struct reduction_identity<Test::array_reduce<scalar_t, N>> { + using t_red_ident = reduction_identity<scalar_t>; + KOKKOS_FORCEINLINE_FUNCTION static Test::array_reduce<scalar_t, N> sum() { + return Test::array_reduce<scalar_t, N>(t_red_ident::sum()); + } + KOKKOS_FORCEINLINE_FUNCTION static Test::array_reduce<scalar_t, N> prod() { + return Test::array_reduce<scalar_t, N>(t_red_ident::prod()); + } +}; +} // namespace Kokkos +#endif // TESTNONTRIVIALSCALARTYPES_HPP_ diff --git a/packages/kokkos/core/unit_test/TestNumericTraits.hpp b/packages/kokkos/core/unit_test/TestNumericTraits.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fe01b83834f26eddc15e71360d77e85452ef0238 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestNumericTraits.hpp @@ -0,0 +1,336 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <type_traits> +#include "Kokkos_NumericTraits.hpp" +#include "Kokkos_ExecPolicy.hpp" + +struct extrema { +#define DEFINE_EXTREMA(T, m, M) \ + KOKKOS_FUNCTION static T min(T) { return m; } \ + KOKKOS_FUNCTION static T max(T) { return M; } + + DEFINE_EXTREMA(char, CHAR_MIN, CHAR_MAX); + DEFINE_EXTREMA(signed char, SCHAR_MIN, SCHAR_MAX); + DEFINE_EXTREMA(unsigned char, 0, UCHAR_MAX); + DEFINE_EXTREMA(short, SHRT_MIN, SHRT_MAX); + DEFINE_EXTREMA(unsigned short, 0, USHRT_MAX); + DEFINE_EXTREMA(int, INT_MIN, INT_MAX); + DEFINE_EXTREMA(unsigned, 0U, UINT_MAX); + DEFINE_EXTREMA(long, LONG_MIN, LONG_MAX); + DEFINE_EXTREMA(unsigned long, 0UL, ULONG_MAX); + DEFINE_EXTREMA(long long, LLONG_MIN, LLONG_MAX); + DEFINE_EXTREMA(unsigned long long, 0ULL, ULLONG_MAX); + + DEFINE_EXTREMA(float, -FLT_MAX, FLT_MAX); + DEFINE_EXTREMA(double, -DBL_MAX, DBL_MAX); + DEFINE_EXTREMA(long double, -LDBL_MAX, LDBL_MAX); + +#undef DEFINE_EXTREMA +}; + +// clang-format off +struct Infinity { template <class T> using trait = Kokkos::Experimental::infinity<T>; }; +struct Epsilon { template <class T> using trait = Kokkos::Experimental::epsilon<T>; }; +struct FiniteMin { template <class T> using trait = Kokkos::Experimental::finite_min<T>; }; +struct FiniteMax { template <class T> using trait = Kokkos::Experimental::finite_max<T>; }; +struct RoundError { template <class T> using trait = Kokkos::Experimental::round_error<T>; }; +struct NormMin { template <class T> using trait = Kokkos::Experimental::norm_min<T>; }; +struct Digits { template <class T> using trait = Kokkos::Experimental::digits<T>; }; +struct Digits10 { template <class T> using trait = Kokkos::Experimental::digits10<T>; }; +struct MaxDigits10 { template <class T> using trait = Kokkos::Experimental::max_digits10<T>; }; +struct Radix { template <class 
T> using trait = Kokkos::Experimental::radix<T>; }; +struct MinExponent { template <class T> using trait = Kokkos::Experimental::min_exponent<T>; }; +struct MaxExponent { template <class T> using trait = Kokkos::Experimental::max_exponent<T>; }; +struct MinExponent10 { template <class T> using trait = Kokkos::Experimental::min_exponent10<T>; }; +struct MaxExponent10 { template <class T> using trait = Kokkos::Experimental::max_exponent10<T>; }; +// clang-format on + +template <class T> +KOKKOS_FUNCTION T* take_address_of(T& arg) { + return &arg; +} + +template <class T> +KOKKOS_FUNCTION void take_by_value(T) {} + +template <class Space, class T, class Tag> +struct TestNumericTraits { + template <class U> + using trait = typename Tag::template trait<U>; + + Kokkos::View<T, Space> compare; + TestNumericTraits() { + compare = Kokkos::View<T, Space>("C"); + run(); + } + + void run() const { + int errors = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy<Space, Tag>(0, 1), *this, + errors); + ASSERT_EQ(errors, 0); + (void)take_address_of(trait<T>::value); // use on host + } + + KOKKOS_FUNCTION void operator()(Infinity, int, int& e) const { + using Kokkos::Experimental::infinity; + auto const inf = infinity<T>::value; + auto const zero = T(0); + e += (int)!(inf + inf == inf); + e += (int)!(inf != zero); + use_on_device(); + } + + KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const { + using Kokkos::Experimental::epsilon; + auto const eps = epsilon<T>::value; + auto const one = T(1); + // Avoid higher precision intermediate representation + compare() = one + eps; + e += (int)!(compare() != one); + compare() = one + eps / 2; + e += (int)!(compare() == one); + use_on_device(); + } + + KOKKOS_FUNCTION void operator()(FiniteMin, int, int& e) const { + using Kokkos::Experimental::finite_max; + using Kokkos::Experimental::finite_min; + auto const min = finite_min<T>::value; + auto const max = finite_max<T>::value; + e += (int)!(min == extrema::min(T{})); + e += 
(int)!(max == extrema::max(T{})); + use_on_device(); + } + + // clang-format off + KOKKOS_FUNCTION void operator()(FiniteMax, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(RoundError, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(NormMin, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(Digits, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(Digits10, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(MaxDigits10, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(Radix, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(MinExponent, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(MaxExponent, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(MinExponent10, int, int&) const { use_on_device(); } + KOKKOS_FUNCTION void operator()(MaxExponent10, int, int&) const { use_on_device(); } + // clang-format on + + KOKKOS_FUNCTION void use_on_device() const { +#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_OPENMPTARGET) + take_by_value(trait<T>::value); +#else + (void)take_address_of(trait<T>::value); +#endif + } +}; + +#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_ENABLE_SYCL) || \ + defined(KOKKOS_ENABLE_OPENMPTARGET) +template <class Tag> +struct TestNumericTraits< +#if defined(KOKKOS_ENABLE_CUDA) + Kokkos::Cuda, +#elif defined(KOKKOS_ENABLE_SYCL) + Kokkos::Experimental::SYCL, +#else + Kokkos::Experimental::OpenMPTarget, +#endif + long double, Tag> { + template <class T> + using trait = typename Tag::template trait<T>; + TestNumericTraits() { + (void)take_address_of(trait<long double>::value); + // Do nothing on the device. + // According to the doc + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-variables + // the traits member constant value cannot be directly used in device code. 
+ } +}; +#endif + +TEST(TEST_CATEGORY, numeric_traits_infinity) { + TestNumericTraits<TEST_EXECSPACE, float, Infinity>(); + TestNumericTraits<TEST_EXECSPACE, double, Infinity>(); + TestNumericTraits<TEST_EXECSPACE, long double, Infinity>(); +} + +TEST(TEST_CATEGORY, numeric_traits_epsilon) { + TestNumericTraits<TEST_EXECSPACE, float, Epsilon>(); + TestNumericTraits<TEST_EXECSPACE, double, Epsilon>(); +#ifndef KOKKOS_COMPILER_IBM // fails with XL 16.1.1 + TestNumericTraits<TEST_EXECSPACE, long double, Epsilon>(); +#endif +} + +TEST(TEST_CATEGORY, numeric_traits_round_error) { + TestNumericTraits<TEST_EXECSPACE, float, RoundError>(); + TestNumericTraits<TEST_EXECSPACE, double, RoundError>(); + TestNumericTraits<TEST_EXECSPACE, long double, RoundError>(); +} + +TEST(TEST_CATEGORY, numeric_traits_norm_min) { + TestNumericTraits<TEST_EXECSPACE, float, NormMin>(); + TestNumericTraits<TEST_EXECSPACE, double, NormMin>(); + TestNumericTraits<TEST_EXECSPACE, long double, NormMin>(); +} + +TEST(TEST_CATEGORY, numeric_traits_finite_min_max) { + TestNumericTraits<TEST_EXECSPACE, char, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, char, FiniteMax>(); + TestNumericTraits<TEST_EXECSPACE, signed char, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, signed char, FiniteMax>(); + TestNumericTraits<TEST_EXECSPACE, unsigned char, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, unsigned char, FiniteMax>(); + + TestNumericTraits<TEST_EXECSPACE, short, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, short, FiniteMax>(); + TestNumericTraits<TEST_EXECSPACE, unsigned short, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, unsigned short, FiniteMax>(); + + TestNumericTraits<TEST_EXECSPACE, int, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, int, FiniteMax>(); + TestNumericTraits<TEST_EXECSPACE, unsigned int, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, unsigned int, FiniteMax>(); + + TestNumericTraits<TEST_EXECSPACE, long, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, 
long, FiniteMax>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long, FiniteMax>(); + + TestNumericTraits<TEST_EXECSPACE, long long, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, long long, FiniteMax>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long long, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long long, FiniteMax>(); + + TestNumericTraits<TEST_EXECSPACE, float, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, float, FiniteMax>(); + TestNumericTraits<TEST_EXECSPACE, double, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, double, FiniteMax>(); + TestNumericTraits<TEST_EXECSPACE, long double, FiniteMin>(); + TestNumericTraits<TEST_EXECSPACE, long double, FiniteMax>(); +} + +TEST(TEST_CATEGORY, numeric_traits_digits) { + TestNumericTraits<TEST_EXECSPACE, bool, Digits>(); + TestNumericTraits<TEST_EXECSPACE, char, Digits>(); + TestNumericTraits<TEST_EXECSPACE, signed char, Digits>(); + TestNumericTraits<TEST_EXECSPACE, unsigned char, Digits>(); + TestNumericTraits<TEST_EXECSPACE, short, Digits>(); + TestNumericTraits<TEST_EXECSPACE, unsigned short, Digits>(); + TestNumericTraits<TEST_EXECSPACE, int, Digits>(); + TestNumericTraits<TEST_EXECSPACE, unsigned int, Digits>(); + TestNumericTraits<TEST_EXECSPACE, long int, Digits>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long int, Digits>(); + TestNumericTraits<TEST_EXECSPACE, long long int, Digits>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Digits>(); + TestNumericTraits<TEST_EXECSPACE, float, Digits>(); + TestNumericTraits<TEST_EXECSPACE, double, Digits>(); + TestNumericTraits<TEST_EXECSPACE, long double, Digits>(); +} + +TEST(TEST_CATEGORY, numeric_traits_digits10) { + TestNumericTraits<TEST_EXECSPACE, bool, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, char, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, signed char, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, unsigned char, Digits10>(); + 
TestNumericTraits<TEST_EXECSPACE, short, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, unsigned short, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, int, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, unsigned int, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, long int, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long int, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, long long int, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, float, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, double, Digits10>(); + TestNumericTraits<TEST_EXECSPACE, long double, Digits10>(); +} + +TEST(TEST_CATEGORY, numeric_traits_max_digits10) { + TestNumericTraits<TEST_EXECSPACE, float, MaxDigits10>(); + TestNumericTraits<TEST_EXECSPACE, double, MaxDigits10>(); + TestNumericTraits<TEST_EXECSPACE, long double, MaxDigits10>(); +} + +TEST(TEST_CATEGORY, numeric_traits_radix) { + TestNumericTraits<TEST_EXECSPACE, bool, Radix>(); + TestNumericTraits<TEST_EXECSPACE, char, Radix>(); + TestNumericTraits<TEST_EXECSPACE, signed char, Radix>(); + TestNumericTraits<TEST_EXECSPACE, unsigned char, Radix>(); + TestNumericTraits<TEST_EXECSPACE, short, Radix>(); + TestNumericTraits<TEST_EXECSPACE, unsigned short, Radix>(); + TestNumericTraits<TEST_EXECSPACE, int, Radix>(); + TestNumericTraits<TEST_EXECSPACE, unsigned int, Radix>(); + TestNumericTraits<TEST_EXECSPACE, long int, Radix>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long int, Radix>(); + TestNumericTraits<TEST_EXECSPACE, long long int, Radix>(); + TestNumericTraits<TEST_EXECSPACE, unsigned long long int, Radix>(); + TestNumericTraits<TEST_EXECSPACE, float, Radix>(); + TestNumericTraits<TEST_EXECSPACE, double, Radix>(); + TestNumericTraits<TEST_EXECSPACE, long double, Radix>(); +} + +TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { + TestNumericTraits<TEST_EXECSPACE, float, MinExponent>(); + TestNumericTraits<TEST_EXECSPACE, float, 
MaxExponent>(); + TestNumericTraits<TEST_EXECSPACE, double, MinExponent>(); + TestNumericTraits<TEST_EXECSPACE, double, MaxExponent>(); + TestNumericTraits<TEST_EXECSPACE, long double, MinExponent>(); + TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent>(); +} + +TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) { + TestNumericTraits<TEST_EXECSPACE, float, MinExponent10>(); + TestNumericTraits<TEST_EXECSPACE, float, MaxExponent10>(); + TestNumericTraits<TEST_EXECSPACE, double, MinExponent10>(); + TestNumericTraits<TEST_EXECSPACE, double, MaxExponent10>(); + TestNumericTraits<TEST_EXECSPACE, long double, MinExponent10>(); + TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent10>(); +} diff --git a/packages/kokkos/core/unit_test/TestOther.hpp b/packages/kokkos/core/unit_test/TestOther.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2d298d30784160ec4b65652864003985e2d488c4 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestOther.hpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_OTHER_HPP +#define KOKKOS_TEST_OTHER_HPP +#include <TestTemplateMetaFunctions.hpp> +#include <TestAggregate.hpp> +#include <TestMemoryPool.hpp> +#include <TestCXX11.hpp> + +#include <TestViewCtorPropEmbeddedDim.hpp> +#include <TestViewLayoutTiled.hpp> +#endif diff --git a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0017c690e75c6e1bde1808e87203d8dbbea754cc --- /dev/null +++ b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp @@ -0,0 +1,916 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> +#include <type_traits> + +namespace Test { +struct SomeTag {}; + +template <class ExecutionSpace> +class TestRangePolicyConstruction { + public: + TestRangePolicyConstruction() { + test_compile_time_parameters(); + test_runtime_parameters(); + } + + private: + void test_compile_time_parameters() { + { + using policy_t = Kokkos::RangePolicy<>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, + typename execution_space::size_type>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Static>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = Kokkos::RangePolicy<ExecutionSpace>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, + typename execution_space::size_type>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Static>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = Kokkos::RangePolicy<ExecutionSpace, + Kokkos::Schedule<Kokkos::Dynamic>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; 
+ using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, + typename execution_space::size_type>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = + Kokkos::RangePolicy<ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = + Kokkos::RangePolicy<Kokkos::IndexType<long>, ExecutionSpace, + Kokkos::Schedule<Kokkos::Dynamic>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = + Kokkos::RangePolicy<ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>, SomeTag>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + 
ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = + Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>, ExecutionSpace, + Kokkos::IndexType<long>, SomeTag>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = + Kokkos::RangePolicy<SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>, ExecutionSpace>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, + typename 
execution_space::size_type>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = Kokkos::RangePolicy<Kokkos::IndexType<long>, + Kokkos::Schedule<Kokkos::Dynamic>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>, SomeTag>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + 
Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>, SomeTag>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = + Kokkos::RangePolicy<SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + } + void test_runtime_parameters() { + using policy_t = Kokkos::RangePolicy<>; + { + policy_t p(5, 15); + ASSERT_TRUE((p.begin() == 5)); + ASSERT_TRUE((p.end() == 15)); + } + { + policy_t p(Kokkos::DefaultExecutionSpace(), 5, 15); + ASSERT_TRUE((p.begin() == 5)); + ASSERT_TRUE((p.end() == 15)); + } + { + policy_t p(5, 15, Kokkos::ChunkSize(10)); + ASSERT_TRUE((p.begin() == 5)); + ASSERT_TRUE((p.end() == 15)); + ASSERT_TRUE((p.chunk_size() == 10)); + } + { + policy_t p(Kokkos::DefaultExecutionSpace(), 5, 15, Kokkos::ChunkSize(10)); + ASSERT_TRUE((p.begin() == 5)); + ASSERT_TRUE((p.end() == 15)); + 
ASSERT_TRUE((p.chunk_size() == 10)); + } + { + policy_t p; + ASSERT_TRUE((p.begin() == 0)); + ASSERT_TRUE((p.end() == 0)); + p = policy_t(5, 15, Kokkos::ChunkSize(10)); + ASSERT_TRUE((p.begin() == 5)); + ASSERT_TRUE((p.end() == 15)); + ASSERT_TRUE((p.chunk_size() == 10)); + } + } +}; + +template <class ExecutionSpace> +class TestTeamPolicyConstruction { + public: + TestTeamPolicyConstruction() { + test_compile_time_parameters(); + test_run_time_parameters(); + } + + private: + void test_compile_time_parameters() { + { + using policy_t = Kokkos::TeamPolicy<>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, + typename execution_space::size_type>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Static>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, + typename execution_space::size_type>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Static>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = + Kokkos::TeamPolicy<ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename 
policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, + typename execution_space::size_type>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = + Kokkos::TeamPolicy<ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = + Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace, + Kokkos::Schedule<Kokkos::Dynamic>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = + Kokkos::TeamPolicy<ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>, SomeTag>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, 
ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = + Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>, ExecutionSpace, + Kokkos::IndexType<long>, SomeTag>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = + Kokkos::TeamPolicy<SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>, ExecutionSpace>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE((std::is_same<execution_space, ExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, + typename execution_space::size_type>::value)); + 
ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = Kokkos::TeamPolicy<Kokkos::IndexType<long>, + Kokkos::Schedule<Kokkos::Dynamic>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, void>::value)); + } + + { + using policy_t = Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>, SomeTag>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + 
ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>, SomeTag>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + + { + using policy_t = + Kokkos::TeamPolicy<SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>>; + using execution_space = typename policy_t::execution_space; + using index_type = typename policy_t::index_type; + using schedule_type = typename policy_t::schedule_type; + using work_tag = typename policy_t::work_tag; + + ASSERT_TRUE(( + std::is_same<execution_space, Kokkos::DefaultExecutionSpace>::value)); + ASSERT_TRUE((std::is_same<index_type, long>::value)); + ASSERT_TRUE((std::is_same<schedule_type, + Kokkos::Schedule<Kokkos::Dynamic>>::value)); + ASSERT_TRUE((std::is_same<work_tag, SomeTag>::value)); + } + } + + template <class policy_t> + void test_run_time_parameters_type() { + int league_size = 131; + int team_size = 4 < policy_t::execution_space::concurrency() + ? 
4 + : policy_t::execution_space::concurrency(); +#ifdef KOKKOS_ENABLE_HPX + team_size = 1; +#endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same<typename policy_t::execution_space, + Kokkos::Experimental::OpenMPTarget>::value) + team_size = 32; +#endif + int chunk_size = 4; + int per_team_scratch = 1024; + int per_thread_scratch = 16; + int scratch_size = per_team_scratch + per_thread_scratch * team_size; + + policy_t p1(league_size, team_size); + ASSERT_EQ(p1.league_size(), league_size); + ASSERT_EQ(p1.team_size(), team_size); +// FIXME_SYCL implement chunk_size +#ifndef KOKKOS_ENABLE_SYCL + ASSERT_TRUE(p1.chunk_size() > 0); +#endif + ASSERT_EQ(p1.scratch_size(0), 0); + + policy_t p2 = p1.set_chunk_size(chunk_size); + ASSERT_EQ(p1.league_size(), league_size); + ASSERT_EQ(p1.team_size(), team_size); + ASSERT_EQ(p1.chunk_size(), chunk_size); + ASSERT_EQ(p1.scratch_size(0), 0); + + ASSERT_EQ(p2.league_size(), league_size); + ASSERT_EQ(p2.team_size(), team_size); + ASSERT_EQ(p2.chunk_size(), chunk_size); + ASSERT_EQ(p2.scratch_size(0), 0); + + policy_t p3 = p2.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch)); + ASSERT_EQ(p2.league_size(), league_size); + ASSERT_EQ(p2.team_size(), team_size); + ASSERT_EQ(p2.chunk_size(), chunk_size); + ASSERT_EQ(p2.scratch_size(0), per_team_scratch); + ASSERT_EQ(p3.league_size(), league_size); + ASSERT_EQ(p3.team_size(), team_size); + ASSERT_EQ(p3.chunk_size(), chunk_size); + ASSERT_EQ(p3.scratch_size(0), per_team_scratch); + + policy_t p4 = p2.set_scratch_size(0, Kokkos::PerThread(per_thread_scratch)); + ASSERT_EQ(p2.league_size(), league_size); + ASSERT_EQ(p2.team_size(), team_size); + ASSERT_EQ(p2.chunk_size(), chunk_size); + ASSERT_EQ(p2.scratch_size(0), scratch_size); + ASSERT_EQ(p4.league_size(), league_size); + ASSERT_EQ(p4.team_size(), team_size); + ASSERT_EQ(p4.chunk_size(), chunk_size); + ASSERT_EQ(p4.scratch_size(0), scratch_size); + + policy_t p5 = p2.set_scratch_size(0, Kokkos::PerThread(per_thread_scratch), 
+ Kokkos::PerTeam(per_team_scratch)); + ASSERT_EQ(p2.league_size(), league_size); + ASSERT_EQ(p2.team_size(), team_size); + ASSERT_EQ(p2.chunk_size(), chunk_size); + ASSERT_EQ(p2.scratch_size(0), scratch_size); + ASSERT_EQ(p5.league_size(), league_size); + ASSERT_EQ(p5.team_size(), team_size); + ASSERT_EQ(p5.chunk_size(), chunk_size); + ASSERT_EQ(p5.scratch_size(0), scratch_size); + + policy_t p6 = p2.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch), + Kokkos::PerThread(per_thread_scratch)); + ASSERT_EQ(p2.league_size(), league_size); + ASSERT_EQ(p2.team_size(), team_size); + ASSERT_EQ(p2.chunk_size(), chunk_size); + ASSERT_EQ(p2.scratch_size(0), scratch_size); + ASSERT_EQ(p6.league_size(), league_size); + ASSERT_EQ(p6.team_size(), team_size); + ASSERT_EQ(p6.chunk_size(), chunk_size); + ASSERT_EQ(p6.scratch_size(0), scratch_size); + + policy_t p7 = p3.set_scratch_size(0, Kokkos::PerTeam(per_team_scratch), + Kokkos::PerThread(per_thread_scratch)); + ASSERT_EQ(p3.league_size(), league_size); + ASSERT_EQ(p3.team_size(), team_size); + ASSERT_EQ(p3.chunk_size(), chunk_size); + ASSERT_EQ(p3.scratch_size(0), scratch_size); + ASSERT_EQ(p7.league_size(), league_size); + ASSERT_EQ(p7.team_size(), team_size); + ASSERT_EQ(p7.chunk_size(), chunk_size); + ASSERT_EQ(p7.scratch_size(0), scratch_size); + + policy_t p8; // default constructed + ASSERT_EQ(p8.league_size(), 0); + ASSERT_EQ(p8.scratch_size(0), 0); + p8 = p3; // call assignment operator + ASSERT_EQ(p3.league_size(), league_size); + ASSERT_EQ(p3.team_size(), team_size); + ASSERT_EQ(p3.chunk_size(), chunk_size); + ASSERT_EQ(p3.scratch_size(0), scratch_size); + ASSERT_EQ(p8.league_size(), league_size); + ASSERT_EQ(p8.team_size(), team_size); + ASSERT_EQ(p8.chunk_size(), chunk_size); + ASSERT_EQ(p8.scratch_size(0), scratch_size); + } + + void test_run_time_parameters() { + test_run_time_parameters_type<Kokkos::TeamPolicy<ExecutionSpace>>(); + test_run_time_parameters_type< + Kokkos::TeamPolicy<ExecutionSpace, 
Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>>>(); + test_run_time_parameters_type< + Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace, + Kokkos::Schedule<Kokkos::Dynamic>>>(); + test_run_time_parameters_type< + Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>, + Kokkos::IndexType<long>, ExecutionSpace, SomeTag>>(); + } +}; + +// semiregular is copyable and default initializable +// (regular requires equality comparable) +template <class Policy> +void check_semiregular() { + static_assert(std::is_default_constructible<Policy>::value, ""); + static_assert(std::is_copy_constructible<Policy>::value, ""); + static_assert(std::is_move_constructible<Policy>::value, ""); + static_assert(std::is_copy_assignable<Policy>::value, ""); + static_assert(std::is_move_assignable<Policy>::value, ""); + static_assert(std::is_destructible<Policy>::value, ""); +} + +TEST(TEST_CATEGORY, policy_construction) { + check_semiregular<Kokkos::RangePolicy<TEST_EXECSPACE>>(); + check_semiregular<Kokkos::TeamPolicy<TEST_EXECSPACE>>(); + check_semiregular<Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>>(); + + TestRangePolicyConstruction<TEST_EXECSPACE>(); + TestTeamPolicyConstruction<TEST_EXECSPACE>(); +} + +template <template <class...> class Policy, class... 
Args> +void check_converting_constructor_add_work_tag(Policy<Args...> const& policy) { + // Not the greatest but at least checking it compiles + struct WorkTag {}; + Policy<Args..., WorkTag> policy_with_tag = policy; + (void)policy_with_tag; +} + +TEST(TEST_CATEGORY, policy_converting_constructor_from_other_policy) { + check_converting_constructor_add_work_tag( + Kokkos::RangePolicy<TEST_EXECSPACE>{}); + check_converting_constructor_add_work_tag( + Kokkos::TeamPolicy<TEST_EXECSPACE>{}); + check_converting_constructor_add_work_tag( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{}); +} + +#ifndef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET +TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) { + using Policy = Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>, + Kokkos::IndexType<unsigned>>; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH( + { + (void)Policy({-1, 0}, {2, 3}); + }, + "unsafe narrowing conversion"); +} +#endif + +template <class Policy> +void test_prefer_desired_occupancy(Policy const& policy) { + static_assert(!Policy::experimental_contains_desired_occupancy, ""); + + // MaximizeOccupancy -> MaximizeOccupancy + auto const policy_still_no_occ = Kokkos::Experimental::prefer( + policy, Kokkos::Experimental::MaximizeOccupancy{}); + static_assert( + !decltype(policy_still_no_occ)::experimental_contains_desired_occupancy, + ""); + + // MaximizeOccupancy -> DesiredOccupancy + auto const policy_with_occ = Kokkos::Experimental::prefer( + policy, Kokkos::Experimental::DesiredOccupancy{33}); + static_assert( + decltype(policy_with_occ)::experimental_contains_desired_occupancy, ""); + EXPECT_EQ(policy_with_occ.impl_get_desired_occupancy().value(), 33); + + // DesiredOccupancy -> DesiredOccupancy + auto const policy_change_occ = Kokkos::Experimental::prefer( + policy_with_occ, Kokkos::Experimental::DesiredOccupancy{24}); + static_assert( + 
decltype(policy_change_occ)::experimental_contains_desired_occupancy, ""); + EXPECT_EQ(policy_change_occ.impl_get_desired_occupancy().value(), 24); + + // DesiredOccupancy -> MaximizeOccupancy + auto const policy_drop_occ = Kokkos::Experimental::prefer( + policy_with_occ, Kokkos::Experimental::MaximizeOccupancy{}); + static_assert( + !decltype(policy_drop_occ)::experimental_contains_desired_occupancy, ""); +} + +template <class... Args> +struct DummyPolicy : Kokkos::Impl::PolicyTraits<Args...> { + using execution_policy = DummyPolicy; + + using base_t = Kokkos::Impl::PolicyTraits<Args...>; + using base_t::base_t; +}; + +TEST(TEST_CATEGORY, desired_occupancy_prefer) { + test_prefer_desired_occupancy(DummyPolicy<TEST_EXECSPACE>{}); + test_prefer_desired_occupancy(Kokkos::RangePolicy<TEST_EXECSPACE>{}); + test_prefer_desired_occupancy( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{}); + test_prefer_desired_occupancy(Kokkos::TeamPolicy<TEST_EXECSPACE>{}); +} + +// For a more informative static assertion: +template <size_t> +struct static_assert_dummy_policy_must_be_size_one; +template <> +struct static_assert_dummy_policy_must_be_size_one<1> {}; +template <size_t, size_t> +struct static_assert_dummy_policy_must_be_size_of_desired_occupancy; +template <> +struct static_assert_dummy_policy_must_be_size_of_desired_occupancy< + sizeof(Kokkos::Experimental::DesiredOccupancy), + sizeof(Kokkos::Experimental::DesiredOccupancy)> {}; + +TEST(TEST_CATEGORY, desired_occupancy_empty_base_optimization) { + DummyPolicy<TEST_EXECSPACE> const policy{}; + static_assert(sizeof(decltype(policy)) == 1, ""); + static_assert_dummy_policy_must_be_size_one<sizeof(decltype(policy))> + _assert1{}; + (void)_assert1; // avoid unused variable warning + + using Kokkos::Experimental::DesiredOccupancy; + auto policy_with_occ = + Kokkos::Experimental::prefer(policy, DesiredOccupancy{50}); + static_assert(sizeof(decltype(policy_with_occ)) == sizeof(DesiredOccupancy), + ""); + 
static_assert_dummy_policy_must_be_size_of_desired_occupancy< + sizeof(decltype(policy_with_occ)), sizeof(DesiredOccupancy)> + _assert2{}; + (void)_assert2; // avoid unused variable warning +} + +template <typename Policy> +void test_desired_occupancy_converting_constructors(Policy const& policy) { + auto policy_with_occ = Kokkos::Experimental::prefer( + policy, Kokkos::Experimental::DesiredOccupancy{50}); + EXPECT_EQ(policy_with_occ.impl_get_desired_occupancy().value(), 50); + + auto policy_with_hint = Kokkos::Experimental::require( + policy_with_occ, Kokkos::Experimental::WorkItemProperty::HintLightWeight); + EXPECT_EQ(policy_with_hint.impl_get_desired_occupancy().value(), 50); +} + +TEST(TEST_CATEGORY, desired_occupancy_converting_constructors) { + test_desired_occupancy_converting_constructors( + Kokkos::RangePolicy<TEST_EXECSPACE>{}); + test_desired_occupancy_converting_constructors( + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{}); + test_desired_occupancy_converting_constructors( + Kokkos::TeamPolicy<TEST_EXECSPACE>{}); +} + +template <class T> +void more_md_range_policy_construction_test() { + (void)Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{ + Kokkos::Array<T, 2>{}, Kokkos::Array<T, 2>{}}; + + (void)Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{{{T(0), T(0)}}, + {{T(2), T(2)}}}; + + (void)Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{{T(0), T(0)}, + {T(2), T(2)}}; +} + +TEST(TEST_CATEGORY, md_range_policy_construction_from_arrays) { + { + // Check that construction from Kokkos::Array of long compiles for backwards + // compability. 
This was broken in + // https://github.com/kokkos/kokkos/pull/3527/commits/88ea8eec6567c84739d77bdd25fdbc647fae28bb#r512323639 + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>> p1( + Kokkos::Array<long, 2>{{0, 1}}, Kokkos::Array<long, 2>{{2, 3}}); + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>> p2( + Kokkos::Array<long, 2>{{0, 1}}, Kokkos::Array<long, 2>{{2, 3}}); + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>> p3( + Kokkos::Array<long, 2>{{0, 1}}, Kokkos::Array<long, 2>{{2, 3}}, + Kokkos::Array<long, 1>{{4}}); + } + { + // Check that construction from Kokkos::Array of the specified index type + // works. + using index_type = unsigned long long; + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>, + Kokkos::IndexType<index_type>> + p1(Kokkos::Array<index_type, 2>{{0, 1}}, + Kokkos::Array<index_type, 2>{{2, 3}}); + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>, + Kokkos::IndexType<index_type>> + p2(Kokkos::Array<index_type, 2>{{0, 1}}, + Kokkos::Array<index_type, 2>{{2, 3}}); + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>, + Kokkos::IndexType<index_type>> + p3(Kokkos::Array<index_type, 2>{{0, 1}}, + Kokkos::Array<index_type, 2>{{2, 3}}, + Kokkos::Array<index_type, 1>{{4}}); + } + { + // Check that construction from double-braced initliazer list + // works. 
+ using index_type = unsigned long long; + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>> p1({{0, 1}}, + {{2, 3}}); + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>, + Kokkos::IndexType<index_type>> + p2({{0, 1}}, {{2, 3}}); + } + + more_md_range_policy_construction_test<char>(); + more_md_range_policy_construction_test<int>(); + more_md_range_policy_construction_test<unsigned long>(); + more_md_range_policy_construction_test<std::int64_t>(); +} + +template <class WorkTag, class Policy> +constexpr auto set_worktag(Policy const& policy) { + static_assert(Kokkos::is_execution_policy<Policy>::value, ""); + using PolicyWithWorkTag = + Kokkos::Impl::WorkTagTrait::policy_with_trait<Policy, WorkTag>; + return PolicyWithWorkTag{policy}; +} + +TEST(TEST_CATEGORY, policy_set_worktag) { + struct SomeWorkTag {}; + struct OtherWorkTag {}; + + Kokkos::RangePolicy<> p1; + static_assert(std::is_void<decltype(p1)::work_tag>::value, ""); + + auto p2 = set_worktag<SomeWorkTag>(p1); + static_assert(std::is_same<decltype(p2)::work_tag, SomeWorkTag>::value, ""); + + auto p3 = set_worktag<OtherWorkTag>(p2); + static_assert(std::is_same<decltype(p3)::work_tag, OtherWorkTag>::value, ""); + + // NOTE this does not currently compile + // auto p4 = set_worktag<void>(p3); + // static_assert(std::is_void<decltype(p4)::work_tag>::value, ""); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestRange.hpp b/packages/kokkos/core/unit_test/TestRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a6a6220f2dceea470414fb0d712796689f6d151c --- /dev/null +++ b/packages/kokkos/core/unit_test/TestRange.hpp @@ -0,0 +1,486 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace { + +template <class ExecSpace, class ScheduleType> +struct TestRange { + using value_type = int; ///< alias required for the parallel_reduce + + using view_type = Kokkos::View<value_type *, ExecSpace>; + + view_type m_flags; + view_type result_view; + + struct VerifyInitTag {}; + struct ResetTag {}; + struct VerifyResetTag {}; + struct OffsetTag {}; + struct VerifyOffsetTag {}; + + int N; +#ifndef KOKKOS_WORKAROUND_OPENMPTARGET_GCC + static const int offset = 13; +#else + int offset; +#endif + TestRange(const size_t N_) + : m_flags(Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"), N_), + result_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "results"), + N_), + N(N_) { +#ifdef KOKKOS_WORKAROUND_OPENMPTARGET_GCC + offset = 13; +#endif + } + + void test_for() { + typename view_type::HostMirror host_flags = + Kokkos::create_mirror_view(m_flags); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), + *this); + + { + using ThisType = TestRange<ExecSpace, ScheduleType>; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName<ThisType, void> pcn(label); + ASSERT_EQ(pcn.get(), label); + std::string empty_label(""); + Kokkos::Impl::ParallelConstructName<ThisType, void> empty_pcn( + empty_label); + ASSERT_EQ(empty_pcn.get(), typeid(ThisType).name()); + } + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace, ScheduleType, VerifyInitTag>(0, N), + *this); + + { + using ThisType = TestRange<ExecSpace, ScheduleType>; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName<ThisType, VerifyInitTag> pcn(label); + ASSERT_EQ(pcn.get(), label); + std::string empty_label(""); + Kokkos::Impl::ParallelConstructName<ThisType, VerifyInitTag> empty_pcn( + empty_label); + ASSERT_EQ(empty_pcn.get(), 
std::string(typeid(ThisType).name()) + "/" + + typeid(VerifyInitTag).name()); + } + + Kokkos::deep_copy(host_flags, m_flags); + + int error_count = 0; + for (int i = 0; i < N; ++i) { + if (int(i) != host_flags(i)) ++error_count; + } + ASSERT_EQ(error_count, int(0)); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace, ScheduleType, ResetTag>(0, N), *this); + Kokkos::parallel_for( + std::string("TestKernelFor"), + Kokkos::RangePolicy<ExecSpace, ScheduleType, VerifyResetTag>(0, N), + *this); + + Kokkos::deep_copy(host_flags, m_flags); + + error_count = 0; + for (int i = 0; i < N; ++i) { + if (int(2 * i) != host_flags(i)) ++error_count; + } + ASSERT_EQ(error_count, int(0)); + + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(offset, + N + offset), + *this); + Kokkos::parallel_for( + std::string("TestKernelFor"), + Kokkos::RangePolicy<ExecSpace, ScheduleType, VerifyOffsetTag>(0, N), + *this); + + Kokkos::deep_copy(host_flags, m_flags); + + error_count = 0; + for (int i = 0; i < N; ++i) { + if (i + offset != host_flags(i)) ++error_count; + } + ASSERT_EQ(error_count, int(0)); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { m_flags(i) = i; } + + KOKKOS_INLINE_FUNCTION + void operator()(const VerifyInitTag &, const int i) const { + if (i != m_flags(i)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestRange::test_for_error at %d != %d\n", + i, m_flags(i)); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ResetTag &, const int i) const { + m_flags(i) = 2 * m_flags(i); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const VerifyResetTag &, const int i) const { + if (2 * i != m_flags(i)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestRange::test_for_error at %d != %d\n", + i, m_flags(i)); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const OffsetTag &, const int i) const { + m_flags(i - offset) = i; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const VerifyOffsetTag &, const int i) const { + if (i + offset != 
m_flags(i)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestRange::test_for_error at %d != %d\n", + i + offset, m_flags(i)); + } + } + + //---------------------------------------- + + void test_reduce() { + value_type total = 0; + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), + *this); + + Kokkos::parallel_reduce("TestKernelReduce", + Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), + *this, total); + // sum( 0 .. N-1 ) + ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total)); + + Kokkos::parallel_reduce( + "TestKernelReduce_long", + Kokkos::RangePolicy<ExecSpace, ScheduleType, long>(0, N), *this, total); + // sum( 0 .. N-1 ) + ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total)); + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(offset, + N + offset), + *this, total); + // sum( 1 .. N ) + ASSERT_EQ(size_t((N) * (N + 1) / 2), size_t(total)); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &update) const { + update += m_flags(i); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const OffsetTag &, const int i, value_type &update) const { + update += 1 + m_flags(i - offset); + } + + //---------------------------------------- + + void test_scan() { + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), + *this); + + auto check_scan_results = [&]() { + auto const host_mirror = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result_view); + for (int i = 0; i < N; ++i) { + if (((i + 1) * i) / 2 != host_mirror(i)) { + std::cout << "Error at " << i << std::endl; + EXPECT_EQ(size_t(((i + 1) * i) / 2), size_t(host_mirror(i))); + } + } + }; + + Kokkos::parallel_scan( + "TestKernelScan", + Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(0, N), *this); + + check_scan_results(); + + value_type total = 0; + Kokkos::parallel_scan( + "TestKernelScanWithTotal", + Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(0, N), *this, + total); + + 
check_scan_results(); + + ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total)); // sum( 0 .. N-1 ) + } + + KOKKOS_INLINE_FUNCTION + void operator()(const OffsetTag &, const int i, value_type &update, + bool final) const { + update += m_flags(i); + + if (final) { + if (update != (i * (i + 1)) / 2) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "TestRange::test_scan error (%d,%d) : %d != %d\n", i, m_flags(i), + (i * (i + 1)) / 2, update); + } + result_view(i) = update; + } + } + + void test_dynamic_policy() { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + auto const N_no_implicit_capture = N; + using policy_t = + Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >; + + { + Kokkos::View<size_t *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > + count("Count", ExecSpace::concurrency()); + Kokkos::View<int *, ExecSpace> a("A", N); + + Kokkos::parallel_for( + policy_t(0, N), KOKKOS_LAMBDA(const int &i) { + for (int k = 0; k < (i < N_no_implicit_capture / 2 ? 1 : 10000); + k++) { + a(i)++; + } + count(ExecSpace::impl_hardware_thread_id())++; + }); + + int error = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int &i, value_type &lsum) { + lsum += (a(i) != (i < N_no_implicit_capture / 2 ? 
1 : 10000)); + }, + error); + ASSERT_EQ(error, 0); + + if ((ExecSpace::concurrency() > (int)1) && + (N > static_cast<int>(4 * ExecSpace::concurrency()))) { + size_t min = N; + size_t max = 0; + for (int t = 0; t < ExecSpace::concurrency(); t++) { + if (count(t) < min) min = count(t); + if (count(t) > max) max = count(t); + } + ASSERT_TRUE(min < max); + + // if ( ExecSpace::concurrency() > 2 ) { + // ASSERT_TRUE( 2 * min < max ); + //} + } + } + + { + Kokkos::View<size_t *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > + count("Count", ExecSpace::concurrency()); + Kokkos::View<int *, ExecSpace> a("A", N); + + value_type sum = 0; + Kokkos::parallel_reduce( + policy_t(0, N), + KOKKOS_LAMBDA(const int &i, value_type &lsum) { + for (int k = 0; k < (i < N_no_implicit_capture / 2 ? 1 : 10000); + k++) { + a(i)++; + } + count(ExecSpace::impl_hardware_thread_id())++; + lsum++; + }, + sum); + ASSERT_EQ(sum, N); + + int error = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int &i, value_type &lsum) { + lsum += (a(i) != (i < N_no_implicit_capture / 2 ? 
1 : 10000)); + }, + error); + ASSERT_EQ(error, 0); + + if ((ExecSpace::concurrency() > (int)1) && + (N > static_cast<int>(4 * ExecSpace::concurrency()))) { + size_t min = N; + size_t max = 0; + for (int t = 0; t < ExecSpace::concurrency(); t++) { + if (count(t) < min) min = count(t); + if (count(t) > max) max = count(t); + } + ASSERT_TRUE(min < max); + + // if ( ExecSpace::concurrency() > 2 ) { + // ASSERT_TRUE( 2 * min < max ); + //} + } + } +#endif + } +}; + +} // namespace + +TEST(TEST_CATEGORY, range_for) { + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(0); + f.test_for(); + } + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(0); + f.test_for(); + } + + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(2); + f.test_for(); + } + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(3); + f.test_for(); + } + + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(1000); + f.test_for(); + } + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(1001); + f.test_for(); + } +} + +TEST(TEST_CATEGORY, range_reduce) { + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(0); + f.test_reduce(); + } + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(0); + f.test_reduce(); + } + + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(2); + f.test_reduce(); + } + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(3); + f.test_reduce(); + } + + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(1000); + f.test_reduce(); + } + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(1001); + f.test_reduce(); + } +} + +#ifndef KOKKOS_ENABLE_OPENMPTARGET +TEST(TEST_CATEGORY, range_scan) { + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(0); + f.test_scan(); + } + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(0); + f.test_scan(); + } +#if 
!defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(0); + f.test_dynamic_policy(); + } +#endif + + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(2); + f.test_scan(); + } + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(3); + f.test_scan(); + } +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(3); + f.test_dynamic_policy(); + } +#endif + + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> > f(1000); + f.test_scan(); + } + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(1001); + f.test_scan(); + } +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) + { + TestRange<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> > f(1001); + f.test_dynamic_policy(); + } +#endif +} +#endif +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestRangePolicy.hpp b/packages/kokkos/core/unit_test/TestRangePolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3f40e24d1c8c912a5fb600aa3569f5a27515565b --- /dev/null +++ b/packages/kokkos/core/unit_test/TestRangePolicy.hpp @@ -0,0 +1,48 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_RANGEPOLICY_HPP +#define KOKKOS_TEST_RANGEPOLICY_HPP +#include <TestRange.hpp> +#endif diff --git a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp new file mode 100644 index 0000000000000000000000000000000000000000..693f19613db6beb8c1c2a551574808de26633726 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp @@ -0,0 +1,514 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <Kokkos_Core.hpp> + +// This file is largely duplicating TestRange.hpp but it applies +// Kokkos::Experimental require at every place where a parallel +// operation is executed. + +namespace Test { + +namespace { + +template <class ExecSpace, class ScheduleType, class Property> +struct TestRangeRequire { + using value_type = int; ///< alias required for the parallel_reduce + + using view_type = Kokkos::View<int *, ExecSpace>; + + view_type m_flags; + + struct VerifyInitTag {}; + struct ResetTag {}; + struct VerifyResetTag {}; + struct OffsetTag {}; + struct VerifyOffsetTag {}; + + int N; + static const int offset = 13; + TestRangeRequire(const size_t N_) + : m_flags(Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"), N_), + N(N_) {} + + void test_for() { + typename view_type::HostMirror host_flags = + Kokkos::create_mirror_view(m_flags); + + Kokkos::parallel_for( + Kokkos::Experimental::require( + Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), Property()), + *this); + + { + using ThisType = TestRangeRequire<ExecSpace, ScheduleType, Property>; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName<ThisType, void> pcn(label); + ASSERT_EQ(pcn.get(), label); + std::string empty_label(""); + 
Kokkos::Impl::ParallelConstructName<ThisType, void> empty_pcn( + empty_label); + ASSERT_EQ(empty_pcn.get(), typeid(ThisType).name()); + } + + Kokkos::parallel_for( + Kokkos::Experimental::require( + Kokkos::RangePolicy<ExecSpace, ScheduleType, VerifyInitTag>(0, N), + Property()), + *this); + + { + using ThisType = TestRangeRequire<ExecSpace, ScheduleType, Property>; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName<ThisType, VerifyInitTag> pcn(label); + ASSERT_EQ(pcn.get(), label); + std::string empty_label(""); + Kokkos::Impl::ParallelConstructName<ThisType, VerifyInitTag> empty_pcn( + empty_label); + ASSERT_EQ(empty_pcn.get(), std::string(typeid(ThisType).name()) + "/" + + typeid(VerifyInitTag).name()); + } + + Kokkos::deep_copy(host_flags, m_flags); + + int error_count = 0; + for (int i = 0; i < N; ++i) { + if (int(i) != host_flags(i)) ++error_count; + } + ASSERT_EQ(error_count, int(0)); + + Kokkos::parallel_for( + Kokkos::Experimental::require( + Kokkos::RangePolicy<ExecSpace, ScheduleType, ResetTag>(0, N), + Property()), + *this); + Kokkos::parallel_for( + std::string("TestKernelFor"), + Kokkos::Experimental::require( + Kokkos::RangePolicy<ExecSpace, ScheduleType, VerifyResetTag>(0, N), + Property()), + *this); + + Kokkos::deep_copy(host_flags, m_flags); + + error_count = 0; + for (int i = 0; i < N; ++i) { + if (int(2 * i) != host_flags(i)) ++error_count; + } + ASSERT_EQ(error_count, int(0)); + + Kokkos::parallel_for( + Kokkos::Experimental::require( + Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(offset, + N + offset), + Property()), + *this); + Kokkos::parallel_for( + std::string("TestKernelFor"), + Kokkos::Experimental::require( + Kokkos::RangePolicy<ExecSpace, ScheduleType, + Kokkos::IndexType<unsigned int>, + VerifyOffsetTag>(0, N), + Property()), + *this); + + Kokkos::deep_copy(host_flags, m_flags); + + error_count = 0; + for (int i = 0; i < N; ++i) { + if (i + offset != host_flags(i)) ++error_count; + } + 
ASSERT_EQ(error_count, int(0)); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { m_flags(i) = i; } + + KOKKOS_INLINE_FUNCTION + void operator()(const VerifyInitTag &, const int i) const { + if (i != m_flags(i)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "TestRangeRequire::test_for error at %d != %d\n", i, m_flags(i)); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ResetTag &, const int i) const { + m_flags(i) = 2 * m_flags(i); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const VerifyResetTag &, const int i) const { + if (2 * i != m_flags(i)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "TestRangeRequire::test_for error at %d != %d\n", i, m_flags(i)); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const OffsetTag &, const int i) const { + m_flags(i - offset) = i; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const VerifyOffsetTag &, const int i) const { + if (i + offset != m_flags(i)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "TestRangeRequire::test_for error at %d != %d\n", i + offset, + m_flags(i)); + } + } + + //---------------------------------------- + + void test_reduce() { + int total = 0; + + Kokkos::parallel_for( + Kokkos::Experimental::require( + Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), Property()), + *this); + + Kokkos::parallel_reduce( + "TestKernelReduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), Property()), + *this, total); + // sum( 0 .. N-1 ) + ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total)); + + Kokkos::parallel_reduce( + Kokkos::Experimental::require( + Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(offset, + N + offset), + Property()), + *this, total); + // sum( 1 .. 
N ) + ASSERT_EQ(size_t((N) * (N + 1) / 2), size_t(total)); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &update) const { + update += m_flags(i); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const OffsetTag &, const int i, value_type &update) const { + update += 1 + m_flags(i - offset); + } + + //---------------------------------------- + + void test_scan() { + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, ScheduleType>(0, N), + *this); + + Kokkos::parallel_scan( + "TestKernelScan", + Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(0, N), *this); + + int total = 0; + Kokkos::parallel_scan( + "TestKernelScanWithTotal", + Kokkos::RangePolicy<ExecSpace, ScheduleType, OffsetTag>(0, N), *this, + total); + ASSERT_EQ(size_t((N - 1) * (N) / 2), size_t(total)); // sum( 0 .. N-1 ) + } + + KOKKOS_INLINE_FUNCTION + void operator()(const OffsetTag &, const int i, value_type &update, + bool final) const { + update += m_flags(i); + + if (final) { + if (update != (i * (i + 1)) / 2) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "TestRangeRequire::test_scan error %d : %d != %d\n", i, + (i * (i + 1)) / 2, m_flags(i)); + } + } + } + + void test_dynamic_policy() { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + auto const N_no_implicit_capture = N; + using policy_t = + Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >; + + { + Kokkos::View<size_t *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > + count("Count", ExecSpace::concurrency()); + Kokkos::View<int *, ExecSpace> a("A", N); + + Kokkos::parallel_for( + policy_t(0, N), KOKKOS_LAMBDA(const int &i) { + for (int k = 0; k < (i < N_no_implicit_capture / 2 ? 1 : 10000); + k++) { + a(i)++; + } + count(ExecSpace::impl_hardware_thread_id())++; + }); + + int error = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int &i, int &lsum) { + lsum += (a(i) != (i < N_no_implicit_capture / 2 ? 
1 : 10000)); + }, + error); + ASSERT_EQ(error, 0); + + if ((ExecSpace::concurrency() > (int)1) && + (N > static_cast<int>(4 * ExecSpace::concurrency()))) { + size_t min = N; + size_t max = 0; + for (int t = 0; t < ExecSpace::concurrency(); t++) { + if (count(t) < min) min = count(t); + if (count(t) > max) max = count(t); + } + ASSERT_TRUE(min < max); + + // if ( ExecSpace::concurrency() > 2 ) { + // ASSERT_TRUE( 2 * min < max ); + //} + } + } + + { + Kokkos::View<size_t *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > + count("Count", ExecSpace::concurrency()); + Kokkos::View<int *, ExecSpace> a("A", N); + + int sum = 0; + Kokkos::parallel_reduce( + policy_t(0, N), + KOKKOS_LAMBDA(const int &i, int &lsum) { + for (int k = 0; k < (i < N_no_implicit_capture / 2 ? 1 : 10000); + k++) { + a(i)++; + } + count(ExecSpace::impl_hardware_thread_id())++; + lsum++; + }, + sum); + ASSERT_EQ(sum, N); + + int error = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int &i, int &lsum) { + lsum += (a(i) != (i < N_no_implicit_capture / 2 ? 
1 : 10000)); + }, + error); + ASSERT_EQ(error, 0); + + if ((ExecSpace::concurrency() > (int)1) && + (N > static_cast<int>(4 * ExecSpace::concurrency()))) { + size_t min = N; + size_t max = 0; + for (int t = 0; t < ExecSpace::concurrency(); t++) { + if (count(t) < min) min = count(t); + if (count(t) > max) max = count(t); + } + ASSERT_TRUE(min < max); + + // if ( ExecSpace::concurrency() > 2 ) { + // ASSERT_TRUE( 2 * min < max ); + //} + } + } +#endif + } +}; + +} // namespace + +TEST(TEST_CATEGORY, range_for_require) { + using Property = Kokkos::Experimental::WorkItemProperty::HintLightWeight_t; + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property> + f(0); + f.test_for(); + } + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(0); + f.test_for(); + } + + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property> + f(2); + f.test_for(); + } + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(3); + f.test_for(); + } + + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property> + f(1000); + f.test_for(); + } + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(1001); + f.test_for(); + } +} + +TEST(TEST_CATEGORY, range_reduce_require) { + using Property = Kokkos::Experimental::WorkItemProperty::HintLightWeight_t; + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property> + f(0); + f.test_reduce(); + } + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(0); + f.test_reduce(); + } + + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property> + f(2); + f.test_reduce(); + } + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(3); + f.test_reduce(); + } + + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property> + f(1000); + f.test_reduce(); + } 
+ { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(1001); + f.test_reduce(); + } +} + +#ifndef KOKKOS_ENABLE_OPENMPTARGET +TEST(TEST_CATEGORY, range_scan_require) { + using Property = Kokkos::Experimental::WorkItemProperty::HintLightWeight_t; + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property> + f(0); + f.test_scan(); + } + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(0); + f.test_scan(); + } +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(0); + f.test_dynamic_policy(); + } +#endif + + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property> + f(2); + f.test_scan(); + } + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(3); + f.test_scan(); + } +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(3); + f.test_dynamic_policy(); + } +#endif + + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, Property> + f(1000); + f.test_scan(); + } + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(1001); + f.test_scan(); + } +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) + { + TestRangeRequire<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + Property> + f(1001); + f.test_dynamic_policy(); + } +#endif +} +#endif +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestReduce.hpp b/packages/kokkos/core/unit_test/TestReduce.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5f7fbd5623d6e8e4c25c261a0f092d79c1573fba --- /dev/null +++ b/packages/kokkos/core/unit_test/TestReduce.hpp @@ 
-0,0 +1,629 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdexcept> +#include <sstream> +#include <iostream> +#include <limits> + +#include <Kokkos_Core.hpp> + +namespace Test { + +struct ReducerTag {}; + +template <typename ScalarType, class DeviceType> +class ReduceFunctor { + public: + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + + struct value_type { + ScalarType value[3]; + }; + + const size_type nwork; + + KOKKOS_INLINE_FUNCTION + ReduceFunctor(const size_type& arg_nwork) : nwork(arg_nwork) {} + + KOKKOS_INLINE_FUNCTION + ReduceFunctor(const ReduceFunctor& rhs) : nwork(rhs.nwork) {} + + /* + KOKKOS_INLINE_FUNCTION + void init( value_type & dst ) const + { + dst.value[0] = 0; + dst.value[1] = 0; + dst.value[2] = 0; + } + */ + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dst, const volatile value_type& src) const { + dst.value[0] += src.value[0]; + dst.value[1] += src.value[1]; + dst.value[2] += src.value[2]; + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type iwork, value_type& dst) const { + dst.value[0] += 1; + dst.value[1] += iwork + 1; + dst.value[2] += nwork - iwork; + } +}; + +template <class DeviceType> +class ReduceFunctorFinal : public ReduceFunctor<int64_t, DeviceType> { + public: + using value_type = typename ReduceFunctor<int64_t, DeviceType>::value_type; + + KOKKOS_INLINE_FUNCTION + ReduceFunctorFinal(const size_t n) : ReduceFunctor<int64_t, DeviceType>(n) {} + + KOKKOS_INLINE_FUNCTION + void final(value_type& dst) const { + dst.value[0] = -dst.value[0]; + dst.value[1] = -dst.value[1]; + dst.value[2] = -dst.value[2]; + } +}; + +template <class DeviceType> +class ReduceFunctorFinalTag { + public: + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + using ScalarType = int64_t; + + struct value_type { + ScalarType value[3]; + }; + + const size_type nwork; + + 
KOKKOS_INLINE_FUNCTION + ReduceFunctorFinalTag(const size_type arg_nwork) : nwork(arg_nwork) {} + + KOKKOS_INLINE_FUNCTION + void join(const ReducerTag, volatile value_type& dst, + const volatile value_type& src) const { + dst.value[0] += src.value[0]; + dst.value[1] += src.value[1]; + dst.value[2] += src.value[2]; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ReducerTag, size_type iwork, value_type& dst) const { + dst.value[0] -= 1; + dst.value[1] -= iwork + 1; + dst.value[2] -= nwork - iwork; + } + + KOKKOS_INLINE_FUNCTION + void final(const ReducerTag, value_type& dst) const { + ++dst.value[0]; + ++dst.value[1]; + ++dst.value[2]; + } +}; + +template <typename ScalarType, class DeviceType> +class RuntimeReduceFunctor { + public: + // Required for functor: + using execution_space = DeviceType; + using value_type = ScalarType[]; + const unsigned value_count; + + // Unit test details: + + using size_type = typename execution_space::size_type; + + const size_type nwork; + + RuntimeReduceFunctor(const size_type arg_nwork, const size_type arg_count) + : value_count(arg_count), nwork(arg_nwork) {} + + KOKKOS_INLINE_FUNCTION + void init(ScalarType dst[]) const { + for (unsigned i = 0; i < value_count; ++i) dst[i] = 0; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile ScalarType dst[], const volatile ScalarType src[]) const { + for (unsigned i = 0; i < value_count; ++i) dst[i] += src[i]; + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type iwork, ScalarType dst[]) const { + const size_type tmp[3] = {1, iwork + 1, nwork - iwork}; + + for (size_type i = 0; i < static_cast<size_type>(value_count); ++i) { + dst[i] += tmp[i % 3]; + } + } +}; + +template <typename ScalarType, class DeviceType> +class RuntimeReduceMinMax { + public: + // Required for functor: + using execution_space = DeviceType; + using value_type = ScalarType[]; + const unsigned value_count; + + // Unit test details: + + using size_type = typename execution_space::size_type; + + const 
size_type nwork; + const ScalarType amin; + const ScalarType amax; + + RuntimeReduceMinMax(const size_type arg_nwork, const size_type arg_count) + : value_count(arg_count), + nwork(arg_nwork), + amin(std::numeric_limits<ScalarType>::min()), + amax(std::numeric_limits<ScalarType>::max()) {} + + KOKKOS_INLINE_FUNCTION + void init(ScalarType dst[]) const { + for (unsigned i = 0; i < value_count; ++i) { + dst[i] = i % 2 ? amax : amin; + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile ScalarType dst[], const volatile ScalarType src[]) const { + for (unsigned i = 0; i < value_count; ++i) { + dst[i] = i % 2 ? (dst[i] < src[i] ? dst[i] : src[i]) // min + : (dst[i] > src[i] ? dst[i] : src[i]); // max + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type iwork, ScalarType dst[]) const { + const ScalarType tmp[2] = {ScalarType(iwork + 1), + ScalarType(nwork - iwork)}; + + for (size_type i = 0; i < static_cast<size_type>(value_count); ++i) { + dst[i] = i % 2 ? (dst[i] < tmp[i % 2] ? dst[i] : tmp[i % 2]) + : (dst[i] > tmp[i % 2] ? 
dst[i] : tmp[i % 2]); + } + } +}; + +template <class DeviceType> +class RuntimeReduceFunctorFinal + : public RuntimeReduceFunctor<int64_t, DeviceType> { + public: + using base_type = RuntimeReduceFunctor<int64_t, DeviceType>; + using value_type = typename base_type::value_type; + using scalar_type = int64_t; + + RuntimeReduceFunctorFinal(const size_t theNwork, const size_t count) + : base_type(theNwork, count) {} + + KOKKOS_INLINE_FUNCTION + void final(value_type dst) const { + for (unsigned i = 0; i < base_type::value_count; ++i) { + dst[i] = -dst[i]; + } + } +}; + +template <class ValueType, class DeviceType> +class CombinedReduceFunctorSameType { + public: + using execution_space = typename DeviceType::execution_space; + using size_type = typename execution_space::size_type; + + const size_type nwork; + + KOKKOS_INLINE_FUNCTION + constexpr explicit CombinedReduceFunctorSameType(const size_type& arg_nwork) + : nwork(arg_nwork) {} + + KOKKOS_DEFAULTED_FUNCTION + constexpr CombinedReduceFunctorSameType( + const CombinedReduceFunctorSameType& rhs) = default; + + KOKKOS_INLINE_FUNCTION + void operator()(size_type iwork, ValueType& dst1, ValueType& dst2, + ValueType& dst3) const { + dst1 += 1; + dst2 += iwork + 1; + dst3 += nwork - iwork; + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type iwork, size_type always_zero_1, + size_type always_zero_2, ValueType& dst1, ValueType& dst2, + ValueType& dst3) const { + dst1 += 1 + always_zero_1; + dst2 += iwork + 1 + always_zero_2; + dst3 += nwork - iwork; + } +}; + +namespace { + +template <typename ScalarType, class DeviceType> +class TestReduce { + public: + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + + TestReduce(const size_type& nwork) { + run_test(nwork); + run_test_final(nwork); + run_test_final_tag(nwork); + } + + void run_test(const size_type& nwork) { + using functor_type = Test::ReduceFunctor<ScalarType, execution_space>; + using value_type = typename 
functor_type::value_type; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + value_type result[Repeat]; + + const uint64_t nw = nwork; + const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1); + + for (unsigned i = 0; i < Repeat; ++i) { + Kokkos::parallel_reduce(nwork, functor_type(nwork), result[i]); + } + + for (unsigned i = 0; i < Repeat; ++i) { + for (unsigned j = 0; j < Count; ++j) { + const uint64_t correct = 0 == j % 3 ? nw : nsum; + ASSERT_EQ((ScalarType)correct, result[i].value[j]); + } + } + } + + void run_test_final(const size_type& nwork) { + using functor_type = Test::ReduceFunctorFinal<execution_space>; + using value_type = typename functor_type::value_type; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + value_type result[Repeat]; + + const uint64_t nw = nwork; + const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1); + + for (unsigned i = 0; i < Repeat; ++i) { + if (i % 2 == 0) { + Kokkos::parallel_reduce(nwork, functor_type(nwork), result[i]); + } else { + Kokkos::parallel_reduce("Reduce", nwork, functor_type(nwork), + result[i]); + } + } + + for (unsigned i = 0; i < Repeat; ++i) { + for (unsigned j = 0; j < Count; ++j) { + const uint64_t correct = 0 == j % 3 ? nw : nsum; + ASSERT_EQ((ScalarType)correct, -result[i].value[j]); + } + } + } + + void run_test_final_tag(const size_type& nwork) { + using functor_type = Test::ReduceFunctorFinalTag<execution_space>; + using value_type = typename functor_type::value_type; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + value_type result[Repeat]; + + const uint64_t nw = nwork; + const uint64_t nsum = nw % 2 ? 
nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1); + + for (unsigned i = 0; i < Repeat; ++i) { + if (i % 2 == 0) { + Kokkos::parallel_reduce( + Kokkos::RangePolicy<execution_space, ReducerTag>(0, nwork), + functor_type(nwork), result[i]); + } else { + Kokkos::parallel_reduce( + "Reduce", + Kokkos::RangePolicy<execution_space, ReducerTag>(0, nwork), + functor_type(nwork), result[i]); + } + } + + for (unsigned i = 0; i < Repeat; ++i) { + for (unsigned j = 0; j < Count; ++j) { + const uint64_t correct = 0 == j % 3 ? nw : nsum; + ASSERT_EQ((ScalarType)correct, 1 - result[i].value[j]); + } + } + } +}; + +template <typename ScalarType, class DeviceType> +class TestReduceDynamic { + public: + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + + TestReduceDynamic(const size_type nwork) { + run_test_dynamic(nwork); + run_test_dynamic_minmax(nwork); + run_test_dynamic_final(nwork); + } + + void run_test_dynamic(const size_type nwork) { + using functor_type = + Test::RuntimeReduceFunctor<ScalarType, execution_space>; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + ScalarType result[Repeat][Count]; + + const uint64_t nw = nwork; + const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1); + + for (unsigned i = 0; i < Repeat; ++i) { + if (i % 2 == 0) { + Kokkos::parallel_reduce(nwork, functor_type(nwork, Count), result[i]); + } else { + Kokkos::parallel_reduce("Reduce", nwork, functor_type(nwork, Count), + result[i]); + } + } + + for (unsigned i = 0; i < Repeat; ++i) { + for (unsigned j = 0; j < Count; ++j) { + const uint64_t correct = 0 == j % 3 ? 
nw : nsum; + ASSERT_EQ((ScalarType)correct, result[i][j]); + } + } + } + + void run_test_dynamic_minmax(const size_type nwork) { + using functor_type = Test::RuntimeReduceMinMax<ScalarType, execution_space>; + + enum { Count = 2 }; + enum { Repeat = 100 }; + + ScalarType result[Repeat][Count]; + + for (unsigned i = 0; i < Repeat; ++i) { + if (i % 2 == 0) { + Kokkos::parallel_reduce(nwork, functor_type(nwork, Count), result[i]); + } else { + Kokkos::parallel_reduce("Reduce", nwork, functor_type(nwork, Count), + result[i]); + } + } + + for (unsigned i = 0; i < Repeat; ++i) { + for (unsigned j = 0; j < Count; ++j) { + if (nwork == 0) { + ScalarType amin(std::numeric_limits<ScalarType>::min()); + ScalarType amax(std::numeric_limits<ScalarType>::max()); + const ScalarType correct = (j % 2) ? amax : amin; + ASSERT_EQ((ScalarType)correct, result[i][j]); + } else { + const uint64_t correct = j % 2 ? 1 : nwork; + ASSERT_EQ((ScalarType)correct, result[i][j]); + } + } + } + } + + void run_test_dynamic_final(const size_type nwork) { + using functor_type = Test::RuntimeReduceFunctorFinal<execution_space>; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + typename functor_type::scalar_type result[Repeat][Count]; + + const uint64_t nw = nwork; + const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1); + + for (unsigned i = 0; i < Repeat; ++i) { + if (i % 2 == 0) { + Kokkos::parallel_reduce(nwork, functor_type(nwork, Count), result[i]); + } else { + Kokkos::parallel_reduce("TestKernelReduce", nwork, + functor_type(nwork, Count), result[i]); + } + } + + for (unsigned i = 0; i < Repeat; ++i) { + for (unsigned j = 0; j < Count; ++j) { + const uint64_t correct = 0 == j % 3 ? 
nw : nsum; + ASSERT_EQ((ScalarType)correct, -result[i][j]); + } + } + } +}; + +template <typename ScalarType, class DeviceType> +class TestReduceDynamicView { + public: + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + + TestReduceDynamicView(const size_type nwork) { run_test_dynamic_view(nwork); } + + void run_test_dynamic_view(const size_type nwork) { + using functor_type = + Test::RuntimeReduceFunctor<ScalarType, execution_space>; + + using result_type = Kokkos::View<ScalarType*, DeviceType>; + using result_host_type = typename result_type::HostMirror; + + const unsigned CountLimit = 23; + + const uint64_t nw = nwork; + const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1); + + for (unsigned count = 0; count < CountLimit; ++count) { + result_type result("result", count); + result_host_type host_result = Kokkos::create_mirror(result); + + // Test result to host pointer: + + std::string str("TestKernelReduce"); + if (count % 2 == 0) { + Kokkos::parallel_reduce(nw, functor_type(nw, count), + host_result.data()); + } else { + Kokkos::parallel_reduce(str, nw, functor_type(nw, count), + host_result.data()); + } + + for (unsigned j = 0; j < count; ++j) { + const uint64_t correct = 0 == j % 3 ? 
nw : nsum; + ASSERT_EQ(host_result(j), (ScalarType)correct); + host_result(j) = 0; + } + } + } +}; + +} // namespace + +TEST(TEST_CATEGORY, int64_t_reduce) { + TestReduce<int64_t, TEST_EXECSPACE>(0); + TestReduce<int64_t, TEST_EXECSPACE>(1000000); +} + +TEST(TEST_CATEGORY, double_reduce) { + TestReduce<double, TEST_EXECSPACE>(0); + TestReduce<double, TEST_EXECSPACE>(1000000); +} + +TEST(TEST_CATEGORY, int64_t_reduce_dynamic) { + TestReduceDynamic<int64_t, TEST_EXECSPACE>(0); + TestReduceDynamic<int64_t, TEST_EXECSPACE>(1000000); +} + +TEST(TEST_CATEGORY, double_reduce_dynamic) { + TestReduceDynamic<double, TEST_EXECSPACE>(0); + TestReduceDynamic<double, TEST_EXECSPACE>(1000000); +} + +TEST(TEST_CATEGORY, int64_t_reduce_dynamic_view) { + TestReduceDynamicView<int64_t, TEST_EXECSPACE>(0); + TestReduceDynamicView<int64_t, TEST_EXECSPACE>(1000000); +} + +TEST(TEST_CATEGORY, int_combined_reduce) { + using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>; + constexpr uint64_t nw = 1000; + + uint64_t nsum = (nw / 2) * (nw + 1); + + int64_t result1 = 0; + int64_t result2 = 0; + int64_t result3 = 0; + + Kokkos::parallel_reduce("int_combined_reduce", + Kokkos::RangePolicy<TEST_EXECSPACE>(0, nw), + functor_type(nw), result1, result2, result3); + + ASSERT_EQ(nw, result1); + ASSERT_EQ(nsum, result2); + ASSERT_EQ(nsum, result3); +} + +TEST(TEST_CATEGORY, mdrange_combined_reduce) { + using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>; + constexpr uint64_t nw = 1000; + + uint64_t nsum = (nw / 2) * (nw + 1); + + int64_t result1 = 0; + int64_t result2 = 0; + int64_t result3 = 0; + + Kokkos::parallel_reduce( + "int_combined_reduce_mdrange", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<3>>({{0, 0, 0}}, + {{nw, 1, 1}}), + functor_type(nw), result1, result2, result3); + + ASSERT_EQ(nw, result1); + ASSERT_EQ(nsum, result2); + ASSERT_EQ(nsum, result3); +} + +TEST(TEST_CATEGORY, int_combined_reduce_mixed) { + using functor_type = 
CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>; + + constexpr uint64_t nw = 1000; + + uint64_t nsum = (nw / 2) * (nw + 1); + + auto result1_v = Kokkos::View<int64_t, Kokkos::HostSpace>{"result1_v"}; + + int64_t result2 = 0; + + auto result3_v = Kokkos::View<int64_t, Kokkos::HostSpace>{"result3_v"}; + + Kokkos::parallel_reduce("int_combined-reduce_mixed", + Kokkos::RangePolicy<TEST_EXECSPACE>(0, nw), + functor_type(nw), result1_v, result2, + Kokkos::Sum<int64_t, Kokkos::HostSpace>{result3_v}); + + ASSERT_EQ(nw, result1_v()); + ASSERT_EQ(nsum, result2); + ASSERT_EQ(nsum, result3_v()); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp new file mode 100644 index 0000000000000000000000000000000000000000..68e7d746dd91a68046c4d074884ef5aef7519427 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp @@ -0,0 +1,669 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdexcept> +#include <sstream> +#include <iostream> +#include <limits> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace ReduceCombinatorical { + +template <class Scalar, class Space = Kokkos::HostSpace> +struct AddPlus { + public: + // Required. + using reducer = AddPlus; + using value_type = Scalar; + + using result_view_type = + Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >; + + private: + result_view_type result; + + public: + AddPlus(value_type& result_) : result(&result_) {} + + // Required. + KOKKOS_INLINE_FUNCTION + void join(value_type& dest, const value_type& src) const { dest += src + 1; } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& dest, const volatile value_type& src) const { + dest += src + 1; + } + + // Optional. 
+ KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { val = value_type(); } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return result(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return result; } +}; + +template <int ISTEAM> +struct FunctorScalar; + +template <> +struct FunctorScalar<0> { + Kokkos::View<double> result; + + FunctorScalar(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, double& update) const { update += i; } +}; + +template <> +struct FunctorScalar<1> { + using team_type = Kokkos::TeamPolicy<>::member_type; + + Kokkos::View<double> result; + + FunctorScalar(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const team_type& team, double& update) const { + update += 1.0 / team.team_size() * team.league_rank(); + } +}; + +template <int ISTEAM> +struct FunctorScalarInit; + +template <> +struct FunctorScalarInit<0> { + Kokkos::View<double> result; + + FunctorScalarInit(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, double& update) const { update += i; } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { update = 0.0; } +}; + +template <> +struct FunctorScalarInit<1> { + using team_type = Kokkos::TeamPolicy<>::member_type; + + Kokkos::View<double> result; + + FunctorScalarInit(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const team_type& team, double& update) const { + update += 1.0 / team.team_size() * team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { update = 0.0; } +}; + +template <int ISTEAM> +struct FunctorScalarFinal; + +template <> +struct FunctorScalarFinal<0> { + Kokkos::View<double> result; + + FunctorScalarFinal(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, double& update) const { update += i; } + + KOKKOS_INLINE_FUNCTION + void 
final(double& update) const { result() = update; } +}; + +template <> +struct FunctorScalarFinal<1> { + using team_type = Kokkos::TeamPolicy<>::member_type; + + Kokkos::View<double> result; + + FunctorScalarFinal(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const team_type& team, double& update) const { + update += 1.0 / team.team_size() * team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void final(double& update) const { result() = update; } +}; + +template <int ISTEAM> +struct FunctorScalarJoin; + +template <> +struct FunctorScalarJoin<0> { + Kokkos::View<double> result; + + FunctorScalarJoin(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, double& update) const { update += i; } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } +}; + +template <> +struct FunctorScalarJoin<1> { + using team_type = Kokkos::TeamPolicy<>::member_type; + + Kokkos::View<double> result; + + FunctorScalarJoin(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const team_type& team, double& update) const { + update += 1.0 / team.team_size() * team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } +}; + +template <int ISTEAM> +struct FunctorScalarJoinFinal; + +template <> +struct FunctorScalarJoinFinal<0> { + Kokkos::View<double> result; + + FunctorScalarJoinFinal(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, double& update) const { update += i; } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void final(double& update) const { result() = update; } +}; + +template <> +struct FunctorScalarJoinFinal<1> { + using team_type = Kokkos::TeamPolicy<>::member_type; 
+ + Kokkos::View<double> result; + + FunctorScalarJoinFinal(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const team_type& team, double& update) const { + update += 1.0 / team.team_size() * team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void final(double& update) const { result() = update; } +}; + +template <int ISTEAM> +struct FunctorScalarJoinInit; + +template <> +struct FunctorScalarJoinInit<0> { + Kokkos::View<double> result; + + FunctorScalarJoinInit(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, double& update) const { update += i; } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { update = 0.0; } +}; + +template <> +struct FunctorScalarJoinInit<1> { + using team_type = Kokkos::TeamPolicy<>::member_type; + + Kokkos::View<double> result; + + FunctorScalarJoinInit(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const team_type& team, double& update) const { + update += 1.0 / team.team_size() * team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { update = 0.0; } +}; + +template <int ISTEAM> +struct FunctorScalarJoinFinalInit; + +template <> +struct FunctorScalarJoinFinalInit<0> { + Kokkos::View<double> result; + + FunctorScalarJoinFinalInit(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, double& update) const { update += i; } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void 
final(double& update) const { result() = update; } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { update = 0.0; } +}; + +template <> +struct FunctorScalarJoinFinalInit<1> { + using team_type = Kokkos::TeamPolicy<>::member_type; + + Kokkos::View<double> result; + + FunctorScalarJoinFinalInit(Kokkos::View<double> r) : result(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const team_type& team, double& update) const { + update += 1.0 / team.team_size() * team.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double& dst, const volatile double& update) const { + dst += update; + } + + KOKKOS_INLINE_FUNCTION + void final(double& update) const { result() = update; } + + KOKKOS_INLINE_FUNCTION + void init(double& update) const { update = 0.0; } +}; + +struct Functor1 { + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, double& update) const { update += i; } +}; + +struct Functor2 { + using value_type = double[]; + + const unsigned value_count; + + Functor2(unsigned n) : value_count(n) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const unsigned& i, double update[]) const { + for (unsigned j = 0; j < value_count; j++) { + update[j] += i; + } + } + + KOKKOS_INLINE_FUNCTION + void init(double dst[]) const { + for (unsigned i = 0; i < value_count; ++i) dst[i] = 0; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile double dst[], const volatile double src[]) const { + for (unsigned i = 0; i < value_count; ++i) dst[i] += src[i]; + } +}; + +} // namespace ReduceCombinatorical + +template <class ExecSpace = Kokkos::DefaultExecutionSpace> +struct TestReduceCombinatoricalInstantiation { + template <class... Args> + static void CallParallelReduce(Args... args) { + Kokkos::parallel_reduce(args...); + } + + template <class... Args> + static void AddReturnArgument(int N, Args... 
args) { + Kokkos::View<double, Kokkos::HostSpace> result_view("ResultViewHost"); + Kokkos::View<double, ExecSpace> result_view_device("ResultViewDevice"); + double expected_result = (1.0 * N) * (1.0 * N - 1.0) / 2.0; + + double value = 99; + Kokkos::parallel_reduce(args..., value); + ASSERT_EQ(expected_result, value); + + result_view() = 99; + CallParallelReduce(args..., result_view); + Kokkos::fence(); + ASSERT_EQ(expected_result, result_view()); + +#ifndef KOKKOS_ENABLE_OPENMPTARGET + result_view() = 99; + CallParallelReduce(args..., result_view_device); + Kokkos::fence(); + Kokkos::deep_copy(result_view, result_view_device); + ASSERT_EQ(expected_result, result_view()); +#endif + + value = 99; + CallParallelReduce( + args..., + Kokkos::View<double, Kokkos::HostSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >(&value)); + Kokkos::fence(); + ASSERT_EQ(expected_result, value); + + result_view() = 99; + const Kokkos::View<double, Kokkos::HostSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> > + result_view_const_um = result_view; + CallParallelReduce(args..., result_view_const_um); + Kokkos::fence(); + ASSERT_EQ(expected_result, result_view_const_um()); + + value = 99; +// WORKAROUND OPENMPTARGET Custom Reducers not implemented +#ifndef KOKKOS_ENABLE_OPENMPTARGET + CallParallelReduce(args..., + Test::ReduceCombinatorical::AddPlus<double>(value)); + if ((Kokkos::DefaultExecutionSpace::concurrency() > 1) && + (ExecSpace::concurrency() > 1) && (expected_result > 0)) { + ASSERT_TRUE(expected_result < value); + } else if (((Kokkos::DefaultExecutionSpace::concurrency() > 1) || + (ExecSpace::concurrency() > 1)) && + (expected_result > 0)) { + ASSERT_TRUE(expected_result <= value); + } else { + ASSERT_EQ(expected_result, value); + } + + value = 99; + Test::ReduceCombinatorical::AddPlus<double> add(value); + CallParallelReduce(args..., add); + if ((Kokkos::DefaultExecutionSpace::concurrency() > 1) && + (ExecSpace::concurrency() > 1) && (expected_result > 0)) { + 
ASSERT_TRUE(expected_result < value); + } else if (((Kokkos::DefaultExecutionSpace::concurrency() > 1) || + (ExecSpace::concurrency() > 1)) && + (expected_result > 0)) { + ASSERT_TRUE(expected_result <= value); + } else { + ASSERT_EQ(expected_result, value); + } +#endif + } + + template <class... Args> + static void AddLambdaRange(int N, void*, Args... args) { + AddReturnArgument( + N, args..., KOKKOS_LAMBDA(const int& i, double& lsum) { lsum += i; }); + } + + template <class... Args> + static void AddLambdaTeam(int N, void*, Args... args) { + AddReturnArgument( + N, args..., + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team, + double& update) { + update += 1.0 / team.team_size() * team.league_rank(); + }); + } + + template <class... Args> + static void AddLambdaRange(int, Kokkos::InvalidType, Args... /*args*/) {} + + template <class... Args> + static void AddLambdaTeam(int, Kokkos::InvalidType, Args... /*args*/) {} + + template <int ISTEAM, class... Args> + static void AddFunctor(int N, Args... 
args) { + Kokkos::View<double, ExecSpace> result_view("FunctorView"); + auto h_r = Kokkos::create_mirror_view(result_view); + Test::ReduceCombinatorical::FunctorScalar<ISTEAM> functor(result_view); + + AddReturnArgument(N, args..., functor); + AddReturnArgument( + N, args..., + Test::ReduceCombinatorical::FunctorScalar<ISTEAM>(result_view)); +// WORKAROUND OPENMPTARGET: reductions with functor join/init/final +// not implemented +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) + AddReturnArgument( + N, args..., + Test::ReduceCombinatorical::FunctorScalarInit<ISTEAM>(result_view)); + AddReturnArgument( + N, args..., + Test::ReduceCombinatorical::FunctorScalarJoin<ISTEAM>(result_view)); + AddReturnArgument( + N, args..., + Test::ReduceCombinatorical::FunctorScalarJoinInit<ISTEAM>(result_view)); + double expected_result = (1.0 * N) * (1.0 * N - 1.0) / 2.0; + + h_r() = 0; + Kokkos::deep_copy(result_view, h_r); + CallParallelReduce( + args..., + Test::ReduceCombinatorical::FunctorScalarFinal<ISTEAM>(result_view)); + Kokkos::fence(); + Kokkos::deep_copy(h_r, result_view); + ASSERT_EQ(expected_result, h_r()); + + h_r() = 0; + Kokkos::deep_copy(result_view, h_r); + CallParallelReduce( + args..., Test::ReduceCombinatorical::FunctorScalarJoinFinal<ISTEAM>( + result_view)); + Kokkos::fence(); + Kokkos::deep_copy(h_r, result_view); + ASSERT_EQ(expected_result, h_r()); + + h_r() = 0; + Kokkos::deep_copy(result_view, h_r); + CallParallelReduce( + args..., Test::ReduceCombinatorical::FunctorScalarJoinFinalInit<ISTEAM>( + result_view)); + Kokkos::fence(); + Kokkos::deep_copy(h_r, result_view); + ASSERT_EQ(expected_result, h_r()); +#endif + } + + template <class... Args> + static void AddFunctorLambdaRange(int N, Args... 
args) { + AddFunctor<0, Args...>(N, args...); +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + AddLambdaRange( + N, + typename std::conditional< + std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, + void*, Kokkos::InvalidType>::type(), + args...); +#endif + } + + template <class... Args> + static void AddFunctorLambdaTeam(int N, Args... args) { + AddFunctor<1, Args...>(N, args...); +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + AddLambdaTeam( + N, + typename std::conditional< + std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, + void*, Kokkos::InvalidType>::type(), + args...); +#endif + } + + template <class... Args> + static void AddPolicy_1(int N, Args... args) { + Kokkos::RangePolicy<ExecSpace> policy(0, N); + + AddFunctorLambdaRange(1000, args..., 1000); + AddFunctorLambdaRange(N, args..., N); + AddFunctorLambdaRange(N, args..., policy); + } + + template <class... Args> + static void AddPolicy_2(int N, Args... args) { + AddFunctorLambdaRange(N, args..., Kokkos::RangePolicy<ExecSpace>(0, N)); + AddFunctorLambdaRange( + N, args..., + Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >(0, + N)); + AddFunctorLambdaRange( + N, args..., + Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Static> >(0, N) + .set_chunk_size(16)); + AddFunctorLambdaRange( + N, args..., + Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >(0, N) + .set_chunk_size(16)); + } + + template <class... Args> + static void AddPolicy_3(int N, Args... 
args) { + AddFunctorLambdaTeam(N, args..., + Kokkos::TeamPolicy<ExecSpace>(N, Kokkos::AUTO)); + AddFunctorLambdaTeam( + N, args..., + Kokkos::TeamPolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( + N, Kokkos::AUTO)); + AddFunctorLambdaTeam( + N, args..., + Kokkos::TeamPolicy<ExecSpace, Kokkos::Schedule<Kokkos::Static> >( + N, Kokkos::AUTO) + .set_chunk_size(16)); + AddFunctorLambdaTeam( + N, args..., + Kokkos::TeamPolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( + N, Kokkos::AUTO) + .set_chunk_size(16)); + } + + static void execute_a1() { AddPolicy_1(1000); } + + static void execute_b1() { + std::string s("Std::String"); + AddPolicy_1(1000, s.c_str()); + AddPolicy_1(1000, "Char Constant"); +#ifndef KOKKOS_ENABLE_OPENMPTARGET + AddPolicy_1(0, "Char Constant"); +#endif + } + + static void execute_c1() { + std::string s("Std::String"); + AddPolicy_1(1000, s); + } + + static void execute_a2() { AddPolicy_2(1000); } + + static void execute_b2() { + std::string s("Std::String"); + AddPolicy_2(1000, s.c_str()); + AddPolicy_2(1000, "Char Constant"); +#ifndef KOKKOS_ENABLE_OPENMPTARGET + AddPolicy_2(0, "Char Constant"); +#endif + } + + static void execute_c2() { + std::string s("Std::String"); + AddPolicy_2(1000, s); + } + + static void execute_a3() { +#ifndef KOKKOS_ENABLE_OPENMPTARGET + AddPolicy_3(1000); +#endif + } + + static void execute_b3() { +#ifndef KOKKOS_ENABLE_OPENMPTARGET + std::string s("Std::String"); + AddPolicy_3(1000, s.c_str()); + AddPolicy_3(1000, "Char Constant"); + AddPolicy_3(0, "Char Constant"); +#endif + } + + static void execute_c3() { +#ifndef KOKKOS_ENABLE_OPENMPTARGET + std::string s("Std::String"); + AddPolicy_3(1000, s); +#endif + } +}; + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestReducers.hpp b/packages/kokkos/core/unit_test/TestReducers.hpp new file mode 100644 index 0000000000000000000000000000000000000000..35f0e231fd2a7b1e88bbf4be568532aa5c219e3f --- /dev/null +++ 
b/packages/kokkos/core/unit_test/TestReducers.hpp @@ -0,0 +1,1054 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdexcept> +#include <sstream> +#include <iostream> +#include <limits> + +#include <Kokkos_Core.hpp> + +//-------------------------------------------------------------------------- + +namespace Test { + +struct ReducerTag {}; + +template <class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace> +struct TestReducers { + struct SumFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, Scalar& value) const { value += values(i); } + }; + + struct ProdFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, Scalar& value) const { value *= values(i); } + }; + + struct MinFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, Scalar& value) const { + if (values(i) < value) value = values(i); + } + }; + + struct MaxFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, Scalar& value) const { + if (values(i) > value) value = values(i); + } + }; + + struct MinLocFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()( + const int& i, + typename Kokkos::MinLoc<Scalar, int>::value_type& value) const { + if (values(i) < value.val) { + value.val = values(i); + value.loc = i; + } + } + }; + + struct MaxLocFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()( + const int& i, + typename Kokkos::MaxLoc<Scalar, int>::value_type& value) const { + if (values(i) > value.val) { + value.val = values(i); + value.loc = i; + } + } + }; + + struct MinMaxLocFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()( + const int& i, + typename Kokkos::MinMaxLoc<Scalar, 
int>::value_type& value) const { + if (values(i) > value.max_val) { + value.max_val = values(i); + value.max_loc = i; + } + + if (values(i) < value.min_val) { + value.min_val = values(i); + value.min_loc = i; + } + } + }; + + struct BAndFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, Scalar& value) const { + value = value & values(i); + } + }; + + struct BOrFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, Scalar& value) const { + value = value | values(i); + } + }; + + struct LAndFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, Scalar& value) const { + value = value && values(i); + } + }; + + struct LOrFunctor { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, Scalar& value) const { + value = value || values(i); + } + }; + + struct SumFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const ReducerTag, const int& i, Scalar& value) const { + value += values(i); + } + }; + + struct ProdFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const ReducerTag, const int& i, Scalar& value) const { + value *= values(i); + } + }; + + struct MinFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const ReducerTag, const int& i, Scalar& value) const { + if (values(i) < value) value = values(i); + } + }; + + struct MaxFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const ReducerTag, const int& i, Scalar& value) const { + if (values(i) > value) value = values(i); + } + }; + + struct MinLocFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()( + 
const ReducerTag, const int& i, + typename Kokkos::MinLoc<Scalar, int>::value_type& value) const { + if (values(i) < value.val) { + value.val = values(i); + value.loc = i; + } + } + }; + + struct MaxLocFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()( + const ReducerTag, const int& i, + typename Kokkos::MaxLoc<Scalar, int>::value_type& value) const { + if (values(i) > value.val) { + value.val = values(i); + value.loc = i; + } + } + }; + + struct MinMaxLocFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()( + const ReducerTag, const int& i, + typename Kokkos::MinMaxLoc<Scalar, int>::value_type& value) const { + if (values(i) > value.max_val) { + value.max_val = values(i); + value.max_loc = i; + } + + if (values(i) < value.min_val) { + value.min_val = values(i); + value.min_loc = i; + } + } + }; + + struct BAndFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const ReducerTag, const int& i, Scalar& value) const { + value = value & values(i); + } + }; + + struct BOrFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const ReducerTag, const int& i, Scalar& value) const { + value = value | values(i); + } + }; + + struct LAndFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const ReducerTag, const int& i, Scalar& value) const { + value = value && values(i); + } + }; + + struct LOrFunctorTag { + Kokkos::View<const Scalar*, ExecSpace> values; + + KOKKOS_INLINE_FUNCTION + void operator()(const ReducerTag, const int& i, Scalar& value) const { + value = value || values(i); + } + }; + static void test_sum(int N) { + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_sum = 0; + + for (int i = 0; i < N; i++) { + h_values(i) = 
(Scalar)(rand() % 100); + reference_sum += h_values(i); + } + Kokkos::deep_copy(values, h_values); + + SumFunctor f; + f.values = values; + SumFunctorTag f_tag; + f_tag.values = values; + Scalar init = 0; + + { + Scalar sum_scalar = Scalar(1); + Kokkos::Sum<Scalar> reducer_scalar(sum_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, 0), f, + reducer_scalar); +// Zero length reduction not yet supported +#ifndef KOKKOS_ENABLE_OPENMPTARGET + ASSERT_EQ(sum_scalar, init); +#endif + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(sum_scalar, reference_sum); + + sum_scalar = init; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(sum_scalar, reference_sum); + + Scalar sum_scalar_view = reducer_scalar.reference(); + ASSERT_EQ(sum_scalar_view, reference_sum); + } + + { + Kokkos::View<Scalar, Kokkos::HostSpace> sum_view("View"); + sum_view() = Scalar(1); + Kokkos::Sum<Scalar> reducer_view(sum_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, 0), f, + reducer_view); + Kokkos::fence(); + Scalar sum_view_scalar = sum_view(); +// Zero length reduction not yet supported +#ifndef KOKKOS_ENABLE_OPENMPTARGET + ASSERT_EQ(sum_view_scalar, init); +#endif + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + sum_view_scalar = sum_view(); + ASSERT_EQ(sum_view_scalar, reference_sum); + + Scalar sum_view_view = reducer_view.reference(); + ASSERT_EQ(sum_view_view, reference_sum); + } + + // Reduction to device view not yet supported +#ifndef KOKKOS_ENABLE_OPENMPTARGET + { + Kokkos::View<Scalar, typename ExecSpace::memory_space> sum_view("View"); + Kokkos::deep_copy(sum_view, Scalar(1)); + Kokkos::Sum<Scalar, typename ExecSpace::memory_space> reducer_view( + sum_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, 0), f, + reducer_view); + Kokkos::fence(); + Scalar 
sum_view_scalar; + Kokkos::deep_copy(sum_view_scalar, sum_view); + ASSERT_EQ(sum_view_scalar, init); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + Kokkos::deep_copy(sum_view_scalar, sum_view); + ASSERT_EQ(sum_view_scalar, reference_sum); + } +#endif + } + + static void test_prod(int N) { + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_prod = 1; + + for (int i = 0; i < N; i++) { + h_values(i) = (Scalar)(rand() % 4 + 1); + reference_prod *= h_values(i); + } + Kokkos::deep_copy(values, h_values); + + ProdFunctor f; + f.values = values; + ProdFunctorTag f_tag; + f_tag.values = values; + Scalar init = 1; + + { + Scalar prod_scalar = Scalar(0); + Kokkos::Prod<Scalar> reducer_scalar(prod_scalar); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, 0), f, + reducer_scalar); +// Zero length reduction not yet supported +#ifndef KOKKOS_ENABLE_OPENMPTARGET + ASSERT_EQ(prod_scalar, init); +#endif + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(prod_scalar, reference_prod); + + prod_scalar = init; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(prod_scalar, reference_prod); + + Scalar prod_scalar_view = reducer_scalar.reference(); + ASSERT_EQ(prod_scalar_view, reference_prod); + } + + { + Kokkos::View<Scalar, Kokkos::HostSpace> prod_view("View"); + prod_view() = Scalar(0); + Kokkos::Prod<Scalar> reducer_view(prod_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, 0), f, + reducer_view); + Kokkos::fence(); + Scalar prod_view_scalar = prod_view(); +// Zero length reduction not yet supported +#ifndef KOKKOS_ENABLE_OPENMPTARGET + ASSERT_EQ(prod_view_scalar, init); +#endif + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + prod_view_scalar = 
prod_view(); + ASSERT_EQ(prod_view_scalar, reference_prod); + + Scalar prod_view_view = reducer_view.reference(); + ASSERT_EQ(prod_view_view, reference_prod); + } + + // Reduction to device view not yet supported +#ifndef KOKKOS_ENABLE_OPENMPTARGET + { + Kokkos::View<Scalar, typename ExecSpace::memory_space> prod_view("View"); + Kokkos::deep_copy(prod_view, Scalar(0)); + Kokkos::Prod<Scalar, typename ExecSpace::memory_space> reducer_view( + prod_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, 0), f, + reducer_view); + Kokkos::fence(); + Scalar prod_view_scalar; + Kokkos::deep_copy(prod_view_scalar, prod_view); + ASSERT_EQ(prod_view_scalar, init); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + Kokkos::deep_copy(prod_view_scalar, prod_view); + ASSERT_EQ(prod_view_scalar, reference_prod); + } +#endif + } + + static void test_min(int N) { + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_min = std::numeric_limits<Scalar>::max(); + + for (int i = 0; i < N; i++) { + h_values(i) = (Scalar)(rand() % 100000); + + if (h_values(i) < reference_min) reference_min = h_values(i); + } + Kokkos::deep_copy(values, h_values); + + MinFunctor f; + f.values = values; + MinFunctorTag f_tag; + f_tag.values = values; + Scalar init = std::numeric_limits<Scalar>::max(); + + { + Scalar min_scalar = init; + Kokkos::Min<Scalar> reducer_scalar(min_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(min_scalar, reference_min); + + min_scalar = init; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(min_scalar, reference_min); + + Scalar min_scalar_view = reducer_scalar.reference(); + ASSERT_EQ(min_scalar_view, reference_min); + } + + { + Kokkos::View<Scalar, Kokkos::HostSpace> min_view("View"); + min_view() = init; + 
Kokkos::Min<Scalar> reducer_view(min_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + + Scalar min_view_scalar = min_view(); + ASSERT_EQ(min_view_scalar, reference_min); + + Scalar min_view_view = reducer_view.reference(); + ASSERT_EQ(min_view_view, reference_min); + } + } + + static void test_max(int N) { + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_max = std::numeric_limits<Scalar>::min(); + + for (int i = 0; i < N; i++) { + h_values(i) = (Scalar)(rand() % 100000 + 1); + + if (h_values(i) > reference_max) reference_max = h_values(i); + } + Kokkos::deep_copy(values, h_values); + + MaxFunctor f; + f.values = values; + MaxFunctorTag f_tag; + f_tag.values = values; + Scalar init = std::numeric_limits<Scalar>::min(); + + { + Scalar max_scalar = init; + Kokkos::Max<Scalar> reducer_scalar(max_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(max_scalar, reference_max); + + max_scalar = init; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(max_scalar, reference_max); + + Scalar max_scalar_view = reducer_scalar.reference(); + ASSERT_EQ(max_scalar_view, reference_max); + } + + { + Kokkos::View<Scalar, Kokkos::HostSpace> max_view("View"); + max_view() = init; + Kokkos::Max<Scalar> reducer_view(max_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + + Scalar max_view_scalar = max_view(); + ASSERT_EQ(max_view_scalar, reference_max); + + Scalar max_view_view = reducer_view.reference(); + ASSERT_EQ(max_view_view, reference_max); + } + } + + static void test_minloc(int N) { + using value_type = typename Kokkos::MinLoc<Scalar, int>::value_type; + + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); 
+ Scalar reference_min = std::numeric_limits<Scalar>::max(); + int reference_loc = -1; + + for (int i = 0; i < N; i++) { + h_values(i) = (Scalar)(rand() % 100000 + 2); + + if (h_values(i) < reference_min) { + reference_min = h_values(i); + reference_loc = i; + } else if (h_values(i) == reference_min) { + // Make min unique. + h_values(i) += Scalar(1); + } + } + Kokkos::deep_copy(values, h_values); + + MinLocFunctor f; + f.values = values; + MinLocFunctorTag f_tag; + f_tag.values = values; + + { + value_type min_scalar; + Kokkos::MinLoc<Scalar, int> reducer_scalar(min_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(min_scalar.val, reference_min); + ASSERT_EQ(min_scalar.loc, reference_loc); + + min_scalar = value_type(); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(min_scalar.val, reference_min); + ASSERT_EQ(min_scalar.loc, reference_loc); + + value_type min_scalar_view = reducer_scalar.reference(); + ASSERT_EQ(min_scalar_view.val, reference_min); + ASSERT_EQ(min_scalar_view.loc, reference_loc); + } + + { + Kokkos::View<value_type, Kokkos::HostSpace> min_view("View"); + Kokkos::MinLoc<Scalar, int> reducer_view(min_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + + value_type min_view_scalar = min_view(); + ASSERT_EQ(min_view_scalar.val, reference_min); + ASSERT_EQ(min_view_scalar.loc, reference_loc); + + value_type min_view_view = reducer_view.reference(); + ASSERT_EQ(min_view_view.val, reference_min); + ASSERT_EQ(min_view_view.loc, reference_loc); + } + } + + static void test_maxloc(int N) { + using value_type = typename Kokkos::MaxLoc<Scalar, int>::value_type; + + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_max = std::numeric_limits<Scalar>::min(); + int reference_loc = -1; + + for (int i = 0; 
i < N; i++) { + h_values(i) = (Scalar)(rand() % 100000 + 2); + + if (h_values(i) > reference_max) { + reference_max = h_values(i); + reference_loc = i; + } else if (h_values(i) == reference_max) { + // Make max unique. + h_values(i) -= Scalar(1); + } + } + Kokkos::deep_copy(values, h_values); + + MaxLocFunctor f; + f.values = values; + MaxLocFunctorTag f_tag; + f_tag.values = values; + + { + value_type max_scalar; + Kokkos::MaxLoc<Scalar, int> reducer_scalar(max_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(max_scalar.val, reference_max); + ASSERT_EQ(max_scalar.loc, reference_loc); + + max_scalar = value_type(); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(max_scalar.val, reference_max); + ASSERT_EQ(max_scalar.loc, reference_loc); + + value_type max_scalar_view = reducer_scalar.reference(); + ASSERT_EQ(max_scalar_view.val, reference_max); + ASSERT_EQ(max_scalar_view.loc, reference_loc); + } + + { + Kokkos::View<value_type, Kokkos::HostSpace> max_view("View"); + Kokkos::MaxLoc<Scalar, int> reducer_view(max_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + + value_type max_view_scalar = max_view(); + ASSERT_EQ(max_view_scalar.val, reference_max); + ASSERT_EQ(max_view_scalar.loc, reference_loc); + + value_type max_view_view = reducer_view.reference(); + ASSERT_EQ(max_view_view.val, reference_max); + ASSERT_EQ(max_view_view.loc, reference_loc); + } + } + + static void test_minmaxloc(int N) { + using value_type = typename Kokkos::MinMaxLoc<Scalar, int>::value_type; + + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_max = std::numeric_limits<Scalar>::min(); + Scalar reference_min = std::numeric_limits<Scalar>::max(); + int reference_minloc = -1; + int reference_maxloc = -1; + + for (int i = 0; i < N; 
i++) { + h_values(i) = (Scalar)(rand() % 100000 + 2); + } + + for (int i = 0; i < N; i++) { + if (h_values(i) > reference_max) { + reference_max = h_values(i); + reference_maxloc = i; + } else if (h_values(i) == reference_max) { + // Make max unique. + h_values(i) -= Scalar(1); + } + } + + for (int i = 0; i < N; i++) { + if (h_values(i) < reference_min) { + reference_min = h_values(i); + reference_minloc = i; + } else if (h_values(i) == reference_min) { + // Make min unique. + h_values(i) += Scalar(1); + } + } + + Kokkos::deep_copy(values, h_values); + + MinMaxLocFunctor f; + f.values = values; + MinMaxLocFunctorTag f_tag; + f_tag.values = values; + + { + value_type minmax_scalar; + Kokkos::MinMaxLoc<Scalar, int> reducer_scalar(minmax_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(minmax_scalar.min_val, reference_min); + + for (int i = 0; i < N; i++) { + if ((i == minmax_scalar.min_loc) && (h_values(i) == reference_min)) { + reference_minloc = i; + } + } + + ASSERT_EQ(minmax_scalar.min_loc, reference_minloc); + ASSERT_EQ(minmax_scalar.max_val, reference_max); + + for (int i = 0; i < N; i++) { + if ((i == minmax_scalar.max_loc) && (h_values(i) == reference_max)) { + reference_maxloc = i; + } + } + + ASSERT_EQ(minmax_scalar.max_loc, reference_maxloc); + + minmax_scalar = value_type(); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(minmax_scalar.min_val, reference_min); + + for (int i = 0; i < N; i++) { + if ((i == minmax_scalar.min_loc) && (h_values(i) == reference_min)) { + reference_minloc = i; + } + } + + ASSERT_EQ(minmax_scalar.min_loc, reference_minloc); + ASSERT_EQ(minmax_scalar.max_val, reference_max); + + for (int i = 0; i < N; i++) { + if ((i == minmax_scalar.max_loc) && (h_values(i) == reference_max)) { + reference_maxloc = i; + } + } + + ASSERT_EQ(minmax_scalar.max_loc, reference_maxloc); + + value_type minmax_scalar_view = 
reducer_scalar.reference(); + ASSERT_EQ(minmax_scalar_view.min_val, reference_min); + ASSERT_EQ(minmax_scalar_view.min_loc, reference_minloc); + ASSERT_EQ(minmax_scalar_view.max_val, reference_max); + ASSERT_EQ(minmax_scalar_view.max_loc, reference_maxloc); + } + + { + Kokkos::View<value_type, Kokkos::HostSpace> minmax_view("View"); + Kokkos::MinMaxLoc<Scalar, int> reducer_view(minmax_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + + value_type minmax_view_scalar = minmax_view(); + ASSERT_EQ(minmax_view_scalar.min_val, reference_min); + ASSERT_EQ(minmax_view_scalar.min_loc, reference_minloc); + ASSERT_EQ(minmax_view_scalar.max_val, reference_max); + ASSERT_EQ(minmax_view_scalar.max_loc, reference_maxloc); + + value_type minmax_view_view = reducer_view.reference(); + ASSERT_EQ(minmax_view_view.min_val, reference_min); + ASSERT_EQ(minmax_view_view.min_loc, reference_minloc); + ASSERT_EQ(minmax_view_view.max_val, reference_max); + ASSERT_EQ(minmax_view_view.max_loc, reference_maxloc); + } + } + + static void test_BAnd(int N) { + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_band = Scalar() | (~Scalar()); + + for (int i = 0; i < N; i++) { + h_values(i) = (Scalar)(rand() % 100000 + 1); + reference_band = reference_band & h_values(i); + } + Kokkos::deep_copy(values, h_values); + + BAndFunctor f; + f.values = values; + BAndFunctorTag f_tag; + f_tag.values = values; + Scalar init = Scalar() | (~Scalar()); + + { + Scalar band_scalar = init; + Kokkos::BAnd<Scalar> reducer_scalar(band_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(band_scalar, reference_band); + + band_scalar = init; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(band_scalar, reference_band); + + Scalar band_scalar_view = 
reducer_scalar.reference(); + + ASSERT_EQ(band_scalar_view, reference_band); + } + + { + Kokkos::View<Scalar, Kokkos::HostSpace> band_view("View"); + band_view() = init; + Kokkos::BAnd<Scalar> reducer_view(band_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + + Scalar band_view_scalar = band_view(); + ASSERT_EQ(band_view_scalar, reference_band); + + Scalar band_view_view = reducer_view.reference(); + ASSERT_EQ(band_view_view, reference_band); + } + } + + static void test_BOr(int N) { + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_bor = Scalar() & (~Scalar()); + + for (int i = 0; i < N; i++) { + h_values(i) = (Scalar)((rand() % 100000 + 1) * 2); + reference_bor = reference_bor | h_values(i); + } + Kokkos::deep_copy(values, h_values); + + BOrFunctor f; + f.values = values; + BOrFunctorTag f_tag; + f_tag.values = values; + Scalar init = Scalar() & (~Scalar()); + + { + Scalar bor_scalar = init; + Kokkos::BOr<Scalar> reducer_scalar(bor_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(bor_scalar, reference_bor); + + bor_scalar = init; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(bor_scalar, reference_bor); + + Scalar bor_scalar_view = reducer_scalar.reference(); + ASSERT_EQ(bor_scalar_view, reference_bor); + } + + { + Kokkos::View<Scalar, Kokkos::HostSpace> bor_view("View"); + bor_view() = init; + Kokkos::BOr<Scalar> reducer_view(bor_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + + Scalar bor_view_scalar = bor_view(); + ASSERT_EQ(bor_view_scalar, reference_bor); + + Scalar bor_view_view = reducer_view.reference(); + ASSERT_EQ(bor_view_view, reference_bor); + } + } + + static void test_LAnd(int N) { + Kokkos::View<Scalar*, ExecSpace> 
values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_land = 1; + + for (int i = 0; i < N; i++) { + h_values(i) = (Scalar)(rand() % 2); + reference_land = reference_land && h_values(i); + } + Kokkos::deep_copy(values, h_values); + + LAndFunctor f; + f.values = values; + LAndFunctorTag f_tag; + f_tag.values = values; + Scalar init = 1; + + { + Scalar land_scalar = init; + Kokkos::LAnd<Scalar> reducer_scalar(land_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(land_scalar, reference_land); + + land_scalar = init; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(land_scalar, reference_land); + + Scalar land_scalar_view = reducer_scalar.reference(); + ASSERT_EQ(land_scalar_view, reference_land); + } + + { + Kokkos::View<Scalar, Kokkos::HostSpace> land_view("View"); + land_view() = init; + Kokkos::LAnd<Scalar> reducer_view(land_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + + Scalar land_view_scalar = land_view(); + ASSERT_EQ(land_view_scalar, reference_land); + + Scalar land_view_view = reducer_view.reference(); + ASSERT_EQ(land_view_view, reference_land); + } + } + + static void test_LOr(int N) { + Kokkos::View<Scalar*, ExecSpace> values("Values", N); + auto h_values = Kokkos::create_mirror_view(values); + Scalar reference_lor = 0; + + for (int i = 0; i < N; i++) { + h_values(i) = (Scalar)(rand() % 2); + reference_lor = reference_lor || h_values(i); + } + Kokkos::deep_copy(values, h_values); + + LOrFunctor f; + f.values = values; + LOrFunctorTag f_tag; + f_tag.values = values; + Scalar init = 0; + + { + Scalar lor_scalar = init; + Kokkos::LOr<Scalar> reducer_scalar(lor_scalar); + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_scalar); + ASSERT_EQ(lor_scalar, reference_lor); + + lor_scalar = init; + 
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, ReducerTag>(0, N), + f_tag, reducer_scalar); + ASSERT_EQ(lor_scalar, reference_lor); + + Scalar lor_scalar_view = reducer_scalar.reference(); + ASSERT_EQ(lor_scalar_view, reference_lor); + } + + { + Kokkos::View<Scalar, Kokkos::HostSpace> lor_view("View"); + lor_view() = init; + Kokkos::LOr<Scalar> reducer_view(lor_view); + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, N), f, + reducer_view); + Kokkos::fence(); + + Scalar lor_view_scalar = lor_view(); + ASSERT_EQ(lor_view_scalar, reference_lor); + + Scalar lor_view_view = reducer_view.reference(); + ASSERT_EQ(lor_view_view, reference_lor); + } + } + + static void execute_float() { + test_sum(10001); + test_prod(35); + test_min(10003); + test_minloc(10003); + test_max(10007); + test_maxloc(10007); + // FIXME_OPENMPTARGET - The minmaxloc test fails in the Release and + // RelWithDebInfo builds for the OPENMPTARGET backend but passes in Debug + // mode. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) + test_minmaxloc(10007); +#endif + } + + // NOTE test_prod generates N random numbers between 1 and 4. + // Although unlikely, the test below could still in principle overflow. + // For reference log(numeric_limits<int>)/log(4) is 15.5 + static void execute_integer() { + test_sum(10001); + test_prod(sizeof(Scalar) > 4 ? 35 : 19); // avoid int overflow (see above) + test_min(10003); + test_minloc(10003); + test_max(10007); + test_maxloc(10007); + // FIXME_OPENMPTARGET - The minmaxloc test fails in the Release and + // RelWithDebInfo builds for the OPENMPTARGET backend but passes in Debug + // mode. 
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET) + test_minmaxloc(10007); +#endif + test_BAnd(35); + test_BOr(35); + test_LAnd(35); + test_LOr(35); + } + + static void execute_basic() { + test_sum(10001); + test_prod(35); + } +}; + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestReducers_a.hpp b/packages/kokkos/core/unit_test/TestReducers_a.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4efc4f645064c7e01f501101ff9b42ca8e686309 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestReducers_a.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestReducers.hpp> + +namespace Test { +TEST(TEST_CATEGORY, reducers_int) { + TestReducers<int, TEST_EXECSPACE>::execute_integer(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestReducers_b.hpp b/packages/kokkos/core/unit_test/TestReducers_b.hpp new file mode 100644 index 0000000000000000000000000000000000000000..57aa0f3b7661659acd04ccdb8fe171954e258eb9 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestReducers_b.hpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestReducers.hpp> + +namespace Test { +TEST(TEST_CATEGORY, reducers_size_t) { + TestReducers<size_t, TEST_EXECSPACE>::execute_integer(); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestReducers_c.hpp b/packages/kokkos/core/unit_test/TestReducers_c.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d1353b1f647e43c698adf994240be1086a9a5da3 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestReducers_c.hpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestReducers.hpp> + +namespace Test { +TEST(TEST_CATEGORY, reducers_double) { + TestReducers<double, TEST_EXECSPACE>::execute_float(); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestReducers_d.hpp b/packages/kokkos/core/unit_test/TestReducers_d.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e2254a1c1fe653b22c3e6b9a9ebad50d07a9eb89 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestReducers_d.hpp @@ -0,0 +1,67 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestReducers.hpp> +#include <TestNonTrivialScalarTypes.hpp> + +namespace Test { +TEST(TEST_CATEGORY, reducers_complex_double) { + TestReducers<Kokkos::complex<double>, TEST_EXECSPACE>::execute_basic(); +} + +TEST(TEST_CATEGORY, reducers_struct) { + TestReducers<array_reduce<float, 1>, TEST_EXECSPACE>::test_sum(1031); + TestReducers<array_reduce<float, 2>, TEST_EXECSPACE>::test_sum(1031); + TestReducers<array_reduce<float, 4>, TEST_EXECSPACE>::test_sum(1031); + // FIXME_OPENMPTARGET - The size of data in array_reduce has to be a power of + // 2 for OPENMPTARGET backend in Release and RelWithDebInfo builds. 
+#ifdef KOKKOS_ENABLE_OPENMPTARGET + TestReducers<array_reduce<float, 8>, TEST_EXECSPACE>::test_sum(1031); +#else + TestReducers<array_reduce<float, 3>, TEST_EXECSPACE>::test_sum(1031); + TestReducers<array_reduce<float, 7>, TEST_EXECSPACE>::test_sum(1031); +#endif +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestReductions.hpp b/packages/kokkos/core/unit_test/TestReductions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..949ca7eaf30a4746a8fec355f1b62c035c83d041 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestReductions.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_REDUCTIONS_HPP +#define KOKKOS_TEST_REDUCTIONS_HPP +#include <Kokkos_Macros.hpp> +#ifndef KOKKOS_ENABLE_OPENMPTARGET +#include <TestReduce.hpp> +#endif +#include <TestCXX11Deduction.hpp> +#endif diff --git a/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp b/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..17563de335e5b6a6170985e392ea8ae0de5ae8c1 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp @@ -0,0 +1,149 @@ +#include <Kokkos_Core.hpp> + +namespace Test { +namespace { + +struct TestIsAsynchFunctor { + Kokkos::View<double, TEST_EXECSPACE> atomic_test; + TestIsAsynchFunctor(Kokkos::View<double, TEST_EXECSPACE> atomic_test_) + : atomic_test(atomic_test_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int) const { Kokkos::atomic_add(&atomic_test(), 1.0); } +}; + +template <class PolicyType, class ReduceFunctor> +void test_reduce_device_view(int64_t N, PolicyType policy, + ReduceFunctor functor) { + using ExecSpace = TEST_EXECSPACE; + + Kokkos::View<int64_t, TEST_EXECSPACE> result("Result"); + Kokkos::View<double, TEST_EXECSPACE> atomic_test("Atomic"); + int64_t reducer_result, view_result, scalar_result; + + Kokkos::Timer timer; + 
+ // Establish whether execspace is asynchronous + Kokkos::parallel_for("Test::ReduceDeviceView::TestIsAsynch", + Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1000000), + TestIsAsynchFunctor(atomic_test)); + double time0 = timer.seconds(); + timer.reset(); + typename ExecSpace::execution_space().fence(); + double time_fence0 = timer.seconds(); + Kokkos::deep_copy(result, 0); + timer.reset(); + bool is_async = time0 < time_fence0; + + // Test Reducer + + Kokkos::parallel_reduce("Test::ReduceDeviceView::TestReducer", policy, + functor, + Kokkos::Sum<int64_t, TEST_EXECSPACE>(result)); + double time1 = timer.seconds(); + // Check whether it was asyncronous + timer.reset(); + typename ExecSpace::execution_space().fence(); + double time_fence1 = timer.seconds(); + Kokkos::deep_copy(reducer_result, result); + Kokkos::deep_copy(result, 0); + ASSERT_EQ(N, reducer_result); + timer.reset(); + + // Test View + Kokkos::parallel_reduce("Test::ReduceDeviceView::TestView", policy, functor, + result); + double time2 = timer.seconds(); + // Check whether it was asyncronous + timer.reset(); + typename ExecSpace::execution_space().fence(); + double time_fence2 = timer.seconds(); + Kokkos::deep_copy(view_result, result); + Kokkos::deep_copy(result, 0); + ASSERT_EQ(N, view_result); + timer.reset(); + + // Test Scalar + Kokkos::parallel_reduce("Test::ReduceDeviceView::TestScalar", policy, functor, + scalar_result); + double time3 = timer.seconds(); + + // Check whether it was asyncronous + timer.reset(); + typename ExecSpace::execution_space().fence(); + double time_fence3 = timer.seconds(); + + ASSERT_EQ(N, scalar_result); + if (is_async) { + ASSERT_TRUE(time1 < time_fence1); + } + if (is_async) { + ASSERT_TRUE(time2 < time_fence2); + ASSERT_TRUE(time3 > time_fence3); + } +} + +struct RangePolicyFunctor { + KOKKOS_INLINE_FUNCTION + void operator()(const int, int64_t& lsum) const { lsum += 1; } +}; + +struct MDRangePolicyFunctor { + KOKKOS_INLINE_FUNCTION + void operator()(const int, const 
int, const int, int64_t& lsum) const { + lsum += 1; + } +}; + +struct TeamPolicyFunctor { + int M; + TeamPolicyFunctor(int M_) : M(M_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const Kokkos::TeamPolicy<TEST_EXECSPACE>::member_type& team, + int64_t& lsum) const { + for (int i = team.team_rank(); i < M; i += team.team_size()) lsum += 1; + } +}; + +} // namespace + +TEST(TEST_CATEGORY, reduce_device_view_range_policy) { + // Avoid running out of memory +#ifdef KOKKOS_ENABLE_SYCL + int N = 100 * 1024 * 1024; +#else + int N = 1000 * 1024 * 1024; +#endif + test_reduce_device_view(N, Kokkos::RangePolicy<TEST_EXECSPACE>(0, N), + RangePolicyFunctor()); +} + +TEST(TEST_CATEGORY, reduce_device_view_mdrange_policy) { + int N = 1000 * 1024 * 1024; + test_reduce_device_view( + N, + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<3>>( + {0, 0, 0}, {1000, 1024, 1024}), + MDRangePolicyFunctor()); +} + +// FIXME_HIP +#ifndef KOKKOS_ENABLE_HIP +TEST(TEST_CATEGORY, reduce_device_view_team_policy) { +// FIXME_SYCL The number of workgroups on CUDA devices can not be larger than +// 65535 +#ifdef KOKKOS_ENABLE_SYCL + int N = 63 * 1024 * 1024; + test_reduce_device_view( + N, Kokkos::TeamPolicy<TEST_EXECSPACE>(63 * 1024, Kokkos::AUTO), + TeamPolicyFunctor(1024)); +#else + int N = 1000 * 1024 * 1024; + test_reduce_device_view( + N, Kokkos::TeamPolicy<TEST_EXECSPACE>(1000 * 1024, Kokkos::AUTO), + TeamPolicyFunctor(1024)); +#endif +} +#endif +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestResize.hpp b/packages/kokkos/core/unit_test/TestResize.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cf5c0df6f9163039fbd3ca1df8aee2a4b24ac882 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestResize.hpp @@ -0,0 +1,401 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
// Tag types selecting which Kokkos::resize overload the tests exercise.
struct Default {};
struct WithoutInitializing {};

// Dispatch: plain resize, which value-initializes any newly allocated
// elements.
template <typename View, typename... Args>
inline void resize_dispatch(Default, View& v, Args&&... args) {
  Kokkos::resize(v, std::forward<Args>(args)...);
}

// Dispatch: resize with Kokkos::WithoutInitializing, which leaves newly
// allocated elements uninitialized (existing elements are still preserved).
template <typename View, typename... Args>
inline void resize_dispatch(WithoutInitializing, View& v, Args&&... args) {
  Kokkos::resize(Kokkos::WithoutInitializing, v, std::forward<Args>(args)...);
}
sizes[3]); + const int* oldPointer = view_4d.data(); + EXPECT_TRUE(oldPointer != nullptr); + resize_dispatch(Tag{}, view_4d, sizes[0], sizes[1], sizes[2], sizes[3]); + const int* newPointer = view_4d.data(); + EXPECT_TRUE(oldPointer == newPointer); + } + { + using view_type = Kokkos::View<int*****, DeviceType>; + view_type view_5d("view_5d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4]); + const int* oldPointer = view_5d.data(); + EXPECT_TRUE(oldPointer != nullptr); + resize_dispatch(Tag{}, view_5d, sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4]); + const int* newPointer = view_5d.data(); + EXPECT_TRUE(oldPointer == newPointer); + } + { + using view_type = Kokkos::View<int******, DeviceType>; + view_type view_6d("view_6d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5]); + const int* oldPointer = view_6d.data(); + EXPECT_TRUE(oldPointer != nullptr); + resize_dispatch(Tag{}, view_6d, sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5]); + const int* newPointer = view_6d.data(); + EXPECT_TRUE(oldPointer == newPointer); + } + { + using view_type = Kokkos::View<int*******, DeviceType>; + view_type view_7d("view_7d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6]); + const int* oldPointer = view_7d.data(); + EXPECT_TRUE(oldPointer != nullptr); + resize_dispatch(Tag{}, view_7d, sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6]); + const int* newPointer = view_7d.data(); + EXPECT_TRUE(oldPointer == newPointer); + } + { + using view_type = Kokkos::View<int********, DeviceType>; + view_type view_8d("view_8d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6], sizes[7]); + const int* oldPointer = view_8d.data(); + EXPECT_TRUE(oldPointer != nullptr); + resize_dispatch(Tag{}, view_8d, sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6], sizes[7]); + const int* newPointer = view_8d.data(); + EXPECT_TRUE(oldPointer == newPointer); + } + // Resize without 
initialization: check if data preserved + { + using view_type = Kokkos::View<int*, DeviceType>; + view_type view_1d("view_1d", sizes[0]); + typename view_type::HostMirror h_view_1d_old = + Kokkos::create_mirror(view_1d); + Kokkos::deep_copy(view_1d, 111); + Kokkos::deep_copy(h_view_1d_old, view_1d); + resize_dispatch(Tag{}, view_1d, 2 * sizes[0]); + EXPECT_TRUE(view_1d.extent(0) == 2 * sizes[0]); + typename view_type::HostMirror h_view_1d = + Kokkos::create_mirror_view(view_1d); + Kokkos::deep_copy(h_view_1d, view_1d); + bool test = true; + for (size_t i0 = 0; i0 < sizes[0]; ++i0) { + if (h_view_1d(i0) != h_view_1d_old(i0)) { + test = false; + break; + } + } + EXPECT_TRUE(test == true); + } + { + using view_type = Kokkos::View<int**, DeviceType>; + view_type view_2d("view_2d", sizes[0], sizes[1]); + typename view_type::HostMirror h_view_2d_old = + Kokkos::create_mirror(view_2d); + Kokkos::deep_copy(view_2d, 222); + Kokkos::deep_copy(h_view_2d_old, view_2d); + resize_dispatch(Tag{}, view_2d, 2 * sizes[0], sizes[1]); + EXPECT_TRUE(view_2d.extent(0) == 2 * sizes[0]); + typename view_type::HostMirror h_view_2d = + Kokkos::create_mirror_view(view_2d); + Kokkos::deep_copy(h_view_2d, view_2d); + bool test = true; + for (size_t i0 = 0; i0 < sizes[0]; ++i0) { + for (size_t i1 = 0; i1 < sizes[1]; ++i1) { + if (h_view_2d(i0, i1) != h_view_2d_old(i0, i1)) { + test = false; + break; + } + } + } + EXPECT_TRUE(test == true); + } + { + using view_type = Kokkos::View<int***, DeviceType>; + view_type view_3d("view_3d", sizes[0], sizes[1], sizes[2]); + typename view_type::HostMirror h_view_3d_old = + Kokkos::create_mirror(view_3d); + Kokkos::deep_copy(view_3d, 333); + Kokkos::deep_copy(h_view_3d_old, view_3d); + resize_dispatch(Tag{}, view_3d, 2 * sizes[0], sizes[1], sizes[2]); + EXPECT_TRUE(view_3d.extent(0) == 2 * sizes[0]); + typename view_type::HostMirror h_view_3d = + Kokkos::create_mirror_view(view_3d); + Kokkos::deep_copy(h_view_3d, view_3d); + bool test = true; + for (size_t 
i0 = 0; i0 < sizes[0]; ++i0) { + for (size_t i1 = 0; i1 < sizes[1]; ++i1) { + for (size_t i2 = 0; i2 < sizes[2]; ++i2) { + if (h_view_3d(i0, i1, i2) != h_view_3d_old(i0, i1, i2)) { + test = false; + break; + } + } + } + } + EXPECT_TRUE(test == true); + } + { + using view_type = Kokkos::View<int****, DeviceType>; + view_type view_4d("view_4d", sizes[0], sizes[1], sizes[2], sizes[3]); + typename view_type::HostMirror h_view_4d_old = + Kokkos::create_mirror(view_4d); + Kokkos::deep_copy(view_4d, 444); + Kokkos::deep_copy(h_view_4d_old, view_4d); + resize_dispatch(Tag{}, view_4d, 2 * sizes[0], sizes[1], sizes[2], sizes[3]); + EXPECT_TRUE(view_4d.extent(0) == 2 * sizes[0]); + typename view_type::HostMirror h_view_4d = + Kokkos::create_mirror_view(view_4d); + Kokkos::deep_copy(h_view_4d, view_4d); + bool test = true; + for (size_t i0 = 0; i0 < sizes[0]; ++i0) { + for (size_t i1 = 0; i1 < sizes[1]; ++i1) { + for (size_t i2 = 0; i2 < sizes[2]; ++i2) { + for (size_t i3 = 0; i3 < sizes[3]; ++i3) { + if (h_view_4d(i0, i1, i2, i3) != h_view_4d_old(i0, i1, i2, i3)) { + test = false; + break; + } + } + } + } + } + EXPECT_TRUE(test == true); + } + { + using view_type = Kokkos::View<int*****, DeviceType>; + view_type view_5d("view_5d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4]); + typename view_type::HostMirror h_view_5d_old = + Kokkos::create_mirror(view_5d); + Kokkos::deep_copy(view_5d, 555); + Kokkos::deep_copy(h_view_5d_old, view_5d); + resize_dispatch(Tag{}, view_5d, 2 * sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4]); + EXPECT_TRUE(view_5d.extent(0) == 2 * sizes[0]); + typename view_type::HostMirror h_view_5d = + Kokkos::create_mirror_view(view_5d); + Kokkos::deep_copy(h_view_5d, view_5d); + bool test = true; + for (size_t i0 = 0; i0 < sizes[0]; ++i0) { + for (size_t i1 = 0; i1 < sizes[1]; ++i1) { + for (size_t i2 = 0; i2 < sizes[2]; ++i2) { + for (size_t i3 = 0; i3 < sizes[3]; ++i3) { + for (size_t i4 = 0; i4 < sizes[4]; ++i4) { + if (h_view_5d(i0, i1, i2, i3, 
i4) != + h_view_5d_old(i0, i1, i2, i3, i4)) { + test = false; + break; + } + } + } + } + } + } + EXPECT_TRUE(test == true); + } + { + using view_type = Kokkos::View<int******, DeviceType>; + view_type view_6d("view_6d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5]); + typename view_type::HostMirror h_view_6d_old = + Kokkos::create_mirror(view_6d); + Kokkos::deep_copy(view_6d, 666); + Kokkos::deep_copy(h_view_6d_old, view_6d); + resize_dispatch(Tag{}, view_6d, 2 * sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5]); + EXPECT_TRUE(view_6d.extent(0) == 2 * sizes[0]); + typename view_type::HostMirror h_view_6d = + Kokkos::create_mirror_view(view_6d); + Kokkos::deep_copy(h_view_6d, view_6d); + bool test = true; + for (size_t i0 = 0; i0 < sizes[0]; ++i0) { + for (size_t i1 = 0; i1 < sizes[1]; ++i1) { + for (size_t i2 = 0; i2 < sizes[2]; ++i2) { + for (size_t i3 = 0; i3 < sizes[3]; ++i3) { + for (size_t i4 = 0; i4 < sizes[4]; ++i4) { + for (size_t i5 = 0; i5 < sizes[5]; ++i5) { + if (h_view_6d(i0, i1, i2, i3, i4, i5) != + h_view_6d_old(i0, i1, i2, i3, i4, i5)) { + test = false; + break; + } + } + } + } + } + } + } + EXPECT_TRUE(test == true); + } + { + using view_type = Kokkos::View<int*******, DeviceType>; + view_type view_7d("view_7d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6]); + typename view_type::HostMirror h_view_7d_old = + Kokkos::create_mirror(view_7d); + Kokkos::deep_copy(view_7d, 777); + Kokkos::deep_copy(h_view_7d_old, view_7d); + resize_dispatch(Tag{}, view_7d, 2 * sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6]); + EXPECT_TRUE(view_7d.extent(0) == 2 * sizes[0]); + typename view_type::HostMirror h_view_7d = + Kokkos::create_mirror_view(view_7d); + Kokkos::deep_copy(h_view_7d, view_7d); + bool test = true; + for (size_t i0 = 0; i0 < sizes[0]; ++i0) { + for (size_t i1 = 0; i1 < sizes[1]; ++i1) { + for (size_t i2 = 0; i2 < sizes[2]; ++i2) { + for (size_t i3 = 0; i3 < sizes[3]; ++i3) { 
+ for (size_t i4 = 0; i4 < sizes[4]; ++i4) { + for (size_t i5 = 0; i5 < sizes[5]; ++i5) { + for (size_t i6 = 0; i6 < sizes[6]; ++i6) { + if (h_view_7d(i0, i1, i2, i3, i4, i5, i6) != + h_view_7d_old(i0, i1, i2, i3, i4, i5, i6)) { + test = false; + break; + } + } + } + } + } + } + } + } + EXPECT_TRUE(test == true); + } + { + using view_type = Kokkos::View<int********, DeviceType>; + view_type view_8d("view_8d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6], sizes[7]); + typename view_type::HostMirror h_view_8d_old = + Kokkos::create_mirror(view_8d); + Kokkos::deep_copy(view_8d, 888); + Kokkos::deep_copy(h_view_8d_old, view_8d); + resize_dispatch(Tag{}, view_8d, 2 * sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6], sizes[7]); + EXPECT_TRUE(view_8d.extent(0) == 2 * sizes[0]); + typename view_type::HostMirror h_view_8d = + Kokkos::create_mirror_view(view_8d); + Kokkos::deep_copy(h_view_8d, view_8d); + bool test = true; + for (size_t i0 = 0; i0 < sizes[0]; ++i0) { + for (size_t i1 = 0; i1 < sizes[1]; ++i1) { + for (size_t i2 = 0; i2 < sizes[2]; ++i2) { + for (size_t i3 = 0; i3 < sizes[3]; ++i3) { + for (size_t i4 = 0; i4 < sizes[4]; ++i4) { + for (size_t i5 = 0; i5 < sizes[5]; ++i5) { + for (size_t i6 = 0; i6 < sizes[6]; ++i6) { + for (size_t i7 = 0; i7 < sizes[7]; ++i7) { + if (h_view_8d(i0, i1, i2, i3, i4, i5, i6, i7) != + h_view_8d_old(i0, i1, i2, i3, i4, i5, i6, i7)) { + test = false; + break; + } + } + } + } + } + } + } + } + } + EXPECT_TRUE(test == true); + } +} + +template <class DeviceType> +void testResize() { + { + impl_testResize<DeviceType>(); // with data initialization + } + { + impl_testResize<DeviceType, + WithoutInitializing>(); // without data initialization + } +} + +} // namespace TestViewResize +#endif // TESTRESIZE_HPP_ diff --git a/packages/kokkos/core/unit_test/TestScan.hpp b/packages/kokkos/core/unit_test/TestScan.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..67cb85553d6bf7ccd9cc76b85f7bc32bb0e2e5a7 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestScan.hpp @@ -0,0 +1,145 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
  // Scan body: accumulates iwork+1 into `update` each pass; on the final
  // pass, verifies `update` equals the closed-form prefix sum 1+2+...+n.
  KOKKOS_INLINE_FUNCTION
  void operator()(const int iwork, value_type& update,
                  const bool final_pass) const {
    const value_type n = iwork + 1;
    // Every 1000th element contributes its value via unit increments instead
    // of a single add, to create uneven per-iteration cost.
    const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0;

    // Insert an artificial load imbalance

    for (value_type i = 0; i < imbalance; ++i) {
      ++update;
    }

    update += n - imbalance;

    if (final_pass) {
      // n*(n+1)/2 computed so the even factor is halved first, avoiding
      // overflow of the intermediate product.
      const value_type answer =
          n & 1 ? (n * ((n + 1) / 2)) : ((n / 2) * (n + 1));

      if (answer != update) {
        // `errors` is an atomic view; the post-increment yields a unique
        // failure index used to cap printed diagnostics at 20.
        int fail = errors()++;

        if (fail < 20) {
          KOKKOS_IMPL_DO_NOT_USE_PRINTF("TestScan(%d,%ld) != %ld\n", iwork,
                                        static_cast<long>(update),
                                        static_cast<long>(answer));
        }
      }
    }
  }
  // Run the scan over the half-open index range [Start, N).
  // NOTE(review): despite the name, `N` here is the exclusive END of the
  // range (second RangePolicy argument), not an element count.
  TestScan(const size_t Start, const size_t N) {
    using exec_policy = Kokkos::RangePolicy<execution_space>;

    // Fresh zeroed atomic error counter for this run.
    Kokkos::View<int, Device> errors_a("Errors");
    Kokkos::deep_copy(errors_a, 0);
    errors = errors_a;

    Kokkos::parallel_scan(exec_policy(Start, N), *this);
    Kokkos::fence();

    check_error();
  }
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +struct SharedAllocDestroy { + volatile int* count; + + SharedAllocDestroy() = default; + SharedAllocDestroy(int* arg) : count(arg) {} + + void destroy_shared_allocation() { Kokkos::atomic_increment(count); } +}; + +template <class MemorySpace, class ExecutionSpace> +void test_shared_alloc() { +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + using Header = const Kokkos::Impl::SharedAllocationHeader; + using Tracker = Kokkos::Impl::SharedAllocationTracker; + using RecordBase = Kokkos::Impl::SharedAllocationRecord<void, void>; + using RecordMemS = Kokkos::Impl::SharedAllocationRecord<MemorySpace, void>; + using RecordFull = + Kokkos::Impl::SharedAllocationRecord<MemorySpace, SharedAllocDestroy>; + + static_assert(sizeof(Tracker) == sizeof(int*), + "SharedAllocationTracker has wrong size!"); + + MemorySpace s; + + const size_t N = 1200; + const size_t size = 8; + + RecordMemS* rarray[N]; + Header* harray[N]; + + RecordMemS** const r = rarray; + Header** const h = harray; + + Kokkos::RangePolicy<ExecutionSpace> range(0, N); + + { + // Since always executed on host space, leave [=] + Kokkos::parallel_for(range, [=](size_t i) { + char name[64]; + sprintf(name, "test_%.2d", int(i)); + + r[i] = RecordMemS::allocate(s, name, size * (i + 1)); + h[i] = Header::get_header(r[i]->data()); + + ASSERT_EQ(r[i]->use_count(), 0); + + for (size_t j = 0; j < (i / 10) + 1; ++j) RecordBase::increment(r[i]); + + ASSERT_EQ(r[i]->use_count(), (i / 10) + 1); + ASSERT_EQ(r[i], RecordMemS::get_record(r[i]->data())); + }); + + Kokkos::fence(); + +#ifdef KOKKOS_ENABLE_DEBUG + // Sanity check for the whole set of allocation records to which 
this record + // belongs. + RecordBase::is_sane(r[0]); + // RecordMemS::print_records( std::cout, s, true ); +#endif + + Kokkos::parallel_for(range, [=](size_t i) { + while (nullptr != + (r[i] = static_cast<RecordMemS*>(RecordBase::decrement(r[i])))) { +#ifdef KOKKOS_ENABLE_DEBUG + if (r[i]->use_count() == 1) RecordBase::is_sane(r[i]); +#endif + } + }); + + Kokkos::fence(); + } + + { + int destroy_count = 0; + SharedAllocDestroy counter(&destroy_count); + + Kokkos::parallel_for(range, [=](size_t i) { + char name[64]; + sprintf(name, "test_%.2d", int(i)); + + RecordFull* rec = RecordFull::allocate(s, name, size * (i + 1)); + + rec->m_destroy = counter; + + r[i] = rec; + h[i] = Header::get_header(r[i]->data()); + + ASSERT_EQ(r[i]->use_count(), 0); + + for (size_t j = 0; j < (i / 10) + 1; ++j) RecordBase::increment(r[i]); + + ASSERT_EQ(r[i]->use_count(), (i / 10) + 1); + ASSERT_EQ(r[i], RecordMemS::get_record(r[i]->data())); + }); + + Kokkos::fence(); + +#ifdef KOKKOS_ENABLE_DEBUG + RecordBase::is_sane(r[0]); +#endif + + Kokkos::parallel_for(range, [=](size_t i) { + while (nullptr != + (r[i] = static_cast<RecordMemS*>(RecordBase::decrement(r[i])))) { +#ifdef KOKKOS_ENABLE_DEBUG + if (r[i]->use_count() == 1) RecordBase::is_sane(r[i]); +#endif + } + }); + + Kokkos::fence(); + + ASSERT_EQ(destroy_count, int(N)); + } + + { + int destroy_count = 0; + + { + RecordFull* rec = RecordFull::allocate(s, "test", size); + + // ... Construction of the allocated { rec->data(), rec->size() } + + // Copy destruction function object into the allocation record. + rec->m_destroy = SharedAllocDestroy(&destroy_count); + + ASSERT_EQ(rec->use_count(), 0); + + // Start tracking, increments the use count from 0 to 1. + Tracker track; + + track.assign_allocated_record_to_uninitialized(rec); + + ASSERT_EQ(rec->use_count(), 1); + ASSERT_EQ(track.use_count(), 1); + + // Verify construction / destruction increment. 
// Dispatch the shared-allocation test to the memory space matching the
// backend selected at configure time via TEST_CATEGORY_NUMBER; host-side
// backends pair their memory space with TEST_EXECSPACE, device backends
// pair their space with the default host execution space.
TEST(TEST_CATEGORY, impl_shared_alloc) {
#ifdef TEST_CATEGORY_NUMBER
#if (TEST_CATEGORY_NUMBER < 4)  // serial threads openmp hpx
  test_shared_alloc<Kokkos::HostSpace, TEST_EXECSPACE>();
#elif (TEST_CATEGORY_NUMBER == 4)  // openmptarget
  test_shared_alloc<Kokkos::Experimental::OpenMPTargetSpace,
                    Kokkos::DefaultHostExecutionSpace>();
#elif (TEST_CATEGORY_NUMBER == 5)  // cuda
  test_shared_alloc<Kokkos::CudaSpace, Kokkos::DefaultHostExecutionSpace>();
#elif (TEST_CATEGORY_NUMBER == 6)  // hip
  test_shared_alloc<Kokkos::Experimental::HIPSpace,
                    Kokkos::DefaultHostExecutionSpace>();
#elif (TEST_CATEGORY_NUMBER == 7)  // sycl
  test_shared_alloc<Kokkos::Experimental::SYCLDeviceUSMSpace,
                    Kokkos::DefaultHostExecutionSpace>();
#endif
#else
  // No category number defined: test the execution space's own memory space.
  test_shared_alloc<TEST_EXECSPACE, Kokkos::DefaultHostExecutionSpace>();
#endif
}
+// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
// Custom std::terminate handler installed by the stack-trace tests.
// NOTE: the death tests regex-match the exact message below, so the string
// must not change.
void my_fancy_handler() {
  std::cerr << "I am the custom std::terminate handler." << std::endl;
  std::abort();
}
// Exercise Kokkos' saved-stacktrace machinery: call instrumented functions
// f1 and f3 (which save a trace), then check that the printed raw and
// demangled traces contain the expected frames. If bTerminate is true,
// finish by calling std::terminate() with either the custom handler
// (bCustom) or Kokkos' generic one installed — the death tests match the
// resulting message.
void test_stacktrace(bool bTerminate, bool bCustom = true) {
  stacktrace_test_f1(std::cout);
  // Whether symbol names are actually resolvable in this build (requires
  // dynamic symbol info, e.g. -rdynamic). When false, the content checks
  // below are skipped and only the printing paths are exercised.
  bool bDynamic = false;
  {
    std::stringstream sstream;
    Kokkos::Impl::print_saved_stacktrace(sstream);
    std::string foutput = sstream.str();

    bDynamic = std::string::npos != foutput.find("stacktrace");

    if (bDynamic) {
      printf("test_f1: %s \n", foutput.c_str());
      // f1 must appear; the other instrumented functions must not.
      ASSERT_TRUE(std::string::npos != foutput.find("stacktrace_test_f1"));
      for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                     "stacktrace_test_f3", "stacktrace_test_f4"}) {
        ASSERT_TRUE(std::string::npos == foutput.find(x));
      }
    }
  }

  {
    std::stringstream sstream;
    Kokkos::Impl::print_demangled_saved_stacktrace(sstream);

    if (bDynamic) {
      std::string foutput = sstream.str();
      printf("demangled test_f1: %s \n", foutput.c_str());
      // Demangled output carries the enclosing namespace.
      ASSERT_TRUE(std::string::npos !=
                  foutput.find("Test::stacktrace_test_f1"));
      for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                     "stacktrace_test_f3", "stacktrace_test_f4"}) {
        ASSERT_TRUE(std::string::npos == foutput.find(x));
      }
    }
  }

  int val = stacktrace_test_f3(std::cout, 4);

  // Don't remove this printf: using `val` keeps the compiler from
  // optimizing away the f3 call sequence above.
  printf("StackTrace f3(std::cout, 4) returned: %i\n", val);

  // TODO test by making sure that f3 and f1, but no other functions,
  // appear in the stack trace, and that f3 appears 5 times.
  // Fix that f3 doesn't show up when compiling with -O3
  {
    std::stringstream sstream;
    Kokkos::Impl::print_saved_stacktrace(sstream);

    if (bDynamic) {
      std::string foutput = sstream.str();
      printf("test_f3: %s \n", foutput.c_str());
      for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
        ASSERT_TRUE(std::string::npos != foutput.find(x));
      }
    }
    // TODO make sure stacktrace_test_f2/4 don't show up
    // TODO make sure stacktrace_test_f3 shows up 5 times
  }

  {
    std::stringstream sstream;
    Kokkos::Impl::print_demangled_saved_stacktrace(sstream);

    if (bDynamic) {
      std::string foutput = sstream.str();
      printf("demangled test_f3: %s \n", foutput.c_str());
      for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
        ASSERT_TRUE(std::string::npos != foutput.find(x));
      }
    }

    // TODO make sure stacktrace_test_f2/4 don't show up
    // TODO make sure stacktrace_test_f3 shows up 5 times
  }
  std::cout << "Test setting std::terminate handler that prints "
               "the last saved stack trace"
            << std::endl;

  stacktrace_test_f4();

  // Install either the user-supplied handler or Kokkos' default one.
  if (bCustom) {
    Kokkos::Impl::set_kokkos_terminate_handler(my_fancy_handler);
  } else {
    Kokkos::Impl::set_kokkos_terminate_handler();
  }

  // TODO test that this prints "Oh noes!" and the correct stacktrace.
  if (bTerminate) {
    std::terminate();
  }
}
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include "Kokkos_Core.hpp" + +#include <impl/Kokkos_Stacktrace.hpp> + +namespace Test { + +void stacktrace_test_f0(std::ostream& out) { out << "Top of f0" << std::endl; } + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestStackTrace_f1.cpp b/packages/kokkos/core/unit_test/TestStackTrace_f1.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b10c574c9bd0d7f8a82ff2fa3ead8b5204cf179f --- /dev/null +++ b/packages/kokkos/core/unit_test/TestStackTrace_f1.cpp @@ -0,0 +1,63 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include "Kokkos_Core.hpp" + +#include <impl/Kokkos_Stacktrace.hpp> + +namespace Test { + +void stacktrace_test_f0(std::ostream& out); + +int stacktrace_test_f1(std::ostream& out) { + out << "Top of f1" << std::endl; + stacktrace_test_f0(out); + Kokkos::Impl::save_stacktrace(); + stacktrace_test_f0(out); + + return 42; +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestStackTrace_f2.cpp b/packages/kokkos/core/unit_test/TestStackTrace_f2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..23c1895e6946b2ef45caf7ed8d5a30e808089290 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestStackTrace_f2.cpp @@ -0,0 +1,60 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include "Kokkos_Core.hpp" + +#include <impl/Kokkos_Stacktrace.hpp> + +namespace Test { + +int stacktrace_test_f1(std::ostream& out); + +void stacktrace_test_f2(std::ostream& out) { + out << "Top of f2" << std::endl; + const int result = stacktrace_test_f1(out); + out << "f2: f1 returned " << result << std::endl; +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestStackTrace_f3.cpp b/packages/kokkos/core/unit_test/TestStackTrace_f3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac004243f11e52c21d15bd04a6f246148e2962d2 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestStackTrace_f3.cpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include "Kokkos_Core.hpp" + +#include <impl/Kokkos_Stacktrace.hpp> + +namespace Test { + +int stacktrace_test_f1(std::ostream& out); + +int stacktrace_test_f3(std::ostream& out, const int level) { + out << "Top of f3" << std::endl; + if (level <= 0) { + return stacktrace_test_f1(out); + } else { + return stacktrace_test_f3(out, level - 1) + 17; + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestStackTrace_f4.cpp b/packages/kokkos/core/unit_test/TestStackTrace_f4.cpp new file mode 100644 index 0000000000000000000000000000000000000000..afae98a2a8fe70f1d0d9fa49709646bb70b9ce18 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestStackTrace_f4.cpp @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include "Kokkos_Core.hpp" + +#include <impl/Kokkos_Stacktrace.hpp> + +namespace Test { + +void stacktrace_test_f4() { Kokkos::Impl::save_stacktrace(); } +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestSubView_a.hpp b/packages/kokkos/core/unit_test/TestSubView_a.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ecfc96f82497d2e487b506da40d4b23cbd2f8978 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_a.hpp @@ -0,0 +1,106 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_A_HPP +#define KOKKOS_TEST_SUBVIEW_A_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +#ifndef KOKKOS_ENABLE_SYCL +TEST(TEST_CATEGORY, view_subview_auto_1d_left) { + TestViewSubview::test_auto_1d<Kokkos::LayoutLeft, TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_subview_auto_1d_right) { + TestViewSubview::test_auto_1d<Kokkos::LayoutRight, TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_subview_auto_1d_stride) { + TestViewSubview::test_auto_1d<Kokkos::LayoutStride, TEST_EXECSPACE>(); +} +#endif + +TEST(TEST_CATEGORY, view_subview_assign_strided) { + TestViewSubview::test_1d_strided_assignment<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_subview_left_0) { + TestViewSubview::test_left_0<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_subview_left_1) { + TestViewSubview::test_left_1<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_subview_left_2) { + TestViewSubview::test_left_2<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_subview_left_3) { + TestViewSubview::test_left_3<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_subview_right_0) { + TestViewSubview::test_right_0<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_subview_right_1) { + TestViewSubview::test_right_1<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_subview_right_3) { + TestViewSubview::test_right_3<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, view_static_tests) { + TestViewSubview::TestSubviewStaticSizes<TEST_EXECSPACE, + Kokkos::LayoutLeft>()(); + TestViewSubview::TestSubviewStaticSizes<TEST_EXECSPACE, + Kokkos::LayoutRight>()(); + TestViewSubview::TestExtentsStaticTests<TEST_EXECSPACE>(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_b.hpp b/packages/kokkos/core/unit_test/TestSubView_b.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..d83b7c832766660ff2fdba44b9a9b4a8a8fcdc1b --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_b.hpp @@ -0,0 +1,68 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_B_HPP +#define KOKKOS_TEST_SUBVIEW_B_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_layoutleft_to_layoutleft) { + TestViewSubview::test_layoutleft_to_layoutleft<TEST_EXECSPACE>(); + TestViewSubview::test_layoutleft_to_layoutleft< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >(); + TestViewSubview::test_layoutleft_to_layoutleft< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >(); +} + +TEST(TEST_CATEGORY, view_subview_layoutright_to_layoutright) { + TestViewSubview::test_layoutright_to_layoutright<TEST_EXECSPACE>(); + TestViewSubview::test_layoutright_to_layoutright< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >(); + TestViewSubview::test_layoutright_to_layoutright< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c01.hpp b/packages/kokkos/core/unit_test/TestSubView_c01.hpp new file mode 100644 index 0000000000000000000000000000000000000000..03e19768d2331a784b5cf62cb86cd12da3cfd47a --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c01.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C01_HPP +#define KOKKOS_TEST_SUBVIEW_C01_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_1d_assign) { + TestViewSubview::test_1d_assign<TEST_EXECSPACE>(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c02.hpp b/packages/kokkos/core/unit_test/TestSubView_c02.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9fba8dbfd81101b439ac0ccd66747d0b1f12653e --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c02.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C02_HPP +#define KOKKOS_TEST_SUBVIEW_C02_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_1d_assign_atomic) { + TestViewSubview::test_1d_assign<TEST_EXECSPACE, + Kokkos::MemoryTraits<Kokkos::Atomic> >(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c03.hpp b/packages/kokkos/core/unit_test/TestSubView_c03.hpp new file mode 100644 index 0000000000000000000000000000000000000000..04391b8920b85b8947e7b013c1b9d7d68a793dc7 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c03.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C03_HPP +#define KOKKOS_TEST_SUBVIEW_C03_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_1d_assign_randomaccess) { + TestViewSubview::test_1d_assign< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c04.hpp b/packages/kokkos/core/unit_test/TestSubView_c04.hpp new file mode 100644 index 0000000000000000000000000000000000000000..64d5d4b406db3acb156aa10faad3ed9aa59ba056 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c04.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C04_HPP +#define KOKKOS_TEST_SUBVIEW_C04_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_2d_from_3d) { + TestViewSubview::test_2d_subview_3d<TEST_EXECSPACE>(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c05.hpp b/packages/kokkos/core/unit_test/TestSubView_c05.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ffc07800d13aebcae50fbc3d323fbfdc2eedd6f8 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c05.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C05_HPP +#define KOKKOS_TEST_SUBVIEW_C05_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_2d_from_3d_atomic) { + TestViewSubview::test_2d_subview_3d<TEST_EXECSPACE, + Kokkos::MemoryTraits<Kokkos::Atomic> >(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c06.hpp b/packages/kokkos/core/unit_test/TestSubView_c06.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f7f066f102f550b47a2b2ffbf885b5947094b896 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c06.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C06_HPP +#define KOKKOS_TEST_SUBVIEW_C06_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_2d_from_3d_randomaccess) { + TestViewSubview::test_2d_subview_3d< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c07.hpp b/packages/kokkos/core/unit_test/TestSubView_c07.hpp new file mode 100644 index 0000000000000000000000000000000000000000..87b5022826e5b05c6fbfd9ed03c9a0ee202fcb41 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c07.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C07_HPP +#define KOKKOS_TEST_SUBVIEW_C07_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_3d_from_5d_left) { + TestViewSubview::test_3d_subview_5d_left<TEST_EXECSPACE>(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c08.hpp b/packages/kokkos/core/unit_test/TestSubView_c08.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d18d4c1b7d0778a58ce2a795bfa47c598d2829d4 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c08.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C08_HPP +#define KOKKOS_TEST_SUBVIEW_C08_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_3d_from_5d_left_atomic) { + TestViewSubview::test_3d_subview_5d_left< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c09.hpp b/packages/kokkos/core/unit_test/TestSubView_c09.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8f7ece4d298691541f2bc232597309dd72942667 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c09.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C09_HPP +#define KOKKOS_TEST_SUBVIEW_C09_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_3d_from_5d_left_randomaccess) { + TestViewSubview::test_3d_subview_5d_left< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c10.hpp b/packages/kokkos/core/unit_test/TestSubView_c10.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0a0358b56f959aeb6bd45a595ede5a981f6644bb --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c10.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C10_HPP +#define KOKKOS_TEST_SUBVIEW_C10_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_3d_from_5d_right) { + TestViewSubview::test_3d_subview_5d_right<TEST_EXECSPACE>(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c11.hpp b/packages/kokkos/core/unit_test/TestSubView_c11.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bb0b34a646daf69dfdca5edee6cb3cc773dd68f2 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c11.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C11_HPP +#define KOKKOS_TEST_SUBVIEW_C11_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_3d_from_5d_right_atomic) { + TestViewSubview::test_3d_subview_5d_right< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c12.hpp b/packages/kokkos/core/unit_test/TestSubView_c12.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b3af606771caf81c820126677de68067128e296d --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c12.hpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C12_HPP +#define KOKKOS_TEST_SUBVIEW_C12_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_3d_from_5d_right_randomaccess) { + TestViewSubview::test_3d_subview_5d_right< + TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c13.hpp b/packages/kokkos/core/unit_test/TestSubView_c13.hpp new file mode 100644 index 0000000000000000000000000000000000000000..05c4767368c5b9813a140f76cd20b2044273c3a8 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c13.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C13_HPP +#define KOKKOS_TEST_SUBVIEW_C13_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_test_unmanaged_subview_reset) { + TestViewSubview::test_unmanaged_subview_reset<TEST_EXECSPACE>(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestSubView_c14.hpp b/packages/kokkos/core/unit_test/TestSubView_c14.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e6510c83a603481a9b8de0367894ec98407faba3 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSubView_c14.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SUBVIEW_C14_HPP +#define KOKKOS_TEST_SUBVIEW_C14_HPP +#include <TestViewSubview.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_subview_memory_traits_construction) { + TestViewSubview::test_subview_memory_traits_construction(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestTaskScheduler.hpp b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6b9cd2c90f5d3b9a999e33bbbe7400c53b15aaf1 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp @@ -0,0 +1,901 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP +#define KOKKOS_UNITTEST_TASKSCHEDULER_HPP + +#include <Kokkos_Macros.hpp> +#if defined(KOKKOS_ENABLE_TASKDAG) +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_FixedBufferMemoryPool.hpp> +#include <cstdio> +#include <iostream> +#include <cmath> + +//============================================================================== +// <editor-fold desc="TestFib"> {{{1 + +namespace TestTaskScheduler { + +namespace { + +inline long eval_fib(long n) { + constexpr long mask = 0x03; + + long fib[4] = {0, 1, 1, 2}; + + for (long i = 2; i <= n; ++i) { + fib[i & mask] = fib[(i - 1) & mask] + fib[(i - 2) & mask]; + } + + return fib[n & mask]; +} + +} // namespace + +template <typename Scheduler> +struct TestFib { + using sched_type = Scheduler; + using future_type = Kokkos::BasicFuture<long, Scheduler>; + using value_type = long; + + future_type fib_m1; + future_type fib_m2; + const value_type n; + + KOKKOS_INLINE_FUNCTION + TestFib(const value_type arg_n) : fib_m1(), fib_m2(), n(arg_n) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename sched_type::member_type& member, + value_type& result) { +#if 0 + printf( "\nTestFib(%ld) %d %d\n", n, int( !fib_m1.is_null() ), int( !fib_m2.is_null() ) ); +#endif + + auto& sched = member.scheduler(); + + if (n < 2) { + result = n; + } else if (!fib_m2.is_null() && !fib_m1.is_null()) { + result = fib_m1.get() + fib_m2.get(); + } else { + // Spawn new children and respawn myself to sum their results. + // Spawn lower value at higher priority as it has a shorter + // path to completion. 
+ + fib_m2 = Kokkos::task_spawn( + Kokkos::TaskSingle(sched, Kokkos::TaskPriority::High), + TestFib(n - 2)); + + fib_m1 = Kokkos::task_spawn(Kokkos::TaskSingle(sched), TestFib(n - 1)); + + Kokkos::BasicFuture<void, Scheduler> dep[] = {fib_m1, fib_m2}; + Kokkos::BasicFuture<void, Scheduler> fib_all = sched.when_all(dep, 2); + + if (!fib_m2.is_null() && !fib_m1.is_null() && !fib_all.is_null()) { + // High priority to retire this branch. + Kokkos::respawn(this, fib_all, Kokkos::TaskPriority::High); + } else { +#if 1 + printf( + "TestFib(%ld) insufficient memory alloc_capacity(%d) task_max(%d) " + "task_accum(%ld)\n", + n, 0 // sched.allocation_capacity() + , + 0 // sched.allocated_task_count_max() + , + 0l // sched.allocated_task_count_accum() + ); +#endif + + Kokkos::abort("TestFib insufficient memory"); + } + } + } + + static void run(int i, size_t MemoryCapacity = 16000) { + using memory_space = typename sched_type::memory_space; + + enum { MinBlockSize = 64 }; + enum { MaxBlockSize = 1024 }; + enum { SuperBlockSize = 4096 }; + + sched_type root_sched(memory_space(), MemoryCapacity, MinBlockSize, + std::min(size_t(MaxBlockSize), MemoryCapacity), + std::min(size_t(SuperBlockSize), MemoryCapacity)); + + { + future_type f = + Kokkos::host_spawn(Kokkos::TaskSingle(root_sched), TestFib(i)); + + Kokkos::wait(root_sched); + + ASSERT_EQ(eval_fib(i), f.get()); + } + + ASSERT_EQ(root_sched.queue().allocation_count(), 0); + +#if 0 + fprintf( stdout, "\nTestFib::run(%d) spawn_size(%d) when_all_size(%d) alloc_capacity(%d) task_max(%d) task_accum(%ld)\n" + , i + , int(root_sched.template spawn_allocation_size<TestFib>()) + , int(root_sched.when_all_allocation_size(2)) + , root_sched.allocation_capacity() + , root_sched.allocated_task_count_max() + , root_sched.allocated_task_count_accum() + ); + fflush( stdout ); +#endif + } +}; + +} // namespace TestTaskScheduler + +// </editor-fold> end TestFib }}}1 
+//============================================================================== + +//---------------------------------------------------------------------------- + +//============================================================================== +// <editor-fold desc="TestTaskDependence"> {{{1 + +namespace TestTaskScheduler { + +template <class Scheduler> +struct TestTaskDependence { + using sched_type = Scheduler; + using future_type = Kokkos::BasicFuture<void, Scheduler>; + using accum_type = Kokkos::View<long, typename sched_type::execution_space>; + using value_type = void; + + accum_type m_accum; + long m_count; + + KOKKOS_INLINE_FUNCTION + TestTaskDependence(long n, const accum_type& arg_accum) + : m_accum(arg_accum), m_count(n) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename sched_type::member_type& member) { + auto& sched = member.scheduler(); + enum { CHUNK = 8 }; + const int n = CHUNK < m_count ? CHUNK : m_count; + + if (1 < m_count) { + const int increment = (m_count + n - 1) / n; + + future_type f = sched.when_all(n, [this, &member, increment](int i) { + const long inc = increment; + const long begin = i * inc; + const long count = begin + inc < m_count ? 
inc : m_count - begin; + + return Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), + TestTaskDependence(count, m_accum)); + }); + + m_count = 0; + + Kokkos::respawn(this, f); + } else if (1 == m_count) { + Kokkos::atomic_increment(&m_accum()); + } + } + + static void run(int n) { + using memory_space = typename sched_type::memory_space; + + enum { MemoryCapacity = 16000 }; + enum { MinBlockSize = 64 }; + enum { MaxBlockSize = 1024 }; + enum { SuperBlockSize = 4096 }; + + sched_type sched(memory_space(), MemoryCapacity, MinBlockSize, MaxBlockSize, + SuperBlockSize); + + accum_type accum("accum"); + + typename accum_type::HostMirror host_accum = + Kokkos::create_mirror_view(accum); + + Kokkos::host_spawn(Kokkos::TaskSingle(sched), TestTaskDependence(n, accum)); + + Kokkos::wait(sched); + + Kokkos::deep_copy(host_accum, accum); + + ASSERT_EQ(host_accum(), n); + } +}; + +} // namespace TestTaskScheduler + +// </editor-fold> end TestTaskDependence }}}1 +//============================================================================== + +//---------------------------------------------------------------------------- + +namespace TestTaskScheduler { + +template <class Scheduler> +struct TestTaskTeam { + // enum { SPAN = 8 }; + enum { SPAN = 33 }; + // enum { SPAN = 1 }; + + using value_type = void; + using sched_type = Scheduler; + using future_type = Kokkos::BasicFuture<void, sched_type>; + using ExecSpace = typename sched_type::execution_space; + using view_type = Kokkos::View<long*, ExecSpace>; + + future_type future; + + view_type parfor_result; + view_type parreduce_check; + view_type parscan_result; + view_type parscan_check; + const long nvalue; + + KOKKOS_INLINE_FUNCTION + TestTaskTeam(const view_type& arg_parfor_result, + const view_type& arg_parreduce_check, + const view_type& arg_parscan_result, + const view_type& arg_parscan_check, const long arg_nvalue) + : future(), + parfor_result(arg_parfor_result), + parreduce_check(arg_parreduce_check), + 
parscan_result(arg_parscan_result), + parscan_check(arg_parscan_check), + nvalue(arg_nvalue) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename sched_type::member_type& member) { + auto& sched = member.scheduler(); + const long end = nvalue + 1; + // begin = max(end - SPAN, 0); + const long begin = 0 < end - SPAN ? end - SPAN : 0; + + if (0 < begin && future.is_null()) { + if (member.team_rank() == 0) { + future = Kokkos::task_spawn( + Kokkos::TaskTeam(sched), + TestTaskTeam(parfor_result, parreduce_check, parscan_result, + parscan_check, begin - 1)); + +#if !defined(__HIP_DEVICE_COMPILE__) && !defined(__CUDA_ARCH__) + assert(!future.is_null()); +#endif + + Kokkos::respawn(this, future); + } + + return; + } + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, begin, end), + [&](int i) { parfor_result[i] = i; }); + + // Test parallel_reduce without join. + + long tot = 0; + long expected = (begin + end - 1) * (end - begin) * 0.5; + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, begin, end), + [&](int i, long& res) { res += parfor_result[i]; }, tot); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, begin, end), + [&](int i) { parreduce_check[i] = expected - tot; }); + + // Test parallel_reduce with join. + + tot = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, begin, end), + [&](int i, long& res) { res += parfor_result[i]; }, + Kokkos::Sum<long>(tot)); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, begin, end), + [&](int i) { parreduce_check[i] += expected - tot; }); + + // Test parallel_scan. + + // Exclusive scan. + Kokkos::parallel_scan<long>(Kokkos::TeamThreadRange(member, begin, end), + [&](int i, long& val, const bool final) { + if (final) { + parscan_result[i] = val; + } + + val += i; + }); + + // Wait for 'parscan_result' before testing it. 
+ member.team_barrier(); + + if (member.team_rank() == 0) { + for (long i = begin; i < end; ++i) { + parscan_check[i] = + (i * (i - 1) - begin * (begin - 1)) * 0.5 - parscan_result[i]; + } + } + + // Don't overwrite 'parscan_result' until it has been tested. + member.team_barrier(); + + // Inclusive scan. + Kokkos::parallel_scan<long>(Kokkos::TeamThreadRange(member, begin, end), + [&](int i, long& val, const bool final) { + val += i; + + if (final) { + parscan_result[i] = val; + } + }); + + // Wait for 'parscan_result' before testing it. + member.team_barrier(); + + if (member.team_rank() == 0) { + for (long i = begin; i < end; ++i) { + parscan_check[i] += + (i * (i + 1) - begin * (begin - 1)) * 0.5 - parscan_result[i]; + } + } + + // ThreadVectorRange check. + /* + long result = 0; + expected = ( begin + end - 1 ) * ( end - begin ) * 0.5; + Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, 0, 1 ) + , [&] ( const int i, long & outerUpdate ) + { + long sum_j = 0.0; + + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member, end - + begin ) , [&] ( const int j, long & innerUpdate ) + { + innerUpdate += begin + j; + }, sum_j ); + + outerUpdate += sum_j; + }, result ); + + Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end ) + , [&] ( int i ) + { + parreduce_check[i] += result - expected; + }); + */ + } + + static void run(long n) { + const unsigned memory_capacity = 400000; + + enum { MinBlockSize = 64 }; + enum { MaxBlockSize = 1024 }; + enum { SuperBlockSize = 4096 }; + + sched_type root_sched(typename sched_type::memory_space(), memory_capacity, + MinBlockSize, MaxBlockSize, SuperBlockSize); + + view_type root_parfor_result("parfor_result", n + 1); + view_type root_parreduce_check("parreduce_check", n + 1); + view_type root_parscan_result("parscan_result", n + 1); + view_type root_parscan_check("parscan_check", n + 1); + + typename view_type::HostMirror host_parfor_result = + Kokkos::create_mirror_view(root_parfor_result); + typename 
view_type::HostMirror host_parreduce_check = + Kokkos::create_mirror_view(root_parreduce_check); + typename view_type::HostMirror host_parscan_result = + Kokkos::create_mirror_view(root_parscan_result); + typename view_type::HostMirror host_parscan_check = + Kokkos::create_mirror_view(root_parscan_check); + + future_type f = Kokkos::host_spawn( + Kokkos::TaskTeam(root_sched), + TestTaskTeam(root_parfor_result, root_parreduce_check, + root_parscan_result, root_parscan_check, n)); + + Kokkos::wait(root_sched); + + Kokkos::deep_copy(host_parfor_result, root_parfor_result); + Kokkos::deep_copy(host_parreduce_check, root_parreduce_check); + Kokkos::deep_copy(host_parscan_result, root_parscan_result); + Kokkos::deep_copy(host_parscan_check, root_parscan_check); + + long error_count = 0; + + for (long i = 0; i <= n; ++i) { + const long answer = i; + + if (host_parfor_result(i) != answer) { + ++error_count; + std::cerr << "TestTaskTeam::run ERROR parallel_for result(" << i + << ") = " << host_parfor_result(i) << " != " << answer + << std::endl; + } + + if (host_parreduce_check(i) != 0) { + ++error_count; + std::cerr << "TestTaskTeam::run ERROR parallel_reduce check(" << i + << ") = " << host_parreduce_check(i) << " != 0" << std::endl; + } + + if (host_parscan_check(i) != 0) { + ++error_count; + std::cerr << "TestTaskTeam::run ERROR parallel_scan check(" << i + << ") = " << host_parscan_check(i) << " != 0" << std::endl; + } + } + + ASSERT_EQ(0L, error_count); + } +}; + +template <class Scheduler> +struct TestTaskTeamValue { + enum { SPAN = 8 }; + + using value_type = long; + using sched_type = Scheduler; + using future_type = Kokkos::BasicFuture<value_type, sched_type>; + using ExecSpace = typename sched_type::execution_space; + using view_type = Kokkos::View<long*, ExecSpace>; + + future_type future; + + view_type result; + const long nvalue; + + KOKKOS_INLINE_FUNCTION + TestTaskTeamValue(const view_type& arg_result, const long arg_nvalue) + : future(), result(arg_result), 
nvalue(arg_nvalue) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename sched_type::member_type const& member, + value_type& final) { + const long end = nvalue + 1; + const long begin = 0 < end - SPAN ? end - SPAN : 0; + + auto& sched = member.scheduler(); + + if (0 < begin && future.is_null()) { + if (member.team_rank() == 0) { + future = sched.task_spawn(TestTaskTeamValue(result, begin - 1), + Kokkos::TaskTeam); + +#if !defined(__HIP_DEVICE_COMPILE__) && !defined(__CUDA_ARCH__) + assert(!future.is_null()); +#endif + + sched.respawn(this, future); + } + + return; + } + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, begin, end), + [&](int i) { result[i] = i + 1; }); + + if (member.team_rank() == 0) { + final = result[nvalue]; + } + + Kokkos::memory_fence(); + } + + static void run(long n) { + const unsigned memory_capacity = 100000; + + enum { MinBlockSize = 64 }; + enum { MaxBlockSize = 1024 }; + enum { SuperBlockSize = 4096 }; + + sched_type root_sched(typename sched_type::memory_space(), memory_capacity, + MinBlockSize, MaxBlockSize, SuperBlockSize); + + view_type root_result("result", n + 1); + + typename view_type::HostMirror host_result = + Kokkos::create_mirror_view(root_result); + + future_type fv = root_sched.host_spawn(TestTaskTeamValue(root_result, n), + Kokkos::TaskTeam); + + Kokkos::wait(root_sched); + + Kokkos::deep_copy(host_result, root_result); + + if (fv.get() != n + 1) { + std::cerr << "TestTaskTeamValue ERROR future = " << fv.get() + << " != " << n + 1 << std::endl; + } + + for (long i = 0; i <= n; ++i) { + const long answer = i + 1; + + if (host_result(i) != answer) { + std::cerr << "TestTaskTeamValue ERROR result(" << i + << ") = " << host_result(i) << " != " << answer << std::endl; + } + } + } +}; + +} // namespace TestTaskScheduler + +//---------------------------------------------------------------------------- + +namespace TestTaskScheduler { + +template <class Scheduler> +struct TestTaskSpawnWithPool { + using sched_type = 
Scheduler; + using future_type = Kokkos::BasicFuture<void, sched_type>; + using value_type = void; + using Space = typename sched_type::execution_space; + + int m_count; + Kokkos::MemoryPool<Space> m_pool; + + KOKKOS_INLINE_FUNCTION + TestTaskSpawnWithPool(const int& arg_count, + const Kokkos::MemoryPool<Space>& arg_pool) + : m_count(arg_count), m_pool(arg_pool) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename sched_type::member_type& member) { + if (m_count) { + Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), + TestTaskSpawnWithPool(m_count - 1, m_pool)); + } + } + + static void run() { + using memory_space = typename sched_type::memory_space; + + enum { MemoryCapacity = 16000 }; + enum { MinBlockSize = 64 }; + enum { MaxBlockSize = 1024 }; + enum { SuperBlockSize = 4096 }; + + sched_type sched(memory_space(), MemoryCapacity, MinBlockSize, MaxBlockSize, + SuperBlockSize); + + using other_memory_space = typename Space::memory_space; + Kokkos::MemoryPool<Space> pool(other_memory_space(), 10000, 100, 200, 1000); + auto f = Kokkos::host_spawn(Kokkos::TaskSingle(sched), + TestTaskSpawnWithPool(3, pool)); + + Kokkos::wait(sched); + } +}; + +} // namespace TestTaskScheduler + +//---------------------------------------------------------------------------- + +namespace TestTaskScheduler { + +template <class Scheduler> +struct TestTaskCtorsDevice { + using sched_type = Scheduler; + using future_type = Kokkos::BasicFuture<void, sched_type>; + using value_type = void; + using Space = typename sched_type::execution_space; + + int m_count; + + KOKKOS_INLINE_FUNCTION + TestTaskCtorsDevice(const int& arg_count) : m_count(arg_count) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename sched_type::member_type& member) { + // Note: Default construction on the device is not allowed + if (m_count == 4) { + Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), + TestTaskCtorsDevice(m_count - 1)); + } else if (m_count == 3) { + sched_type s = 
member.scheduler(); // move construct + s = member.scheduler(); // move assignment + Kokkos::task_spawn(Kokkos::TaskSingle(s), + TestTaskCtorsDevice(m_count - 1)); + } else if (m_count == 2) { + sched_type s3 = + member.scheduler(); // move construct from member.scheduler(); + Kokkos::task_spawn(Kokkos::TaskSingle(s3), + TestTaskCtorsDevice(m_count - 1)); + } else if (m_count == 1) { + sched_type s = + member.scheduler(); // move construct from member.scheduler(); + sched_type s2 = s; // copy construct from s + Kokkos::task_spawn(Kokkos::TaskSingle(s2), + TestTaskCtorsDevice(m_count - 1)); + } + } + + static void run() { + using memory_space = typename sched_type::memory_space; + + enum { MemoryCapacity = 16000 }; + enum { MinBlockSize = 64 }; + enum { MaxBlockSize = 1024 }; + enum { SuperBlockSize = 4096 }; + + sched_type sched(memory_space(), MemoryCapacity, MinBlockSize, MaxBlockSize, + SuperBlockSize); + + auto f = + Kokkos::host_spawn(Kokkos::TaskSingle(sched), TestTaskCtorsDevice(4)); + + Kokkos::wait(sched); + + // TODO assertions and sanity checks + } +}; + +} // namespace TestTaskScheduler + +//---------------------------------------------------------------------------- + +namespace TestTaskScheduler { + +template <class Scheduler> +struct TestMultipleDependence { + using sched_type = Scheduler; + using future_bool = Kokkos::BasicFuture<bool, sched_type>; + using future_int = Kokkos::BasicFuture<int, sched_type>; + using value_type = bool; + using execution_space = typename sched_type::execution_space; + + enum : int { NPerDepth = 6 }; + enum : int { NFanout = 3 }; + + // xlC doesn't like incomplete aggregate constructors, so we have do do this + // manually: + KOKKOS_INLINE_FUNCTION + TestMultipleDependence(int depth, int max_depth) + : m_depth(depth), m_max_depth(max_depth), m_dep() { + // gcc 4.8 has an internal compile error when I give the initializer in the + // class, so I have do do it here + for (int i = 0; i < NPerDepth; ++i) { + 
m_result_futures[i] = future_bool(); + } + } + + // xlC doesn't like incomplete aggregate constructors, so we have do do this + // manually: + KOKKOS_INLINE_FUNCTION + TestMultipleDependence(int depth, int max_depth, future_int dep) + : m_depth(depth), m_max_depth(max_depth), m_dep(dep) { + // gcc 4.8 has an internal compile error when I give the initializer in the + // class, so I have do do it here + for (int i = 0; i < NPerDepth; ++i) { + m_result_futures[i] = future_bool(); + } + } + + int m_depth; + int m_max_depth; + future_int m_dep; + future_bool m_result_futures[NPerDepth]; + + struct TestCheckReady { + future_int m_dep; + using value_type = bool; + KOKKOS_INLINE_FUNCTION + void operator()(typename Scheduler::member_type&, bool& value) { + // if it was "transiently" ready, this could be false even if we made it a + // dependence of this task + value = m_dep.is_ready(); + return; + } + }; + + struct TestComputeValue { + using value_type = int; + KOKKOS_INLINE_FUNCTION + void operator()(typename Scheduler::member_type&, int& result) { + double value = 0; + // keep this one busy for a while + for (int i = 0; i < 10000; ++i) { + value += i * i / 7.138 / value; + } + // Do something irrelevant + result = int(value) << 2; + return; + } + }; + + KOKKOS_INLINE_FUNCTION + void operator()(typename sched_type::member_type& member, bool& value) { + if (m_result_futures[0].is_null()) { + if (m_depth == 0) { + // Spawn one expensive task at the root + m_dep = Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), + TestComputeValue{}); + } + + // Then check for it to be ready in a whole bunch of other tasks that race + int n_checkers = NPerDepth; + if (m_depth < m_max_depth) { + n_checkers -= NFanout; + for (int i = n_checkers; i < NPerDepth; ++i) { + m_result_futures[i] = + Kokkos::task_spawn(Kokkos::TaskSingle(member.scheduler()), + TestMultipleDependence<Scheduler>( + m_depth + 1, m_max_depth, m_dep)); + } + } + + for (int i = 0; i < n_checkers; ++i) { + 
m_result_futures[i] = member.scheduler().spawn( + Kokkos::TaskSingle(m_dep), TestCheckReady{m_dep}); + } + auto done = member.scheduler().when_all(m_result_futures, NPerDepth); + Kokkos::respawn(this, done); + + return; + } else { + value = true; + for (int i = 0; i < NPerDepth; ++i) { + value = value && !m_result_futures[i].is_null(); + if (value) { + value = value && m_result_futures[i].get(); + } + } + return; + } + } + + static void run(int depth) { + using memory_space = typename sched_type::memory_space; + + enum { MemoryCapacity = 1 << 30 }; + enum { MinBlockSize = 64 }; + enum { MaxBlockSize = 1024 }; + enum { SuperBlockSize = 4096 }; + + sched_type sched(memory_space(), MemoryCapacity, MinBlockSize, MaxBlockSize, + SuperBlockSize); + + auto f = Kokkos::host_spawn(Kokkos::TaskSingle(sched), + TestMultipleDependence<Scheduler>(0, depth)); + + Kokkos::wait(sched); + + ASSERT_TRUE(f.get()); + } +}; + +} // namespace TestTaskScheduler + +//---------------------------------------------------------------------------- + +#define KOKKOS_PP_CAT_IMPL(x, y) x##y +#define KOKKOS_TEST_WITH_SUFFIX(x, y) KOKKOS_PP_CAT_IMPL(x, y) + +#define TEST_SCHEDULER_SUFFIX _deprecated +#define TEST_SCHEDULER Kokkos::DeprecatedTaskScheduler<TEST_EXECSPACE> +#include "TestTaskScheduler_single.hpp" +#undef TEST_SCHEDULER +#undef TEST_SCHEDULER_SUFFIX + +#define TEST_SCHEDULER_SUFFIX _deprecated_multiple +#define TEST_SCHEDULER Kokkos::DeprecatedTaskSchedulerMultiple<TEST_EXECSPACE> +#include "TestTaskScheduler_single.hpp" +#undef TEST_SCHEDULER +#undef TEST_SCHEDULER_SUFFIX + +#define TEST_SCHEDULER_SUFFIX _single +#define TEST_SCHEDULER Kokkos::TaskScheduler<TEST_EXECSPACE> +#include "TestTaskScheduler_single.hpp" +#undef TEST_SCHEDULER +#undef TEST_SCHEDULER_SUFFIX + +#define TEST_SCHEDULER_SUFFIX _multiple +#define TEST_SCHEDULER Kokkos::TaskSchedulerMultiple<TEST_EXECSPACE> +#include "TestTaskScheduler_single.hpp" +#undef TEST_SCHEDULER +#undef TEST_SCHEDULER_SUFFIX + +// KOKKOS 
WORKAROUND WIN32: Theses tests hang with msvc +#ifndef _WIN32 +#define TEST_SCHEDULER_SUFFIX _chase_lev +#define TEST_SCHEDULER Kokkos::ChaseLevTaskScheduler<TEST_EXECSPACE> +#include "TestTaskScheduler_single.hpp" +#undef TEST_SCHEDULER +#undef TEST_SCHEDULER_SUFFIX +#endif + +#if 0 +#define TEST_SCHEDULER_SUFFIX _fixed_mempool +#define TEST_SCHEDULER \ + Kokkos::SimpleTaskScheduler< \ + TEST_EXECSPACE, \ + Kokkos::Impl::SingleTaskQueue< \ + TEST_EXECSPACE, \ + Kokkos::Impl::default_tasking_memory_space_for_execution_space_t< \ + TEST_EXECSPACE>, \ + Kokkos::Impl::TaskQueueTraitsLockBased, \ + Kokkos::Impl::FixedBlockSizeMemoryPool< \ + Kokkos::Device< \ + TEST_EXECSPACE, \ + Kokkos::Impl:: \ + default_tasking_memory_space_for_execution_space_t< \ + TEST_EXECSPACE>>, \ + 128, 16>>> +#include "TestTaskScheduler_single.hpp" +#undef TEST_SCHEDULER +#undef TEST_SCHEDULER_SUFFIX + +#define TEST_SCHEDULER_SUFFIX _fixed_mempool_multiple +#define TEST_SCHEDULER \ + Kokkos::SimpleTaskScheduler< \ + TEST_EXECSPACE, \ + Kokkos::Impl::MultipleTaskQueue< \ + TEST_EXECSPACE, \ + Kokkos::Impl::default_tasking_memory_space_for_execution_space_t< \ + TEST_EXECSPACE>, \ + Kokkos::Impl::TaskQueueTraitsLockBased, \ + Kokkos::Impl::FixedBlockSizeMemoryPool< \ + Kokkos::Device< \ + TEST_EXECSPACE, \ + Kokkos::Impl:: \ + default_tasking_memory_space_for_execution_space_t< \ + TEST_EXECSPACE>>, \ + 128, 16>>> +#include "TestTaskScheduler_single.hpp" +#undef TEST_SCHEDULER +#undef TEST_SCHEDULER_SUFFIX +#endif + +#undef KOKKOS_TEST_WITH_SUFFIX +#undef KOKKOS_PP_CAT_IMPL + +#endif // #if defined( KOKKOS_ENABLE_TASKDAG ) +#endif // #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP diff --git a/packages/kokkos/core/unit_test/TestTaskScheduler_single.hpp b/packages/kokkos/core/unit_test/TestTaskScheduler_single.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c5d05382f8f26cd6c55d02a53ae4f3c16c45bf50 --- /dev/null +++ 
b/packages/kokkos/core/unit_test/TestTaskScheduler_single.hpp @@ -0,0 +1,92 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace Test { + +TEST(TEST_CATEGORY, KOKKOS_TEST_WITH_SUFFIX(task_fib, TEST_SCHEDULER_SUFFIX)) { + const int N = 27; + for (int i = 0; i < N; ++i) { + TestTaskScheduler::TestFib<TEST_SCHEDULER>::run(i, + (i + 1) * (i + 1) * 64000); + } +} + +TEST(TEST_CATEGORY, + KOKKOS_TEST_WITH_SUFFIX(task_depend, TEST_SCHEDULER_SUFFIX)) { + for (int i = 0; i < 25; ++i) { + TestTaskScheduler::TestTaskDependence<TEST_SCHEDULER>::run(i); + } +} + +TEST(TEST_CATEGORY, KOKKOS_TEST_WITH_SUFFIX(task_team, TEST_SCHEDULER_SUFFIX)) { + TestTaskScheduler::TestTaskTeam<TEST_SCHEDULER>::run(1000); + // TestTaskScheduler::TestTaskTeamValue< TEST_EXECSPACE >::run( 1000 ); // Put + // back after testing. +} + +TEST(TEST_CATEGORY, + KOKKOS_TEST_WITH_SUFFIX(task_with_mempool, TEST_SCHEDULER_SUFFIX)) { + TestTaskScheduler::TestTaskSpawnWithPool<TEST_SCHEDULER>::run(); +} + +TEST(TEST_CATEGORY, + KOKKOS_TEST_WITH_SUFFIX(task_multiple_depend, TEST_SCHEDULER_SUFFIX)) { + for (int i = 2; i < 6; ++i) { + TestTaskScheduler::TestMultipleDependence<TEST_SCHEDULER>::run(i); + } +} + +TEST(TEST_CATEGORY, + KOKKOS_TEST_WITH_SUFFIX(task_scheduler_ctors, TEST_SCHEDULER_SUFFIX)) { + TEST_SCHEDULER sched; + TEST_SCHEDULER sched2 = sched; + sched = sched2; +} + +TEST(TEST_CATEGORY, KOKKOS_TEST_WITH_SUFFIX(task_scheduer_ctors_device, + TEST_SCHEDULER_SUFFIX)) { + TestTaskScheduler::TestTaskCtorsDevice<TEST_SCHEDULER>::run(); +} + +} // end namespace Test diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp new file mode 100644 index 0000000000000000000000000000000000000000..97ddfd4cf58518bfa494eedf4445ba68fdb1132a --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeam.hpp @@ -0,0 +1,1566 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace { + +template <class ExecSpace, class ScheduleType> +struct TestTeamPolicy { + using team_member = + typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type; + using view_type = Kokkos::View<int **, ExecSpace>; + + view_type m_flags; + + TestTeamPolicy(const size_t league_size) + : m_flags( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"), + // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 32).team_size_max( + *this, Kokkos::ParallelReduceTag()), +#else + Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 1).team_size_max( + *this, Kokkos::ParallelReduceTag()), +#endif + league_size) { + } + + struct VerifyInitTag {}; + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &member) const { + const int tid = + member.team_rank() + member.team_size() * member.league_rank(); + + m_flags(member.team_rank(), member.league_rank()) = tid; + static_assert( + (std::is_same<typename team_member::execution_space, ExecSpace>::value), + "TeamMember::execution_space is not the same as " + "TeamPolicy<>::execution_space"); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const VerifyInitTag &, const team_member &member) const { + const int tid = + member.team_rank() + member.team_size() * member.league_rank(); + + if (tid != m_flags(member.team_rank(), member.league_rank())) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "TestTeamPolicy member(%d,%d) error %d != %d\n", member.league_rank(), + member.team_rank(), tid, + m_flags(member.team_rank(), member.league_rank())); + } + } + + // Included for test_small_league_size. 
+ TestTeamPolicy() : m_flags() {} + + // Included for test_small_league_size. + struct NoOpTag {}; + + KOKKOS_INLINE_FUNCTION + void operator()(const NoOpTag &, const team_member & /*member*/) const {} + + static void test_small_league_size() { + int bs = 8; // batch size (number of elements per batch) + int ns = 16; // total number of "problems" to process + + // Calculate total scratch memory space size. + const int level = 0; + int mem_size = 960; + const int num_teams = ns / bs; + Kokkos::TeamPolicy<ExecSpace, NoOpTag> policy(num_teams, Kokkos::AUTO()); + + Kokkos::parallel_for( + policy.set_scratch_size(level, Kokkos::PerTeam(mem_size), + Kokkos::PerThread(0)), + TestTeamPolicy()); + } + + static void test_constructors() { + constexpr const int smallest_work = 1; + // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(smallest_work, 32, + smallest_work); +#else + Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto( + smallest_work, smallest_work, smallest_work); +#endif + Kokkos::TeamPolicy<ExecSpace, NoOpTag> both_auto( + smallest_work, Kokkos::AUTO(), Kokkos::AUTO()); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(smallest_work, 32, + Kokkos::AUTO()); +#else + Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector( + smallest_work, smallest_work, Kokkos::AUTO()); +#endif + Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_team( + smallest_work, Kokkos::AUTO(), smallest_work); + } + + static void test_for(const size_t league_size) { + { + TestTeamPolicy functor(league_size); + using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>; + using policy_type_init = + Kokkos::TeamPolicy<ScheduleType, ExecSpace, VerifyInitTag>; + + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 
const int team_size = + policy_type(league_size, 32) + .team_size_max(functor, Kokkos::ParallelForTag()); + const int team_size_init = + policy_type_init(league_size, 32) + .team_size_max(functor, Kokkos::ParallelForTag()); +#else + const int team_size = + policy_type(league_size, 1) + .team_size_max(functor, Kokkos::ParallelForTag()); + const int team_size_init = + policy_type_init(league_size, 1) + .team_size_max(functor, Kokkos::ParallelForTag()); +#endif + + Kokkos::parallel_for(policy_type(league_size, team_size), functor); + Kokkos::parallel_for(policy_type_init(league_size, team_size_init), + functor); + } + + test_small_league_size(); + test_constructors(); + } + + struct ReduceTag {}; + + using value_type = int64_t; + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &member, value_type &update) const { + update += member.team_rank() + member.team_size() * member.league_rank(); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ReduceTag &, const team_member &member, + value_type &update) const { + update += + 1 + member.team_rank() + member.team_size() * member.league_rank(); + } + + static void test_reduce(const size_t league_size) { + TestTeamPolicy functor(league_size); + + using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>; + using policy_type_reduce = + Kokkos::TeamPolicy<ScheduleType, ExecSpace, ReduceTag>; + + // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + const int team_size = + policy_type_reduce(league_size, 32) + .team_size_max(functor, Kokkos::ParallelReduceTag()); +#else + const int team_size = + policy_type_reduce(league_size, 1) + .team_size_max(functor, Kokkos::ParallelReduceTag()); +#endif + + const int64_t N = team_size * league_size; + + int64_t total = 0; + + Kokkos::parallel_reduce(policy_type(league_size, team_size), functor, + total); + ASSERT_EQ(size_t((N - 1) * (N)) / 2, size_t(total)); + + 
Kokkos::parallel_reduce(policy_type_reduce(league_size, team_size), functor, + total); + ASSERT_EQ((size_t(N) * size_t(N + 1)) / 2, size_t(total)); + } +}; + +} // namespace + +} // namespace Test + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template <typename ScalarType, class DeviceType, class ScheduleType> +class ReduceTeamFunctor { + public: + using execution_space = DeviceType; + using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>; + using size_type = typename execution_space::size_type; + + struct value_type { + ScalarType value[3]; + }; + + const size_type nwork; + + KOKKOS_INLINE_FUNCTION + ReduceTeamFunctor(const size_type &arg_nwork) : nwork(arg_nwork) {} + + KOKKOS_INLINE_FUNCTION + ReduceTeamFunctor(const ReduceTeamFunctor &rhs) : nwork(rhs.nwork) {} + + KOKKOS_INLINE_FUNCTION + void init(value_type &dst) const { + dst.value[0] = 0; + dst.value[1] = 0; + dst.value[2] = 0; + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type &dst, const volatile value_type &src) const { + dst.value[0] += src.value[0]; + dst.value[1] += src.value[1]; + dst.value[2] += src.value[2]; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const typename policy_type::member_type ind, + value_type &dst) const { + const int thread_rank = + ind.team_rank() + ind.team_size() * ind.league_rank(); + const int thread_size = ind.team_size() * ind.league_size(); + const int chunk = (nwork + thread_size - 1) / thread_size; + + size_type iwork = chunk * thread_rank; + const size_type iwork_end = iwork + chunk < nwork ? 
iwork + chunk : nwork; + + for (; iwork < iwork_end; ++iwork) { + dst.value[0] += 1; + dst.value[1] += iwork + 1; + dst.value[2] += nwork - iwork; + } + } +}; + +} // namespace Test + +namespace { + +template <typename ScalarType, class DeviceType, class ScheduleType> +class TestReduceTeam { + public: + using execution_space = DeviceType; + using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>; + using size_type = typename execution_space::size_type; + + TestReduceTeam(const size_type &nwork) { run_test(nwork); } + + void run_test(const size_type &nwork) { + using functor_type = + Test::ReduceTeamFunctor<ScalarType, execution_space, ScheduleType>; + using value_type = typename functor_type::value_type; + using result_type = + Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + + enum { Count = 3 }; + enum { Repeat = 100 }; + + value_type result[Repeat]; + + const uint64_t nw = nwork; + const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1); + + policy_type team_exec(nw, 1); + + const unsigned team_size = team_exec.team_size_recommended( + functor_type(nwork), Kokkos::ParallelReduceTag()); + const unsigned league_size = (nwork + team_size - 1) / team_size; + + team_exec = policy_type(league_size, team_size); + + for (unsigned i = 0; i < Repeat; ++i) { + result_type tmp(&result[i]); + Kokkos::parallel_reduce(team_exec, functor_type(nwork), tmp); + } + + execution_space().fence(); + + for (unsigned i = 0; i < Repeat; ++i) { + for (unsigned j = 0; j < Count; ++j) { + const uint64_t correct = 0 == j % 3 ? 
nw : nsum; + ASSERT_EQ((ScalarType)correct, result[i].value[j]); + } + } + } +}; + +} // namespace + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template <class DeviceType, class ScheduleType> +class ScanTeamFunctor { + public: + using execution_space = DeviceType; + using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>; + using value_type = int64_t; + + Kokkos::View<value_type, execution_space> accum; + Kokkos::View<value_type, execution_space> total; + + ScanTeamFunctor() : accum("accum"), total("total") {} + + KOKKOS_INLINE_FUNCTION + void init(value_type &error) const { error = 0; } + + KOKKOS_INLINE_FUNCTION + void join(value_type volatile &error, + value_type volatile const &input) const { + if (input) error = 1; + } + + struct JoinMax { + using value_type = int64_t; + + KOKKOS_INLINE_FUNCTION + void join(value_type volatile &dst, + value_type volatile const &input) const { + if (dst < input) dst = input; + } + }; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename policy_type::member_type ind, + value_type &error) const { + if (0 == ind.league_rank() && 0 == ind.team_rank()) { + const int64_t thread_count = ind.league_size() * ind.team_size(); + total() = (thread_count * (thread_count + 1)) / 2; + } + + // Team max: + int64_t m = (int64_t)(ind.league_rank() + ind.team_rank()); + ind.team_reduce(Kokkos::Max<int64_t>(m)); + + if (m != ind.league_rank() + (ind.team_size() - 1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "ScanTeamFunctor[%i.%i of %i.%i] reduce_max_answer(%li) != " + "reduce_max(%li)\n", + static_cast<int>(ind.league_rank()), + static_cast<int>(ind.team_rank()), + static_cast<int>(ind.league_size()), + static_cast<int>(ind.team_size()), + static_cast<long>(ind.league_rank() + (ind.team_size() - 1)), + static_cast<long>(m)); + } + + // Scan: + const int64_t answer = (ind.league_rank() + 1) * ind.team_rank() + + (ind.team_rank() * (ind.team_rank() + 1)) / 2; + + const 
int64_t result = + ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1); + + const int64_t result2 = + ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1); + + if (answer != result || answer != result2) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "ScanTeamFunctor[%i.%i of %i.%i] answer(%li) != scan_first(%li) or " + "scan_second(%li)\n", + static_cast<int>(ind.league_rank()), + static_cast<int>(ind.team_rank()), + static_cast<int>(ind.league_size()), + static_cast<int>(ind.team_size()), static_cast<long>(answer), + static_cast<long>(result), static_cast<long>(result2)); + + error = 1; + } + + const int64_t thread_rank = + ind.team_rank() + ind.team_size() * ind.league_rank(); + ind.team_scan(1 + thread_rank, accum.data()); + } +}; + +template <class DeviceType, class ScheduleType> +class TestScanTeam { + public: + using execution_space = DeviceType; + using value_type = int64_t; + using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>; + using functor_type = Test::ScanTeamFunctor<DeviceType, ScheduleType>; + + TestScanTeam(const size_t nteam) { run_test(nteam); } + + void run_test(const size_t nteam) { + using result_type = + Kokkos::View<int64_t, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; + + const unsigned REPEAT = 100000; + unsigned Repeat; + + if (nteam == 0) { + Repeat = 1; + } else { + Repeat = (REPEAT + nteam - 1) / nteam; // Error here. 
+ } + + functor_type functor; + + policy_type team_exec(nteam, 1); + team_exec = policy_type( + nteam, team_exec.team_size_max(functor, Kokkos::ParallelReduceTag())); + + for (unsigned i = 0; i < Repeat; ++i) { + int64_t accum = 0; + int64_t total = 0; + int64_t error = 0; + Kokkos::deep_copy(functor.accum, total); + + Kokkos::parallel_reduce(team_exec, functor, result_type(&error)); + DeviceType().fence(); + + Kokkos::deep_copy(accum, functor.accum); + Kokkos::deep_copy(total, functor.total); + + ASSERT_EQ(error, 0); + ASSERT_EQ(total, accum); + } + + execution_space().fence(); + } +}; + +} // namespace Test + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +template <class ExecSpace, class ScheduleType> +struct SharedTeamFunctor { + using execution_space = ExecSpace; + using value_type = int; + using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>; + + enum { SHARED_COUNT = 1000 }; + + using shmem_space = typename ExecSpace::scratch_memory_space; + + // TBD: MemoryUnmanaged should be the default for shared memory space. + using shared_int_array_type = + Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>; + + // Tell how much shared memory will be required by this functor. 
+ inline unsigned team_shmem_size(int /*team_size*/) const { + return shared_int_array_type::shmem_size(SHARED_COUNT) + + shared_int_array_type::shmem_size(SHARED_COUNT); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const typename policy_type::member_type &ind, + value_type &update) const { + const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT); + const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT); + + if ((shared_A.data() == nullptr && SHARED_COUNT > 0) || + (shared_B.data() == nullptr && SHARED_COUNT > 0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "member( %i/%i , %i/%i ) Failed to allocate shared memory of size " + "%lu\n", + static_cast<int>(ind.league_rank()), + static_cast<int>(ind.league_size()), + static_cast<int>(ind.team_rank()), static_cast<int>(ind.team_size()), + static_cast<unsigned long>(SHARED_COUNT)); + + ++update; // Failure to allocate is an error. + } else { + for (int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size()) { + shared_A[i] = i + ind.league_rank(); + shared_B[i] = 2 * i + ind.league_rank(); + } + + ind.team_barrier(); + + if (ind.team_rank() + 1 == ind.team_size()) { + for (int i = 0; i < SHARED_COUNT; ++i) { + if (shared_A[i] != i + ind.league_rank()) { + ++update; + } + + if (shared_B[i] != 2 * i + ind.league_rank()) { + ++update; + } + } + } + } + } +}; + +} // namespace Test + +namespace { + +template <class ExecSpace, class ScheduleType> +struct TestSharedTeam { + TestSharedTeam() { run(); } + + void run() { + using Functor = Test::SharedTeamFunctor<ExecSpace, ScheduleType>; + using result_type = + Kokkos::View<typename Functor::value_type, Kokkos::HostSpace, + Kokkos::MemoryUnmanaged>; + +#ifdef KOKKOS_ENABLE_OPENMPTARGET + const size_t team_size = + Kokkos::TeamPolicy<ScheduleType, ExecSpace>(64, 32).team_size_max( + Functor(), Kokkos::ParallelReduceTag()); + + Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(32 / team_size, + team_size); +#else + const size_t team_size = + 
Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1).team_size_max( + Functor(), Kokkos::ParallelReduceTag()); + + Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size, + team_size); +#endif + + typename Functor::value_type error_count = 0; + + Kokkos::parallel_reduce(team_exec, Functor(), result_type(&error_count)); + Kokkos::fence(); + + ASSERT_EQ(error_count, 0); + } +}; + +} // namespace + +namespace Test { + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +template <class MemorySpace, class ExecSpace, class ScheduleType> +struct TestLambdaSharedTeam { + TestLambdaSharedTeam() { run(); } + + void run() { + using Functor = Test::SharedTeamFunctor<ExecSpace, ScheduleType>; + using result_type = Kokkos::View<typename Functor::value_type, MemorySpace, + Kokkos::MemoryUnmanaged>; + + using shmem_space = typename ExecSpace::scratch_memory_space; + + // TBD: MemoryUnmanaged should be the default for shared memory space. + using shared_int_array_type = + Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>; + + const int SHARED_COUNT = 1000; +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = 32; +#else + int team_size = 1; +#endif + +#ifdef KOKKOS_ENABLE_CUDA + if (std::is_same<ExecSpace, Kokkos::Cuda>::value) team_size = 128; +#endif + + Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size, + team_size); + team_exec = team_exec.set_scratch_size( + 0, Kokkos::PerTeam(SHARED_COUNT * 2 * sizeof(int))); + + typename Functor::value_type error_count = 0; + + Kokkos::parallel_reduce( + team_exec, + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy<ScheduleType, + ExecSpace>::member_type &ind, + int &update) { + const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT); + const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT); + + if ((shared_A.data() == nullptr && SHARED_COUNT > 0) || + (shared_B.data() == nullptr && SHARED_COUNT > 0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Failed to allocate shared memory 
of size %lu\n", + static_cast<unsigned long>(SHARED_COUNT)); + + ++update; // Failure to allocate is an error. + } else { + for (int i = ind.team_rank(); i < SHARED_COUNT; + i += ind.team_size()) { + shared_A[i] = i + ind.league_rank(); + shared_B[i] = 2 * i + ind.league_rank(); + } + + ind.team_barrier(); + + if (ind.team_rank() + 1 == ind.team_size()) { + for (int i = 0; i < SHARED_COUNT; ++i) { + if (shared_A[i] != i + ind.league_rank()) { + ++update; + } + + if (shared_B[i] != 2 * i + ind.league_rank()) { + ++update; + } + } + } + } + }, + result_type(&error_count)); + + Kokkos::fence(); + + ASSERT_EQ(error_count, 0); + } +}; +#endif + +} // namespace Test + +namespace Test { + +template <class ExecSpace, class ScheduleType> +struct ScratchTeamFunctor { + using execution_space = ExecSpace; + using value_type = int; + using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>; + + enum { SHARED_TEAM_COUNT = 100 }; + enum { SHARED_THREAD_COUNT = 10 }; + + using shmem_space = typename ExecSpace::scratch_memory_space; + + // TBD: MemoryUnmanaged should be the default for shared memory space. + using shared_int_array_type = + Kokkos::View<size_t *, shmem_space, Kokkos::MemoryUnmanaged>; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename policy_type::member_type &ind, + value_type &update) const { + const shared_int_array_type scratch_ptr(ind.team_scratch(1), + 3 * ind.team_size()); + const shared_int_array_type scratch_A(ind.team_scratch(1), + SHARED_TEAM_COUNT); + const shared_int_array_type scratch_B(ind.thread_scratch(1), + SHARED_THREAD_COUNT); + + if ((scratch_ptr.data() == nullptr) || + (scratch_A.data() == nullptr && SHARED_TEAM_COUNT > 0) || + (scratch_B.data() == nullptr && SHARED_THREAD_COUNT > 0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Failed to allocate shared memory of size %lu\n", + static_cast<unsigned long>(SHARED_TEAM_COUNT)); + + ++update; // Failure to allocate is an error. 
+ } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(ind, 0, (int)SHARED_TEAM_COUNT), + [&](const int &i) { scratch_A[i] = i + ind.league_rank(); }); + + for (int i = 0; i < SHARED_THREAD_COUNT; i++) { + scratch_B[i] = 10000 * ind.league_rank() + 100 * ind.team_rank() + i; + } + + scratch_ptr[ind.team_rank()] = (size_t)scratch_A.data(); + scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t)scratch_B.data(); + + ind.team_barrier(); + + for (int i = 0; i < SHARED_TEAM_COUNT; i++) { + if (scratch_A[i] != size_t(i + ind.league_rank())) ++update; + } + + for (int i = 0; i < ind.team_size(); i++) { + if (scratch_ptr[0] != scratch_ptr[i]) ++update; + } + + if (scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] < + SHARED_THREAD_COUNT * sizeof(size_t)) { + ++update; + } + + for (int i = 1; i < ind.team_size(); i++) { + if ((scratch_ptr[i + ind.team_size()] - + scratch_ptr[i - 1 + ind.team_size()]) != + (scratch_ptr[1 + ind.team_size()] - + scratch_ptr[0 + ind.team_size()])) { + ++update; + } + } + } + } +}; + +} // namespace Test + +namespace { + +template <class ExecSpace, class ScheduleType> +struct TestScratchTeam { + TestScratchTeam() { run(); } + + void run() { + using Functor = Test::ScratchTeamFunctor<ExecSpace, ScheduleType>; + using result_type = + Kokkos::View<typename Functor::value_type, Kokkos::HostSpace, + Kokkos::MemoryUnmanaged>; + using p_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>; + + typename Functor::value_type error_count = 0; + + int thread_scratch_size = Functor::shared_int_array_type::shmem_size( + Functor::SHARED_THREAD_COUNT); + +#ifdef KOKKOS_ENABLE_OPENMPTARGET + p_type team_exec = p_type(64, 32).set_scratch_size( + 1, + Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size( + Functor::SHARED_TEAM_COUNT)), + Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int))); +#else + p_type team_exec = p_type(8192, 1).set_scratch_size( + 1, + Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size( + 
Functor::SHARED_TEAM_COUNT)), + Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int))); +#endif + + const size_t team_size = + team_exec.team_size_max(Functor(), Kokkos::ParallelReduceTag()); + + int team_scratch_size = + Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) + + Functor::shared_int_array_type::shmem_size(3 * team_size); + +#ifdef KOKKOS_ENABLE_OPENMPTARGET + team_exec = p_type(64 / team_size, team_size); +#else + team_exec = p_type(8192 / team_size, team_size); +#endif + + Kokkos::parallel_reduce( + team_exec.set_scratch_size(1, Kokkos::PerTeam(team_scratch_size), + Kokkos::PerThread(thread_scratch_size)), + Functor(), result_type(&error_count)); + Kokkos::fence(); + ASSERT_EQ(error_count, 0); + } +}; + +} // namespace + +namespace Test { + +template <class ExecSpace> +KOKKOS_INLINE_FUNCTION int test_team_mulit_level_scratch_loop_body( + const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) { + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + a_team1(team.team_scratch(0), 128); + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + a_thread1(team.thread_scratch(0), 16); + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + a_team2(team.team_scratch(0), 128); + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + a_thread2(team.thread_scratch(0), 16); + + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + b_team1(team.team_scratch(1), 12800); + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + b_thread1(team.thread_scratch(1), 1600); + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + b_team2(team.team_scratch(1), 12800); + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + b_thread2(team.thread_scratch(1), 1600); + + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + 
a_team3(team.team_scratch(0), 128); + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + a_thread3(team.thread_scratch(0), 16); + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + b_team3(team.team_scratch(1), 12800); + Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > + b_thread3(team.thread_scratch(1), 1600); + + // The explicit types for 0 and 128 are here to test TeamThreadRange accepting + // different types for begin and end. + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(128)), + [&](const int &i) { + a_team1(i) = 1000000 + i + team.league_rank() * 100000; + a_team2(i) = 2000000 + i + team.league_rank() * 100000; + a_team3(i) = 3000000 + i + team.league_rank() * 100000; + }); + team.team_barrier(); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, int(0), unsigned(16)), + [&](const int &i) { + a_thread1(i) = 1000000 + 100000 * team.team_rank() + + 16 - i + team.league_rank() * 100000; + a_thread2(i) = 2000000 + 100000 * team.team_rank() + + 16 - i + team.league_rank() * 100000; + a_thread3(i) = 3000000 + 100000 * team.team_rank() + + 16 - i + team.league_rank() * 100000; + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(12800)), + [&](const int &i) { + b_team1(i) = 1000000 + i + team.league_rank() * 100000; + b_team2(i) = 2000000 + i + team.league_rank() * 100000; + b_team3(i) = 3000000 + i + team.league_rank() * 100000; + }); + team.team_barrier(); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 1600), + [&](const int &i) { + b_thread1(i) = 1000000 + 100000 * team.team_rank() + + 16 - i + team.league_rank() * 100000; + b_thread2(i) = 2000000 + 100000 * team.team_rank() + + 16 - i + team.league_rank() * 100000; + b_thread3(i) = 3000000 + 100000 * team.team_rank() + + 16 - i + team.league_rank() * 100000; + }); + + team.team_barrier(); + + int error = 0; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, 
128), [&](const int &i) { + if (a_team1(i) != 1000000 + i + team.league_rank() * 100000) error++; + if (a_team2(i) != 2000000 + i + team.league_rank() * 100000) error++; + if (a_team3(i) != 3000000 + i + team.league_rank() * 100000) error++; + }); + team.team_barrier(); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 16), [&](const int &i) { + if (a_thread1(i) != 1000000 + 100000 * team.team_rank() + 16 - i + + team.league_rank() * 100000) + error++; + if (a_thread2(i) != 2000000 + 100000 * team.team_rank() + 16 - i + + team.league_rank() * 100000) + error++; + if (a_thread3(i) != 3000000 + 100000 * team.team_rank() + 16 - i + + team.league_rank() * 100000) + error++; + }); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, 0, 12800), [&](const int &i) { + if (b_team1(i) != 1000000 + i + team.league_rank() * 100000) error++; + if (b_team2(i) != 2000000 + i + team.league_rank() * 100000) error++; + if (b_team3(i) != 3000000 + i + team.league_rank() * 100000) error++; + }); + team.team_barrier(); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, 1600), [&](const int &i) { + if (b_thread1(i) != 1000000 + 100000 * team.team_rank() + 16 - i + + team.league_rank() * 100000) + error++; + if (b_thread2(i) != 2000000 + 100000 * team.team_rank() + 16 - i + + team.league_rank() * 100000) + error++; + if (b_thread3(i) != 3000000 + 100000 * team.team_rank() + 16 - i + + team.league_rank() * 100000) + error++; + }); + + return error; +} + +struct TagReduce {}; +struct TagFor {}; + +template <class ExecSpace, class ScheduleType> +struct ClassNoShmemSizeFunction { + using member_type = + typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type; + + Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors; + + KOKKOS_INLINE_FUNCTION + void operator()(const TagFor &, const member_type &team) const { + int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + errors() += error; + } + + KOKKOS_INLINE_FUNCTION + void 
operator()(const TagReduce &, const member_type &team, + int &error) const { + error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + } + + void run() { + Kokkos::View<int, ExecSpace> d_errors = + Kokkos::View<int, ExecSpace>("Errors"); + errors = d_errors; + + const int per_team0 = + 3 * + Kokkos::View<double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128); + const int per_thread0 = + 3 * + Kokkos::View<double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16); + + const int per_team1 = + 3 * Kokkos::View< + double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800); + const int per_thread1 = + 3 * Kokkos::View< + double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600); + + int team_size = 8; + if (team_size > ExecSpace::concurrency()) + team_size = ExecSpace::concurrency(); + { + Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size, + 16); + + Kokkos::parallel_for( + policy + .set_scratch_size(0, Kokkos::PerTeam(per_team0), + Kokkos::PerThread(per_thread0)) + .set_scratch_size(1, Kokkos::PerTeam(per_team1), + Kokkos::PerThread(per_thread1)), + *this); + Kokkos::fence(); + + typename Kokkos::View<int, ExecSpace>::HostMirror h_errors = + Kokkos::create_mirror_view(d_errors); + Kokkos::deep_copy(h_errors, d_errors); + ASSERT_EQ(h_errors(), 0); + } + + { + int error = 0; + Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy( + 10, team_size, 16); + + Kokkos::parallel_reduce( + policy + .set_scratch_size(0, Kokkos::PerTeam(per_team0), + Kokkos::PerThread(per_thread0)) + .set_scratch_size(1, Kokkos::PerTeam(per_team1), + Kokkos::PerThread(per_thread1)), + *this, error); + + ASSERT_EQ(error, 0); + } + }; +}; + +template <class ExecSpace, class ScheduleType> +struct ClassWithShmemSizeFunction { + using member_type = + typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type; + + Kokkos::View<int, ExecSpace, 
Kokkos::MemoryTraits<Kokkos::Atomic> > errors; + + KOKKOS_INLINE_FUNCTION + void operator()(const TagFor &, const member_type &team) const { + int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + errors() += error; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TagReduce &, const member_type &team, + int &error) const { + error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + } + + void run() { + Kokkos::View<int, ExecSpace> d_errors = + Kokkos::View<int, ExecSpace>("Errors"); + errors = d_errors; + + const int per_team1 = + 3 * Kokkos::View< + double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800); + const int per_thread1 = + 3 * Kokkos::View< + double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600); + + int team_size = 8; + if (team_size > ExecSpace::concurrency()) + team_size = ExecSpace::concurrency(); + + { + Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size, + 16); + + Kokkos::parallel_for( + policy.set_scratch_size(1, Kokkos::PerTeam(per_team1), + Kokkos::PerThread(per_thread1)), + *this); + Kokkos::fence(); + + typename Kokkos::View<int, ExecSpace>::HostMirror h_errors = + Kokkos::create_mirror_view(d_errors); + Kokkos::deep_copy(h_errors, d_errors); + ASSERT_EQ(h_errors(), 0); + } + + { + int error = 0; + Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy( + 10, team_size, 16); + + Kokkos::parallel_reduce( + policy.set_scratch_size(1, Kokkos::PerTeam(per_team1), + Kokkos::PerThread(per_thread1)), + *this, error); + + ASSERT_EQ(error, 0); + } + }; + + unsigned team_shmem_size(int team_size) const { + const int per_team0 = + 3 * + Kokkos::View<double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128); + const int per_thread0 = + 3 * + Kokkos::View<double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16); + return per_team0 + team_size * per_thread0; + } +}; + +template <class 
ExecSpace, class ScheduleType> +void test_team_mulit_level_scratch_test_lambda() { +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors; + Kokkos::View<int, ExecSpace> d_errors("Errors"); + errors = d_errors; + + const int per_team0 = + 3 * + Kokkos::View<double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(128); + const int per_thread0 = + 3 * + Kokkos::View<double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(16); + + const int per_team1 = + 3 * + Kokkos::View<double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(12800); + const int per_thread1 = + 3 * + Kokkos::View<double *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600); + + int team_size = 8; + if (team_size > ExecSpace::concurrency()) + team_size = ExecSpace::concurrency(); + + Kokkos::TeamPolicy<ExecSpace, ScheduleType> policy(10, team_size, 16); + + Kokkos::parallel_for( + policy + .set_scratch_size(0, Kokkos::PerTeam(per_team0), + Kokkos::PerThread(per_thread0)) + .set_scratch_size(1, Kokkos::PerTeam(per_team1), + Kokkos::PerThread(per_thread1)), + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) { + int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + errors() += error; + }); + Kokkos::fence(); + + typename Kokkos::View<int, ExecSpace>::HostMirror h_errors = + Kokkos::create_mirror_view(errors); + Kokkos::deep_copy(h_errors, d_errors); + ASSERT_EQ(h_errors(), 0); + + int error = 0; + Kokkos::parallel_reduce( + policy + .set_scratch_size(0, Kokkos::PerTeam(per_team0), + Kokkos::PerThread(per_thread0)) + .set_scratch_size(1, Kokkos::PerTeam(per_team1), + Kokkos::PerThread(per_thread1)), + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team, + int &count) { + count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team); + }, + error); + ASSERT_EQ(error, 0); 
+#endif +} + +} // namespace Test + +namespace { + +template <class ExecSpace, class ScheduleType> +struct TestMultiLevelScratchTeam { + TestMultiLevelScratchTeam() { run(); } + + void run() { +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>(); +#endif + Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1; + c1.run(); + + Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2; + c2.run(); + } +}; + +} // namespace + +namespace Test { + +template <class ExecSpace> +struct TestShmemSize { + TestShmemSize() { run(); } + + void run() { + using view_type = Kokkos::View<int64_t ***, ExecSpace>; + + size_t d1 = 5; + size_t d2 = 6; + size_t d3 = 7; + + size_t size = view_type::shmem_size(d1, d2, d3); + + ASSERT_EQ(size, (d1 * d2 * d3 + 1) * sizeof(int64_t)); + + test_layout_stride(); + } + + void test_layout_stride() { + int rank = 3; + int order[3] = {2, 0, 1}; + int extents[3] = {100, 10, 3}; + auto s1 = + Kokkos::View<double ***, Kokkos::LayoutStride, ExecSpace>::shmem_size( + Kokkos::LayoutStride::order_dimensions(rank, order, extents)); + auto s2 = + Kokkos::View<double ***, Kokkos::LayoutRight, ExecSpace>::shmem_size( + extents[0], extents[1], extents[2]); + ASSERT_EQ(s1, s2); + } +}; + +} // namespace Test + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +namespace { + +template <class ExecSpace, class ScheduleType, class T, class Enabled = void> +struct TestTeamBroadcast; + +template <class ExecSpace, class ScheduleType, class T> +struct TestTeamBroadcast< + ExecSpace, ScheduleType, T, + typename std::enable_if<(sizeof(T) == sizeof(char)), void>::type> { + using team_member = + typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type; + using memory_space = typename ExecSpace::memory_space; + using value_type = T; + + const value_type offset; + + TestTeamBroadcast(const size_t /*league_size*/, const value_type os_) + : 
offset(os_) {} + + struct BroadcastTag {}; + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &teamMember, value_type &update) const { + int lid = teamMember.league_rank(); + int tid = teamMember.team_rank(); + int ts = teamMember.team_size(); + + value_type parUpdate = 0; + value_type value = (value_type)(tid % 0xFF) + offset; + + // broadcast boolean and value to team from source thread + teamMember.team_broadcast(value, lid % ts); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, ts), + [&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; }, + Kokkos::BOr<value_type, memory_space>(parUpdate)); + + if (teamMember.team_rank() == 0) update |= parUpdate; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const BroadcastTag &, const team_member &teamMember, + value_type &update) const { + int lid = teamMember.league_rank(); + int tid = teamMember.team_rank(); + int ts = teamMember.team_size(); + + value_type parUpdate = 0; + value_type value = (value_type)(tid % 0xFF) + offset; + + teamMember.team_broadcast([&](value_type &var) { var -= offset; }, value, + lid % ts); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, ts), + [&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; }, + Kokkos::BOr<value_type, memory_space>(parUpdate)); + + if (teamMember.team_rank() == 0) update |= parUpdate; + } + + static void test_teambroadcast(const size_t league_size, + const value_type off) { + TestTeamBroadcast functor(league_size, off); + + using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>; + using policy_type_f = + Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>; + + // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int fake_team_size = + std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 
32 + : 1; +#else + int fake_team_size = 1; +#endif + const int team_size = + policy_type_f(league_size, fake_team_size) + .team_size_max( + functor, + Kokkos:: + ParallelReduceTag()); // printf("team_size=%d\n",team_size); + + // team_broadcast with value + value_type total = 0; + + Kokkos::parallel_reduce(policy_type(league_size, team_size), functor, + Kokkos::BOr<value_type, Kokkos::HostSpace>(total)); + + value_type expected_result = 0; + for (unsigned int i = 0; i < league_size; i++) { + value_type val = (value_type((i % team_size % 0xFF)) + off); + expected_result |= val; + } + ASSERT_EQ(expected_result, total); + // printf("team_broadcast with value --" + //"expected_result=%x," + //"total=%x\n",expected_result, total); + + // team_broadcast with function object + total = 0; + + Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor, + Kokkos::BOr<value_type, Kokkos::HostSpace>(total)); + + expected_result = 0; + for (unsigned int i = 0; i < league_size; i++) { + value_type val = ((value_type)((i % team_size % 0xFF))); + expected_result |= val; + } + ASSERT_EQ(expected_result, total); + // printf("team_broadcast with function object --" + // "expected_result=%x," + // "total=%x\n",expected_result, total); + } +}; + +template <class ExecSpace, class ScheduleType, class T> +struct TestTeamBroadcast< + ExecSpace, ScheduleType, T, + typename std::enable_if<(sizeof(T) > sizeof(char)), void>::type> { + using team_member = + typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type; + using value_type = T; + + const value_type offset; + + TestTeamBroadcast(const size_t /*league_size*/, const value_type os_) + : offset(os_) {} + + struct BroadcastTag {}; + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &teamMember, value_type &update) const { + int lid = teamMember.league_rank(); + int tid = teamMember.team_rank(); + int ts = teamMember.team_size(); + + value_type parUpdate = 0; + value_type value = (value_type)(tid * 3) + 
offset; + + // setValue is used to determine if the update should be + // performed at the bottom. The thread id must match the + // thread id used to broadcast the value. It is the + // thread id that matches the league rank mod team size + // this way each league rank will use a different thread id + // which is likely not 0 + bool setValue = ((lid % ts) == tid); + + // broadcast boolean and value to team from source thread + teamMember.team_broadcast(value, lid % ts); + teamMember.team_broadcast(setValue, lid % ts); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, ts), + [&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; }, + parUpdate); + + if (teamMember.team_rank() == 0 && setValue) update += parUpdate; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const BroadcastTag &, const team_member &teamMember, + value_type &update) const { + int lid = teamMember.league_rank(); + int tid = teamMember.team_rank(); + int ts = teamMember.team_size(); + + value_type parUpdate = 0; + value_type value = (value_type)(tid * 3) + offset; + + // setValue is used to determine if the update should be + // performed at the bottom. The thread id must match the + // thread id used to broadcast the value. It is the + // thread id that matches the league rank mod team size + // this way each league rank will use a different thread id + // which is likely not 0. Note the logic is switched from + // above because the functor switches it back. 
+ bool setValue = ((lid % ts) != tid); + + teamMember.team_broadcast([&](value_type &var) { var *= 2; }, value, + lid % ts); + teamMember.team_broadcast([&](bool &bVar) { bVar = !bVar; }, setValue, + lid % ts); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(teamMember, ts), + [&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; }, + parUpdate); + + if (teamMember.team_rank() == 0 && setValue) update += parUpdate; + } + + template <class ScalarType> + static inline + typename std::enable_if<!std::is_integral<ScalarType>::value, void>::type + compare_test(ScalarType A, ScalarType B, double epsilon_factor) { + if (std::is_same<ScalarType, double>::value || + std::is_same<ScalarType, float>::value) { + ASSERT_NEAR((double)A, (double)B, + epsilon_factor * std::abs(A) * + std::numeric_limits<ScalarType>::epsilon()); + } else { + ASSERT_EQ(A, B); + } + } + + template <class ScalarType> + static inline + typename std::enable_if<std::is_integral<ScalarType>::value, void>::type + compare_test(ScalarType A, ScalarType B, double) { + ASSERT_EQ(A, B); + } + + static void test_teambroadcast(const size_t league_size, + const value_type off) { + TestTeamBroadcast functor(league_size, off); + + using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>; + using policy_type_f = + Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>; + + // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int fake_team_size = + std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 
32 + : 1; +#else + int fake_team_size = 1; +#endif + const int team_size = + policy_type_f(league_size, fake_team_size) + .team_size_max( + functor, + Kokkos:: + ParallelReduceTag()); // printf("team_size=%d\n",team_size); + // team_broadcast with value + value_type total = 0; + + Kokkos::parallel_reduce(policy_type(league_size, team_size), functor, + total); + + value_type expected_result = 0; + for (unsigned int i = 0; i < league_size; i++) { + value_type val = + (value_type((i % team_size) * 3) + off) * (value_type)team_size; + expected_result += val; + } + // For comparison purposes treat the reduction as a random walk in the + // least significant digit, which gives a typical walk distance + // sqrt(league_size) Add 4x for larger sigma + compare_test(expected_result, total, 4.0 * std::sqrt(league_size)); + + // team_broadcast with function object + total = 0; + + Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor, + total); + + expected_result = 0; + for (unsigned int i = 0; i < league_size; i++) { + value_type val = ((value_type)((i % team_size) * 3) + off) * + (value_type)(2 * team_size); + expected_result += val; + } + // For comparison purposes treat the reduction as a random walk in the + // least significant digit, which gives a typical walk distance + // sqrt(league_size) Add 4x for larger sigma + compare_test(expected_result, total, 4.0 * std::sqrt(league_size)); + } +}; + +template <class ExecSpace> +struct TestScratchAlignment { + struct TestScalar { + double x, y, z; + }; + TestScratchAlignment() { + test(true); + test(false); + } + using ScratchView = + Kokkos::View<TestScalar *, typename ExecSpace::scratch_memory_space>; + using ScratchViewInt = + Kokkos::View<int *, typename ExecSpace::scratch_memory_space>; + void test(bool allocate_small) { + int shmem_size = ScratchView::shmem_size(11); +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = 32; +#else + int team_size = 1; +#endif + if (allocate_small) shmem_size += 
ScratchViewInt::shmem_size(1); + Kokkos::parallel_for( + Kokkos::TeamPolicy<ExecSpace>(1, team_size) + .set_scratch_size(0, Kokkos::PerTeam(shmem_size)), + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) { + if (allocate_small) ScratchViewInt p(team.team_scratch(0), 1); + ScratchView a(team.team_scratch(0), 11); + if (ptrdiff_t(a.data()) % sizeof(TestScalar) != 0) + Kokkos::abort("Error: invalid scratch view alignment\n"); + }); + Kokkos::fence(); + } +}; + +} // namespace + +namespace { + +template <class ExecSpace> +struct TestTeamPolicyHandleByValue { + using scalar = double; + using exec_space = ExecSpace; + using mem_space = typename ExecSpace::memory_space; + + TestTeamPolicyHandleByValue() { test(); } + + void test() { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + const int M = 1, N = 1; + Kokkos::View<scalar **, mem_space> a("a", M, N); + Kokkos::View<scalar **, mem_space> b("b", M, N); + Kokkos::deep_copy(a, 0.0); + Kokkos::deep_copy(b, 1.0); + Kokkos::parallel_for( + "test_tphandle_by_value", + Kokkos::TeamPolicy<exec_space>(M, Kokkos::AUTO(), 1), + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy<exec_space>::member_type team) { + const int i = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, N), + [&](const int j) { a(i, j) += b(i, j); }); + }); +#endif + } +}; + +} // namespace + +} // namespace Test + +/*--------------------------------------------------------------------------*/ diff --git a/packages/kokkos/core/unit_test/TestTeamBasic.hpp b/packages/kokkos/core/unit_test/TestTeamBasic.hpp new file mode 100644 index 0000000000000000000000000000000000000000..87c010ac2a0c5701916049532a715c6a5addce15 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeamBasic.hpp @@ -0,0 +1,210 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_TEAM_BASIC_HPP +#define KOKKOS_TEST_TEAM_BASIC_HPP +#include <TestTeam.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, team_for) { + TestTeamPolicy<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( + 0); + TestTeamPolicy<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( + 0); + + TestTeamPolicy<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( + 2); + TestTeamPolicy<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( + 2); + + TestTeamPolicy<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( + 1000); + TestTeamPolicy<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( + 1000); +} + +// FIXME_OPENMPTARGET wrong results +#ifndef KOKKOS_ENABLE_OPENMPTARGET +TEST(TEST_CATEGORY, team_reduce) { + TestTeamPolicy<TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Static> >::test_reduce(0); + TestTeamPolicy<TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0); + TestTeamPolicy<TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Static> >::test_reduce(2); + TestTeamPolicy<TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2); + TestTeamPolicy<TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000); + TestTeamPolicy<TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000); +} +#endif + +TEST(TEST_CATEGORY, team_broadcast_long) { + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + long>::test_teambroadcast(0, 1); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + long>::test_teambroadcast(0, 1); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + long>::test_teambroadcast(2, 1); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + long>::test_teambroadcast(2, 1); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + 
long>::test_teambroadcast(16, 1); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + long>::test_teambroadcast(16, 1); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + long>::test_teambroadcast(1000, 1); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + long>::test_teambroadcast(1000, 1); +} + +TEST(TEST_CATEGORY, team_broadcast_char) { + { + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + unsigned char>::test_teambroadcast(0, 1); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + unsigned char>::test_teambroadcast(0, 1); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + unsigned char>::test_teambroadcast(2, 1); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + unsigned char>::test_teambroadcast(2, 1); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + unsigned char>::test_teambroadcast(16, 1); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + unsigned char>::test_teambroadcast(16, 1); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + long>::test_teambroadcast(1000, 1); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + long>::test_teambroadcast(1000, 1); + } +} + +TEST(TEST_CATEGORY, team_broadcast_float) { + { + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + float>::test_teambroadcast(0, 1.3); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + float>::test_teambroadcast(0, 1.3); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + float>::test_teambroadcast(2, 1.3); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + float>::test_teambroadcast(2, 1.3); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + float>::test_teambroadcast(16, 1.3); + TestTeamBroadcast<TEST_EXECSPACE, 
Kokkos::Schedule<Kokkos::Dynamic>, + float>::test_teambroadcast(16, 1.3); + + // FIXME_CUDA +#ifdef KOKKOS_ENABLE_CUDA + if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value) +#endif + // FIXME_HIP +#ifdef KOKKOS_ENABLE_HIP + if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value) +#endif + { + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + float>::test_teambroadcast(1000, 1.3); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + float>::test_teambroadcast(1000, 1.3); + } + } +} + +TEST(TEST_CATEGORY, team_broadcast_double) { + { + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + double>::test_teambroadcast(0, 1.3); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + double>::test_teambroadcast(0, 1.3); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + double>::test_teambroadcast(2, 1.3); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + double>::test_teambroadcast(2, 1.3); + + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + double>::test_teambroadcast(16, 1.3); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + double>::test_teambroadcast(16, 1.3); + + // FIXME_CUDA +#ifdef KOKKOS_ENABLE_CUDA + if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value) +#endif + // FIXME_HIP +#ifdef KOKKOS_ENABLE_HIP + if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value) +#endif + { + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>, + double>::test_teambroadcast(1000, 1.3); + TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>, + + double>::test_teambroadcast(1000, 1.3); + } + } +} + +TEST(TEST_CATEGORY, team_handle_by_value) { + { TestTeamPolicyHandleByValue<TEST_EXECSPACE>(); } +} + +} // namespace Test + +#ifndef KOKKOS_ENABLE_OPENMPTARGET +#include <TestTeamVector.hpp> +#endif +#endif diff --git 
a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3db0eafa339de221a8dad8feb3cf7b3fa62027f2 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp @@ -0,0 +1,107 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_TEAM_REDUCTION_SCAN_HPP +#define KOKKOS_TEST_TEAM_REDUCTION_SCAN_HPP +#include <TestTeam.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, team_reduction_scan) { + TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(0); + TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(0); + TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10); + TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10); +// FIXME_HIP +#ifdef KOKKOS_ENABLE_HIP + if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value) +#endif + { + TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10000); + TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10000); + } +} + +TEST(TEST_CATEGORY, team_long_reduce) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + // WORKAROUND OPENMPTARGET: Not implemented + if constexpr (!std::is_same<TEST_EXECSPACE, + Kokkos::Experimental::OpenMPTarget>::value) +#endif + { + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(0); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(0); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(3); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(3); + 
TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( + 100000); + TestReduceTeam<long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( + 100000); + } +} + +TEST(TEST_CATEGORY, team_double_reduce) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + // WORKAROUND OPENMPTARGET: Not implemented + if constexpr (!std::is_same<TEST_EXECSPACE, + Kokkos::Experimental::OpenMPTarget>::value) +#endif + { + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( + 0); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( + 0); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( + 3); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( + 3); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( + 100000); + TestReduceTeam<double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( + 100000); + } +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestTeamScan.hpp b/packages/kokkos/core/unit_test/TestTeamScan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4693bae8c1e2f4f39603a46d36d1109c6effde22 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeamScan.hpp @@ -0,0 +1,182 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Stacktrace.hpp> +#include <cstdio> +#include <cstdint> +#include <sstream> +#include <type_traits> + +#if defined(__clang__) +#define is_clang true +#else +#define is_clang false +#endif + +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) +// for avoid pre-processor block +namespace Kokkos { +namespace Experimental { +class OpenMPTarget; +} +} // namespace Kokkos +#endif + +#if !defined(KOKKOS_ENABLE_CUDA) +// for avoid pre-processor block +namespace Kokkos { +class Cuda; +} // namespace Kokkos +#endif + +namespace Test { + +template <class ExecutionSpace, class DataType> +struct TestTeamScan { + using execution_space = ExecutionSpace; + using value_type = DataType; + using policy_type = Kokkos::TeamPolicy<execution_space>; + using member_type = typename policy_type::member_type; + using view_type = Kokkos::View<value_type**, execution_space>; + + view_type a_d; + view_type a_r; + int32_t M = 0; + int32_t N = 0; + + KOKKOS_FUNCTION + void operator()(const member_type& team) const { + auto leagueRank = team.league_rank(); + + auto beg = 0; + auto end = N; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, beg, end), + [&](const int i) { a_d(leagueRank, i) = leagueRank * N + i; }); + + Kokkos::parallel_scan(Kokkos::TeamThreadRange(team, beg, end), + [&](int i, DataType& val, const bool final) { + val += a_d(leagueRank, i); + if (final) a_r(leagueRank, i) = val; + }); + } + + auto operator()(int32_t _M, int32_t _N) { + std::cout << "Launching " << Kokkos::Impl::demangle(typeid(*this).name()) + << " with " + << "M=" << _M << " and N=" << _N << "..." 
<< std::endl; + M = _M; + N = _N; + a_d = view_type("a_d", M, N); + a_r = view_type("a_r", M, N); + // Set team size explicitly to + // a) check whether this works in CPU backends with team_size > 1 and + // b) make sure we have a power of 2 and for GPU backends due to limitation + // of the scan algorithm implemented in CUDA etc. + int team_size = 1; + if (ExecutionSpace().concurrency() > 2) { + if (ExecutionSpace().concurrency() > 10000) + team_size = 128; + else + team_size = 3; + } + Kokkos::parallel_for(policy_type(M, team_size), *this); + + auto a_i = Kokkos::create_mirror_view(a_d); + auto a_o = Kokkos::create_mirror_view(a_r); + Kokkos::deep_copy(a_i, a_d); + Kokkos::deep_copy(a_o, a_r); + + for (int32_t i = 0; i < M; ++i) { + value_type _scan_real = 0; + value_type _scan_calc = 0; + value_type _epsilon = std::numeric_limits<value_type>::epsilon(); + // each fp addition is subject to small loses in precision and these + // compound as loop so we set the base error to be the machine epsilon and + // then add in another epsilon each iteration. For example, with CUDA + // backend + 32-bit float + large N values (e.g. 1,000) + high + // thread-counts (e.g. 
1024), this test will fail w/o epsilon + // accommodation + for (int32_t j = 0; j < N; ++j) { + _scan_real += a_i(i, j); + _scan_calc = a_o(i, j); + auto _get_mesg = [=]() { + std::stringstream ss, idx; + idx << "(" << i << ", " << j << ") = "; + ss << "a_d" << idx.str() << a_i(i, j); + ss << ", a_r" << idx.str() << a_o(i, j); + return ss.str(); + }; + if (std::is_integral<value_type>::value) { + ASSERT_EQ(_scan_real, _scan_calc) << _get_mesg(); + } else { + _epsilon += std::numeric_limits<value_type>::epsilon(); + ASSERT_NEAR(_scan_real, _scan_calc, _epsilon) << _get_mesg(); + } + } + } + } +}; + +TEST(TEST_CATEGORY, team_scan) { + TestTeamScan<TEST_EXECSPACE, int32_t>{}(0, 0); + TestTeamScan<TEST_EXECSPACE, int32_t>{}(0, 1); + TestTeamScan<TEST_EXECSPACE, int32_t>{}(1, 0); + TestTeamScan<TEST_EXECSPACE, uint32_t>{}(99, 32); + TestTeamScan<TEST_EXECSPACE, uint32_t>{}(139, 64); + TestTeamScan<TEST_EXECSPACE, uint32_t>{}(163, 128); + TestTeamScan<TEST_EXECSPACE, int64_t>{}(433, 256); + TestTeamScan<TEST_EXECSPACE, uint64_t>{}(976, 512); + TestTeamScan<TEST_EXECSPACE, uint64_t>{}(1234, 1024); + TestTeamScan<TEST_EXECSPACE, float>{}(2596, 34); + TestTeamScan<TEST_EXECSPACE, double>{}(2596, 59); + TestTeamScan<TEST_EXECSPACE, float>{}(2596, 65); + TestTeamScan<TEST_EXECSPACE, double>{}(2596, 371); + TestTeamScan<TEST_EXECSPACE, int64_t>{}(2596, 987); + TestTeamScan<TEST_EXECSPACE, double>{}(2596, 1311); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestTeamScratch.hpp b/packages/kokkos/core/unit_test/TestTeamScratch.hpp new file mode 100644 index 0000000000000000000000000000000000000000..75ca3587629ded5f5cc2dd2f3b8ef6623e8a07f7 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeamScratch.hpp @@ -0,0 +1,100 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_TEAM_SCRATCH_HPP +#define KOKKOS_TEST_TEAM_SCRATCH_HPP +#include <TestTeam.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, team_shared_request) { + TestSharedTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(); + TestSharedTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(); +} + +TEST(TEST_CATEGORY, team_scratch_request) { + // FIXME_HIP the parallel_reduce in this test requires a team size larger than + // 256. Fixed in ROCm 3.9 +#if defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 309) + if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value) +#endif + { + TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(); + TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(); + } +} + +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +TEST(TEST_CATEGORY, team_lambda_shared_request) { + TestLambdaSharedTeam<Kokkos::HostSpace, TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Static> >(); + TestLambdaSharedTeam<Kokkos::HostSpace, TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Dynamic> >(); +} +TEST(TEST_CATEGORY, scratch_align) { TestScratchAlignment<TEST_EXECSPACE>(); } +#endif + +TEST(TEST_CATEGORY, shmem_size) { TestShmemSize<TEST_EXECSPACE>(); } + +TEST(TEST_CATEGORY, multi_level_scratch) { + // FIXME_HIP the parallel_for and the parallel_reduce in this test requires a + // team size larger than 256. Fixed In ROCm 3.9 + // FIXME_OPENMPTARGET This unit test needs ~350KB of scratch memory for L0 and + // L1 combined per team. Currently OpenMPTarget cannot allocate this high + // amount of scratch memory. 
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET) +#if defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 309) + if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value) +#endif + { + TestMultiLevelScratchTeam<TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Static> >(); + TestMultiLevelScratchTeam<TEST_EXECSPACE, + Kokkos::Schedule<Kokkos::Dynamic> >(); + } +#endif +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp new file mode 100644 index 0000000000000000000000000000000000000000..992e80397bacb9b5dc9a0746ca2543a1792cce22 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp @@ -0,0 +1,240 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace { +template <class T, int N> +class MyArray { + public: + T values[N]; + KOKKOS_INLINE_FUNCTION + void operator+=(const MyArray& src) { + for (int i = 0; i < N; i++) values[i] += src.values[i]; + } + KOKKOS_INLINE_FUNCTION + void operator=(const MyArray& src) { + for (int i = 0; i < N; i++) values[i] = src.values[i]; + } + KOKKOS_INLINE_FUNCTION + void operator+=(const volatile MyArray& src) volatile { + for (int i = 0; i < N; i++) values[i] += src.values[i]; + } + KOKKOS_INLINE_FUNCTION + void operator=(const volatile MyArray& src) volatile { + for (int i = 0; i < N; i++) values[i] = src.values[i]; + } +}; + +template <class T, int N, class PolicyType, int S> +struct FunctorFor { + double static_array[S]; + KOKKOS_INLINE_FUNCTION + void operator()(const typename PolicyType::member_type& /*team*/) const {} +}; +template <class T, int N, class PolicyType, int S> +struct FunctorReduce { + double static_array[S]; + KOKKOS_INLINE_FUNCTION + void operator()(const typename PolicyType::member_type& /*team*/, + MyArray<T, N>& lval) const { + for (int j = 0; j < N; j++) lval.values[j] += 1 + lval.values[0]; + } +}; +} // namespace + +using 
policy_type = Kokkos::TeamPolicy<TEST_EXECSPACE>; +using policy_type_128_8 = + Kokkos::TeamPolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 8> >; +using policy_type_1024_2 = + Kokkos::TeamPolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<1024, 2> >; + +template <class T, int N, class PolicyType, int S> +void test_team_policy_max_recommended_static_size(int scratch_size) { + PolicyType p = PolicyType(10000, Kokkos::AUTO, 4) + .set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + int team_size_max_for = p.team_size_max(FunctorFor<T, N, PolicyType, S>(), + Kokkos::ParallelForTag()); + int team_size_rec_for = p.team_size_recommended( + FunctorFor<T, N, PolicyType, S>(), Kokkos::ParallelForTag()); + int team_size_max_reduce = p.team_size_max( + FunctorReduce<T, N, PolicyType, S>(), Kokkos::ParallelReduceTag()); + int team_size_rec_reduce = p.team_size_recommended( + FunctorReduce<T, N, PolicyType, S>(), Kokkos::ParallelReduceTag()); + + ASSERT_TRUE(team_size_max_for >= team_size_rec_for); + ASSERT_TRUE(team_size_max_reduce >= team_size_rec_reduce); + ASSERT_TRUE(team_size_max_for >= team_size_max_reduce); + + Kokkos::parallel_for(PolicyType(10000, team_size_max_for, 4) + .set_scratch_size(0, Kokkos::PerTeam(scratch_size)), + FunctorFor<T, N, PolicyType, S>()); + Kokkos::parallel_for(PolicyType(10000, team_size_rec_for, 4) + .set_scratch_size(0, Kokkos::PerTeam(scratch_size)), + FunctorFor<T, N, PolicyType, S>()); + MyArray<T, N> val; + double n_leagues = 10000; + // FIXME_HIP +#ifdef KOKKOS_ENABLE_HIP + if (N == 2) + n_leagues = 1000; + else + n_leagues = 500; +#endif + + Kokkos::parallel_reduce( + PolicyType(n_leagues, team_size_max_reduce, 4) + .set_scratch_size(0, Kokkos::PerTeam(scratch_size)), + FunctorReduce<T, N, PolicyType, S>(), val); + Kokkos::parallel_reduce( + PolicyType(n_leagues, team_size_rec_reduce, 4) + .set_scratch_size(0, Kokkos::PerTeam(scratch_size)), + FunctorReduce<T, N, PolicyType, S>(), val); + Kokkos::fence(); +} + +template <class T, int N, class 
PolicyType> +void test_team_policy_max_recommended(int scratch_size) { + test_team_policy_max_recommended_static_size<T, N, PolicyType, 1>( + scratch_size); + // FIXME_SYCL prevent running out of total kernel argument size limit +#ifdef KOKKOS_ENABLE_SYCL + test_team_policy_max_recommended_static_size<T, N, PolicyType, 100>( + scratch_size); +#else + test_team_policy_max_recommended_static_size<T, N, PolicyType, 1000>( + scratch_size); +#endif +} + +TEST(TEST_CATEGORY, team_policy_max_recommended) { + int max_scratch_size = policy_type::scratch_size_max(0); + test_team_policy_max_recommended<double, 2, policy_type>(0); + test_team_policy_max_recommended<double, 2, policy_type>(max_scratch_size / + 3); + test_team_policy_max_recommended<double, 2, policy_type>(max_scratch_size); + test_team_policy_max_recommended<double, 2, policy_type_128_8>(0); + test_team_policy_max_recommended<double, 2, policy_type_128_8>( + max_scratch_size / 3 / 8); + test_team_policy_max_recommended<double, 2, policy_type_128_8>( + max_scratch_size / 8); + test_team_policy_max_recommended<double, 2, policy_type_1024_2>(0); + test_team_policy_max_recommended<double, 2, policy_type_1024_2>( + max_scratch_size / 3 / 2); + test_team_policy_max_recommended<double, 2, policy_type_1024_2>( + max_scratch_size / 2); + + test_team_policy_max_recommended<double, 16, policy_type>(0); + test_team_policy_max_recommended<double, 16, policy_type>(max_scratch_size / + 3); + test_team_policy_max_recommended<double, 16, policy_type>(max_scratch_size); + test_team_policy_max_recommended<double, 16, policy_type_128_8>(0); + test_team_policy_max_recommended<double, 16, policy_type_128_8>( + max_scratch_size / 3 / 8); + test_team_policy_max_recommended<double, 16, policy_type_128_8>( + max_scratch_size / 8); + test_team_policy_max_recommended<double, 16, policy_type_1024_2>(0); + test_team_policy_max_recommended<double, 16, policy_type_1024_2>( + max_scratch_size / 3 / 2); + 
test_team_policy_max_recommended<double, 16, policy_type_1024_2>( + max_scratch_size / 2); +} + +template <typename TeamHandleType, typename ReducerValueType> +struct PrintFunctor1 { + KOKKOS_INLINE_FUNCTION void operator()(const TeamHandleType& team, + ReducerValueType&) const { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Test %i %i\n", int(team.league_rank()), + int(team.team_rank())); + } +}; + +template <typename TeamHandleType, typename ReducerValueType> +struct PrintFunctor2 { + KOKKOS_INLINE_FUNCTION void operator()(const TeamHandleType& team, + ReducerValueType& teamVal) const { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Test %i %i\n", int(team.league_rank()), + int(team.team_rank())); + teamVal += 1; + } +}; + +TEST(TEST_CATEGORY, team_policy_max_scalar_without_plus_equal_k) { + using ExecSpace = TEST_EXECSPACE; + using ReducerType = Kokkos::MinMax<double, Kokkos::HostSpace>; + using ReducerValueType = typename ReducerType::value_type; + using DynamicScheduleType = Kokkos::Schedule<Kokkos::Dynamic>; + using TeamPolicyType = Kokkos::TeamPolicy<ExecSpace, DynamicScheduleType>; + using TeamHandleType = typename TeamPolicyType::member_type; + + static constexpr int num_teams = 17; + ReducerValueType val; + ReducerType reducer(val); + + TeamPolicyType p(num_teams, Kokkos::AUTO); + PrintFunctor1<TeamHandleType, ReducerValueType> f1; + const int max_team_size = + p.team_size_max(f1, reducer, Kokkos::ParallelReduceTag()); + + const int recommended_team_size = + p.team_size_recommended(f1, reducer, Kokkos::ParallelReduceTag()); + + printf("Max TeamSize: %i Recommended TeamSize: %i\n", max_team_size, + recommended_team_size); + + Kokkos::parallel_reduce(p, f1, reducer); + double sum; + Kokkos::parallel_reduce(TeamPolicyType(num_teams, Kokkos::AUTO), + PrintFunctor2<TeamHandleType, double>{}, sum); + printf("Sum: %lf\n", sum); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestTeamVector.hpp b/packages/kokkos/core/unit_test/TestTeamVector.hpp new file mode 100644 
index 0000000000000000000000000000000000000000..ba11dc07a962989f2826a3d0def3649112c00da6 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeamVector.hpp @@ -0,0 +1,1047 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#include <impl/Kokkos_Timer.hpp> +#include <iostream> +#include <cstdlib> +#include <cstdint> +#include <cinttypes> +#include <TestNonTrivialScalarTypes.hpp> + +namespace TestTeamVector { + +template <typename Scalar, class ExecutionSpace> +struct functor_team_for { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_team_for(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_int = + Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int team_size) const { + return shared_int::shmem_size(team_size * 13); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + using size_type = typename shmem_space::size_type; + const size_type shmemSize = team.team_size() * 13; + shared_int values = shared_int(team.team_shmem(), shmemSize); + + if (values.data() == nullptr || + static_cast<size_type>(values.extent(0)) < shmemSize) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED to allocate shared memory of size %u\n", + static_cast<unsigned int>(shmemSize)); + } else { + // Initialize shared memory. + values(team.team_rank()) = 0; + + // Accumulate value into per thread shared memory. + // This is non blocking. + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 131), [&](int i) { + values(team.team_rank()) += + i - team.league_rank() + team.league_size() + team.team_size(); + }); + + // Wait for all memory to be written. + team.team_barrier(); + + // One thread per team executes the comparison. 
+ Kokkos::single(Kokkos::PerTeam(team), [&]() { + Scalar test = 0; + Scalar value = 0; + + for (int i = 0; i < 131; ++i) { + test += + i - team.league_rank() + team.league_size() + team.team_size(); + } + + for (int i = 0; i < team.team_size(); ++i) { + value += values(i); + } + + if (test != value) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED team_parallel_for %i %i %f %f\n", team.league_rank(), + team.team_rank(), static_cast<double>(test), + static_cast<double>(value)); + flag() = 1; + } + }); + } + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_team_reduce { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_team_reduce( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_scalar_t = + Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int team_size) const { + return shared_scalar_t::shmem_size(team_size * 13); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + Scalar value = Scalar(); + shared_scalar_t shared_value(team.team_scratch(0), 1); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 131), + [&](int i, Scalar &val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + value); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 131), + [&](int i, Scalar &val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + shared_value(0)); + + team.team_barrier(); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Scalar test = 0; + + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank() + team.league_size() + team.team_size(); + } + + if (test != value) { + if (team.league_rank() == 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED 
team_parallel_reduce %i %i %lf %lf %lu\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(value), + static_cast<unsigned long>(sizeof(Scalar))); + } + + flag() = 1; + } + if (test != shared_value(0)) { + if (team.league_rank() == 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED team_parallel_reduce with shared result %i %i %lf %lf " + "%lu\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(shared_value(0)), + static_cast<unsigned long>(sizeof(Scalar))); + } + + flag() = 1; + } + }); + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_team_reduce_reducer { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_team_reduce_reducer( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_scalar_t = + Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int team_size) const { + return shared_scalar_t::shmem_size(team_size * 13); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + Scalar value = 0; + shared_scalar_t shared_value(team.team_scratch(0), 1); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 131), + [&](int i, Scalar &val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + Kokkos::Sum<Scalar>(value)); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 131), + [&](int i, Scalar &val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + Kokkos::Sum<Scalar>(shared_value(0))); + + team.team_barrier(); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Scalar test = 0; + + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank() + team.league_size() + 
team.team_size(); + } + + if (test != value) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(value)); + + flag() = 1; + } + if (test != shared_value(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED team_vector_parallel_reduce_reducer shared value %i %i %lf " + "%lf\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(shared_value(0))); + + flag() = 1; + } + }); + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_team_vector_for { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_team_vector_for( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_int = + Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int team_size) const { + return shared_int::shmem_size(team_size * 13); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + using size_type = typename shared_int::size_type; + + const size_type shmemSize = team.team_size() * 13; + shared_int values = shared_int(team.team_shmem(), shmemSize); + + if (values.data() == nullptr || + static_cast<size_type>(values.extent(0)) < shmemSize) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED to allocate shared memory of size %u\n", + static_cast<unsigned int>(shmemSize)); + } else { + team.team_barrier(); + + Kokkos::single(Kokkos::PerThread(team), + [&]() { values(team.team_rank()) = 0; }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 131), [&](int i) { + Kokkos::single(Kokkos::PerThread(team), [&]() { + values(team.team_rank()) += + i - team.league_rank() + team.league_size() + team.team_size(); + 
}); + }); + + team.team_barrier(); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Scalar test = 0; + Scalar value = 0; + + for (int i = 0; i < 131; ++i) { + test += + i - team.league_rank() + team.league_size() + team.team_size(); + } + + for (int i = 0; i < team.team_size(); ++i) { + value += values(i); + } + + if (test != value) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED team_vector_parallel_for %i %i %f %f\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(value)); + + flag() = 1; + } + }); + } + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_team_vector_reduce { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + functor_team_vector_reduce( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_int = + Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int team_size) const { + return shared_int::shmem_size(team_size * 13); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + Scalar value = Scalar(); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 131), + [&](int i, Scalar &val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + value); + + team.team_barrier(); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Scalar test = 0; + + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank() + team.league_size() + team.team_size(); + } + + if (test != value) { + if (team.league_rank() == 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(value), + static_cast<unsigned long>(sizeof(Scalar))); + } + + 
flag() = 1; + } + }); + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_team_vector_reduce_reducer { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_team_vector_reduce_reducer( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_int = + Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int team_size) const { + return shared_int::shmem_size(team_size * 13); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + Scalar value = 0; + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 131), + [&](int i, Scalar &val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + Kokkos::Sum<Scalar>(value)); + + team.team_barrier(); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Scalar test = 0; + + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank() + team.league_size() + team.team_size(); + } + + if (test != value) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(value)); + + flag() = 1; + } + }); + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_vec_single { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + int nStart; + int nEnd; + + functor_vec_single( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_, + const int start_, const int end_) + : flag(flag_), nStart(start_), nEnd(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + // Warning: 
this test case intentionally violates permissible semantics. + // It is not valid to get references to members of the enclosing region + // inside a parallel_for and write to it. + Scalar value = 0; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nStart, nEnd), + [&](int i) { + value = i; // This write is violating Kokkos + // semantics for nested parallelism. + }); + + Kokkos::single( + Kokkos::PerThread(team), [&](Scalar &val) { val = 1; }, value); + + Scalar value2 = 0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, nStart, nEnd), + [&](int /*i*/, Scalar &val) { val += value; }, value2); + + if (value2 != (value * Scalar(nEnd - nStart))) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED vector_single broadcast %i %i %f %f\n", team.league_rank(), + team.team_rank(), (double)value2, (double)value); + + flag() = 1; + } + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_vec_for { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_vec_for(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_int = + Kokkos::View<Scalar *, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int team_size) const { + return shared_int::shmem_size(team_size * 13); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + shared_int values = shared_int(team.team_shmem(), team.team_size() * 13); + + if (values.data() == nullptr || + values.extent(0) < (unsigned)team.team_size() * 13) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED to allocate memory of size %i\n", + static_cast<int>(team.team_size() * 13)); + flag() = 1; + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 13), [&](int i) { + values(13 * team.team_rank() + i) = + i - team.team_rank() - 
team.league_rank() + team.league_size() + + team.team_size(); + }); + + Kokkos::single(Kokkos::PerThread(team), [&]() { + Scalar test = 0; + Scalar value = 0; + + for (int i = 0; i < 13; ++i) { + test += i - team.team_rank() - team.league_rank() + + team.league_size() + team.team_size(); + value += values(13 * team.team_rank() + i); + } + + if (test != value) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %f %f\n", + team.league_rank(), team.team_rank(), + static_cast<double>(test), + static_cast<double>(value)); + + flag() = 1; + } + }); + } + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_vec_red { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_vec_red(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + Scalar value = 0; + + // When no reducer is given the default is summation. 
+ Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, 13), + [&](int i, Scalar &val) { val += i; }, value); + + Kokkos::single(Kokkos::PerThread(team), [&]() { + Scalar test = 0; + + for (int i = 0; i < 13; i++) test += i; + + if (test != value) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_reduce %i %i %f %f\n", + team.league_rank(), team.team_rank(), + (double)test, (double)value); + + flag() = 1; + } + }); + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_vec_red_reducer { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_vec_red_reducer( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + // Must initialize to the identity value for the reduce operation + // for this test: + // ( identity, operation ) = ( 1 , *= ) + Scalar value = 1; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, 13), + [&](int i, Scalar &val) { val *= (i % 5 + 1); }, + Kokkos::Prod<Scalar>(value)); + + Kokkos::single(Kokkos::PerThread(team), [&]() { + Scalar test = 1; + + for (int i = 0; i < 13; i++) test *= (i % 5 + 1); + + if (test != value) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED vector_par_reduce_reducer %i %i %f %f\n", + team.league_rank(), team.team_rank(), (double)test, (double)value); + + flag() = 1; + } + }); + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_vec_scan { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + functor_vec_scan(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + 
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, 13), + [&](int i, Scalar &val, bool final) { + val += i; + + if (final) { + Scalar test = 0; + for (int k = 0; k <= i; k++) test += k; + + if (test != val) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED vector_par_scan %i %i %f %f\n", + team.league_rank(), team.team_rank(), + (double)test, (double)val); + + flag() = 1; + } + } + }); + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_reduce { + using value_type = double; + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + functor_reduce(Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team, double &sum) const { + sum += team.league_rank() * 100 + team.thread_rank(); + } +}; + +template <typename Scalar, class ExecutionSpace> +bool test_scalar(int nteams, int team_size, int test) { + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> d_flag("flag"); + typename Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace>::HostMirror + h_flag("h_flag"); + h_flag() = 0; + Kokkos::deep_copy(d_flag, h_flag); + + if (test == 0) { + Kokkos::parallel_for( + std::string("A"), + Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_vec_red<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 1) { + Kokkos::parallel_for( + Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_vec_red_reducer<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 2) { + Kokkos::parallel_for( + Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_vec_scan<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 3) { + Kokkos::parallel_for( + Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_vec_for<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 4) { + Kokkos::parallel_for( + "B", 
Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_vec_single<Scalar, ExecutionSpace>(d_flag, 0, 13)); + } else if (test == 5) { + Kokkos::parallel_for(Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size), + functor_team_for<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 6) { + Kokkos::parallel_for(Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size), + functor_team_reduce<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 7) { + Kokkos::parallel_for( + Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size), + functor_team_reduce_reducer<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 8) { + Kokkos::parallel_for( + Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_team_vector_for<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 9) { + Kokkos::parallel_for( + Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_team_vector_reduce<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 10) { + Kokkos::parallel_for( + Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_team_vector_reduce_reducer<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 11) { + Kokkos::parallel_for( + "B", Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_vec_single<Scalar, ExecutionSpace>(d_flag, 4, 13)); + } + + Kokkos::deep_copy(h_flag, d_flag); + + return (h_flag() == 0); +} + +template <class ExecutionSpace> +bool Test(int test) { + bool passed = true; + + int team_size = 33; + if (team_size > int(ExecutionSpace::concurrency())) + team_size = int(ExecutionSpace::concurrency()); + passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test); + passed = passed && + test_scalar<long long int, ExecutionSpace>(317, team_size, test); + passed = passed && test_scalar<float, ExecutionSpace>(317, team_size, test); + passed = passed && test_scalar<double, ExecutionSpace>(317, team_size, test); + passed = passed && + test_scalar<Test::my_complex, ExecutionSpace>(317, 
team_size, test); + passed = passed && test_scalar<Test::array_reduce<double, 1>, ExecutionSpace>( + 317, team_size, test); + passed = passed && test_scalar<Test::array_reduce<float, 1>, ExecutionSpace>( + 317, team_size, test); + passed = passed && test_scalar<Test::array_reduce<double, 3>, ExecutionSpace>( + 317, team_size, test); + + return passed; +} + +} // namespace TestTeamVector + +namespace Test { + +// Computes y^T*A*x +// ( modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar ) + +#if (!defined(KOKKOS_ENABLE_CUDA)) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) +template <typename ScalarType, class DeviceType> +class TestTripleNestedReduce { + public: + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + + TestTripleNestedReduce(const size_type &nrows, const size_type &ncols, + const size_type &team_size, + const size_type &vector_length) { + run_test(nrows, ncols, team_size, vector_length); + } + + void run_test(const size_type &nrows, const size_type &ncols, + size_type team_size, const size_type &vector_length) { + if (team_size > size_type(DeviceType::execution_space::concurrency())) + team_size = size_type(DeviceType::execution_space::concurrency()); + +#ifdef KOKKOS_ENABLE_HPX + team_size = 1; + if (!std::is_same<execution_space, Kokkos::Experimental::HPX>::value) { + team_size = 1; + } +#endif + + // using Layout = Kokkos::LayoutLeft; + using Layout = Kokkos::LayoutRight; + + using ViewVector = Kokkos::View<ScalarType *, DeviceType>; + using ViewMatrix = Kokkos::View<ScalarType **, Layout, DeviceType>; + + ViewVector y("y", nrows); + ViewVector x("x", ncols); + ViewMatrix A("A", nrows, ncols); + + using range_policy = Kokkos::RangePolicy<DeviceType>; + + // Initialize y vector. + Kokkos::parallel_for( + range_policy(0, nrows), KOKKOS_LAMBDA(const int i) { y(i) = 1; }); + + // Initialize x vector. 
+ Kokkos::parallel_for( + range_policy(0, ncols), KOKKOS_LAMBDA(const int i) { x(i) = 1; }); + Kokkos::fence(); + + using team_policy = Kokkos::TeamPolicy<DeviceType>; + using member_type = typename Kokkos::TeamPolicy<DeviceType>::member_type; + + // Initialize A matrix, note 2D indexing computation. + Kokkos::parallel_for( + team_policy(nrows, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type &teamMember) { + const int j = teamMember.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, ncols), + [&](const int i) { A(j, i) = 1; }); + }); + Kokkos::fence(); + + // Three level parallelism kernel to force caching of vector x. + ScalarType result = 0.0; + int chunk_size = 128; + Kokkos::parallel_reduce( + team_policy(nrows / chunk_size, team_size, vector_length), + KOKKOS_LAMBDA(const member_type &teamMember, double &update) { + const int row_start = teamMember.league_rank() * chunk_size; + const int row_end = row_start + chunk_size; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, row_start, row_end), + [&](const int i) { + ScalarType sum_i = 0.0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(teamMember, ncols), + [&](const int j, ScalarType &innerUpdate) { + innerUpdate += A(i, j) * x(j); + }, + sum_i); + Kokkos::single(Kokkos::PerThread(teamMember), + [&]() { update += y(i) * sum_i; }); + }); + }, + result); + Kokkos::fence(); + + const ScalarType solution = (ScalarType)nrows * (ScalarType)ncols; + if (int64_t(solution) != int64_t(result)) { + printf(" TestTripleNestedReduce failed solution(%" PRId64 + ") != result(%" PRId64 + ")," + " nrows(%" PRId32 ") ncols(%" PRId32 ") league_size(%" PRId32 + ") team_size(%" PRId32 ")\n", + int64_t(solution), int64_t(result), int32_t(nrows), int32_t(ncols), + int32_t(nrows / chunk_size), int32_t(team_size)); + } + + ASSERT_EQ(solution, result); + } +}; + +#else // #if ( ! 
defined( KOKKOS_ENABLE_CUDA ) ) || defined( + // KOKKOS_ENABLE_CUDA_LAMBDA ) + +template <typename ScalarType, class DeviceType> +class TestTripleNestedReduce { + public: + using execution_space = DeviceType; + using size_type = typename execution_space::size_type; + + TestTripleNestedReduce(const size_type &, const size_type, const size_type &, + const size_type) {} +}; + +#endif + +namespace VectorScanReducer { +enum class ScanType : bool { Inclusive, Exclusive }; + +template <typename ExecutionSpace, ScanType scan_type, int n, + int n_vector_range, class Reducer> +struct checkScan { + const int n_team_thread_range = 1000; + const int n_per_team = n_team_thread_range * n_vector_range; + + using size_type = typename ExecutionSpace::size_type; + using value_type = typename Reducer::value_type; + using view_type = Kokkos::View<value_type[n], ExecutionSpace>; + + view_type inputs = view_type{"inputs"}; + view_type outputs = view_type{"outputs"}; + + value_type result; + Reducer reducer = {result}; + + struct ThreadVectorFunctor { + KOKKOS_FUNCTION void operator()(const size_type j, value_type &update, + const bool final) const { + const size_type element = j + m_team_offset + m_thread_offset; + const auto tmp = m_inputs(element); + if (scan_type == ScanType::Inclusive) { + m_reducer.join(update, tmp); + if (final) { + m_outputs(element) = update; + } + } else { + if (final) { + m_outputs(element) = update; + } + m_reducer.join(update, tmp); + } + } + + const Reducer &m_reducer; + const size_type &m_team_offset; + const size_type &m_thread_offset; + const view_type &m_outputs; + const view_type &m_inputs; + }; + + struct TeamThreadRangeFunctor { + KOKKOS_FUNCTION void operator()(const size_type i) const { + const size_type thread_offset = i * n_vector_range; + Kokkos::parallel_scan( + Kokkos::ThreadVectorRange(m_team, n_vector_range), + ThreadVectorFunctor{m_reducer, m_team_offset, thread_offset, + m_outputs, m_inputs}, + m_reducer); + } + + const typename 
Kokkos::TeamPolicy<ExecutionSpace>::member_type &m_team; + const Reducer &m_reducer; + const size_type &m_team_offset; + const view_type &m_outputs; + const view_type &m_inputs; + }; + + KOKKOS_FUNCTION void operator()( + const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) + const { + const size_type iTeam = team.league_rank(); + const size_type iTeamOffset = iTeam * n_per_team; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, n_team_thread_range), + TeamThreadRangeFunctor{team, reducer, iTeamOffset, outputs, inputs}); + } + + KOKKOS_FUNCTION void operator()(size_type i) const { inputs(i) = i * 1. / n; } + + void run() { + const int n_teams = n / n_per_team; + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecutionSpace>(0, n), *this); + + // run ThreadVectorRange parallel_scan + Kokkos::TeamPolicy<ExecutionSpace> policy(n_teams, Kokkos::AUTO, + Kokkos::AUTO); + const std::string label = + (scan_type == ScanType::Inclusive ? std::string("inclusive") + : std::string("exclusive")) + + "Scan" + typeid(Reducer).name(); + Kokkos::parallel_for(label, policy, *this); + Kokkos::fence(); + + auto host_outputs = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, outputs); + auto host_inputs = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, inputs); + + Kokkos::View<value_type[n], Kokkos::HostSpace> expected("expected"); + { + value_type identity; + reducer.init(identity); + for (int i = 0; i < expected.extent_int(0); ++i) { + const int vector = i % n_vector_range; + const value_type accum = vector == 0 ? identity : expected(i - 1); + const value_type val = + scan_type == ScanType::Inclusive + ? host_inputs(i) + : (vector == 0 ? 
identity : host_inputs(i - 1)); + expected(i) = accum; + reducer.join(expected(i), val); + } + } + for (int i = 0; i < host_outputs.extent_int(0); ++i) + ASSERT_EQ(host_outputs(i), expected(i)); + } +}; +} // namespace VectorScanReducer + +#if !(defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ENABLE_HIP)) +TEST(TEST_CATEGORY, team_vector) { + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(0))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(1))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(2))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(3))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(4))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(5))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(6))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(7))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(8))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(9))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(10))); + ASSERT_TRUE((TestTeamVector::Test<TEST_EXECSPACE>(11))); +} +#endif + +#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) +TEST(TEST_CATEGORY, triple_nested_parallelism) { +// With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run +// with a team size of 32 on GPUs, 16 is the max possible (at least on a K80 +// GPU) See https://github.com/kokkos/kokkos/issues/1513 +#if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA) + if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value) { +#endif + TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 32); + TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 16); +#if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA) + } +#endif + TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 16); + TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 33); + TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 19); + TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 
2048, 7, 16); +} +#endif + +TEST(TEST_CATEGORY, parallel_scan_with_reducers) { + using T = double; + using namespace VectorScanReducer; + + static constexpr int n = 1000000; + static constexpr int n_vector_range = 100; + + checkScan<TEST_EXECSPACE, ScanType::Exclusive, n, n_vector_range, + Kokkos::Prod<T, TEST_EXECSPACE>>() + .run(); + checkScan<TEST_EXECSPACE, ScanType::Inclusive, n, n_vector_range, + Kokkos::Prod<T, TEST_EXECSPACE>>() + .run(); + + checkScan<TEST_EXECSPACE, ScanType::Exclusive, n, n_vector_range, + Kokkos::Max<T, TEST_EXECSPACE>>() + .run(); + checkScan<TEST_EXECSPACE, ScanType::Inclusive, n, n_vector_range, + Kokkos::Max<T, TEST_EXECSPACE>>() + .run(); + + checkScan<TEST_EXECSPACE, ScanType::Exclusive, n, n_vector_range, + Kokkos::Min<T, TEST_EXECSPACE>>() + .run(); + checkScan<TEST_EXECSPACE, ScanType::Inclusive, n, n_vector_range, + Kokkos::Min<T, TEST_EXECSPACE>>() + .run(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7342ebad8433526719b52058ff6d6b75e41a107a --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp @@ -0,0 +1,527 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#include <impl/Kokkos_Timer.hpp> +#include <iostream> +#include <cstdlib> +#include <cstdint> +#include <cinttypes> + +namespace TestTeamVectorRange { + +struct my_complex { + double re, im; + int dummy; + + KOKKOS_INLINE_FUNCTION + my_complex() { + re = 0.0; + im = 0.0; + dummy = 0; + } + + KOKKOS_INLINE_FUNCTION + my_complex(const my_complex& src) { + re = src.re; + im = src.im; + dummy = src.dummy; + } + + KOKKOS_INLINE_FUNCTION + my_complex& operator=(const my_complex& src) { + re = src.re; + im = src.im; + dummy = src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + my_complex& operator=(const volatile my_complex& src) { + re = src.re; + im = src.im; + dummy = src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + volatile my_complex& operator=(const my_complex& src) volatile { + re = src.re; + im = src.im; + dummy = src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + volatile my_complex& operator=(const volatile my_complex& src) volatile { + re = src.re; + im = src.im; + dummy = src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + my_complex(const volatile my_complex& src) { + re = src.re; + im = src.im; + dummy = src.dummy; + } + + KOKKOS_INLINE_FUNCTION + my_complex(const double& val) { + re = val; + im = 0.0; + dummy = 0; + } + + KOKKOS_INLINE_FUNCTION + my_complex& operator+=(const my_complex& src) { + re += src.re; + im += src.im; + dummy += src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator+=(const volatile my_complex& src) volatile { + re += src.re; + im += src.im; + dummy += src.dummy; + } + + KOKKOS_INLINE_FUNCTION + my_complex operator+(const my_complex& src) { + my_complex tmp = *this; + tmp.re += src.re; + tmp.im += src.im; + tmp.dummy += src.dummy; + return tmp; + } + + KOKKOS_INLINE_FUNCTION + my_complex operator+(const volatile my_complex& 
src) volatile { + my_complex tmp = *this; + tmp.re += src.re; + tmp.im += src.im; + tmp.dummy += src.dummy; + return tmp; + } + + KOKKOS_INLINE_FUNCTION + my_complex& operator*=(const my_complex& src) { + double re_tmp = re * src.re - im * src.im; + double im_tmp = re * src.im + im * src.re; + re = re_tmp; + im = im_tmp; + dummy *= src.dummy; + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator*=(const volatile my_complex& src) volatile { + double re_tmp = re * src.re - im * src.im; + double im_tmp = re * src.im + im * src.re; + re = re_tmp; + im = im_tmp; + dummy *= src.dummy; + } + + KOKKOS_INLINE_FUNCTION + bool operator==(const my_complex& src) const { + return (re == src.re) && (im == src.im) && (dummy == src.dummy); + } + + KOKKOS_INLINE_FUNCTION + bool operator!=(const my_complex& src) const { + return (re != src.re) || (im != src.im) || (dummy != src.dummy); + } + + KOKKOS_INLINE_FUNCTION + bool operator!=(const double& val) const { + return (re != val) || (im != 0) || (dummy != 0); + } + + KOKKOS_INLINE_FUNCTION + my_complex& operator=(const int& val) { + re = val; + im = 0.0; + dummy = 0; + return *this; + } + + KOKKOS_INLINE_FUNCTION + my_complex& operator=(const double& val) { + re = val; + im = 0.0; + dummy = 0; + return *this; + } + + KOKKOS_INLINE_FUNCTION + operator double() { return re; } +}; +} // namespace TestTeamVectorRange + +namespace Kokkos { +template <> +struct reduction_identity<TestTeamVectorRange::my_complex> { + using t_red_ident = reduction_identity<double>; + KOKKOS_FORCEINLINE_FUNCTION static TestTeamVectorRange::my_complex sum() { + return TestTeamVectorRange::my_complex(t_red_ident::sum()); + } + KOKKOS_FORCEINLINE_FUNCTION static TestTeamVectorRange::my_complex prod() { + return TestTeamVectorRange::my_complex(t_red_ident::prod()); + } +}; +} // namespace Kokkos + +namespace TestTeamVectorRange { + +template <typename Scalar, class ExecutionSpace> +struct functor_teamvector_for { + using policy_type = 
Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_teamvector_for( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_int = + Kokkos::View<Scalar*, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int /*team_size*/) const { + return shared_int::shmem_size(131); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + using size_type = typename shmem_space::size_type; + const size_type shmemSize = 131; + shared_int values = shared_int(team.team_shmem(), shmemSize); + + if (values.data() == nullptr || values.extent(0) < shmemSize) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED to allocate shared memory of size %u\n", + static_cast<unsigned int>(shmemSize)); + } else { + // Initialize shared memory. + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 131), + [&](int i) { values(i) = 0; }); + // Wait for all memory to be written. + team.team_barrier(); + + // Accumulate value into per thread shared memory. + // This is non blocking. + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 131), [&](int i) { + values(i) += + i - team.league_rank() + team.league_size() + team.team_size(); + }); + + // Wait for all memory to be written. + team.team_barrier(); + + // One thread per team executes the comparison. 
+ Kokkos::single(Kokkos::PerTeam(team), [&]() { + Scalar test = 0; + Scalar value = 0; + + for (int i = 0; i < 131; ++i) { + test += + i - team.league_rank() + team.league_size() + team.team_size(); + } + + for (int i = 0; i < 131; ++i) { + value += values(i); + } + + if (test != value) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED teamvector_parallel_for %i %i %f %f\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(value)); + flag() = 1; + } + }); + } + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_teamvector_reduce { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_teamvector_reduce( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_scalar_t = + Kokkos::View<Scalar*, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int team_size) const { + return shared_scalar_t::shmem_size(team_size * 13); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + Scalar value = Scalar(); + shared_scalar_t shared_value(team.team_scratch(0), 1); + + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 131), + [&](int i, Scalar& val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + shared_value(0)); + + team.team_barrier(); + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 131), + [&](int i, Scalar& val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + value); + + // Kokkos::parallel_reduce( Kokkos::TeamVectorRange( team, 131 ), [&] ( + // int i, Scalar & val ) + // { + // val += i - team.league_rank() + team.league_size() + + // team.team_size(); + // }, shared_value(0) ); + + team.team_barrier(); + + 
Kokkos::single(Kokkos::PerTeam(team), [&]() { + Scalar test = 0; + + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank() + team.league_size() + team.team_size(); + } + + if (test != value) { + if (team.league_rank() == 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED teamvector_parallel_reduce %i %i %lf %lf %lu\n", + (int)team.league_rank(), (int)team.team_rank(), + static_cast<double>(test), static_cast<double>(value), + static_cast<unsigned long>(sizeof(Scalar))); + } + + flag() = 1; + } + if (test != shared_value(0)) { + if (team.league_rank() == 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED teamvector_parallel_reduce with shared result %i %i %lf " + "%lf %lu\n", + static_cast<int>(team.league_rank()), + static_cast<int>(team.team_rank()), static_cast<double>(test), + static_cast<double>(shared_value(0)), + static_cast<unsigned long>(sizeof(Scalar))); + } + + flag() = 1; + } + }); + } +}; + +template <typename Scalar, class ExecutionSpace> +struct functor_teamvector_reduce_reducer { + using policy_type = Kokkos::TeamPolicy<ExecutionSpace>; + using execution_space = ExecutionSpace; + + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag; + + functor_teamvector_reduce_reducer( + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> flag_) + : flag(flag_) {} + + using shmem_space = typename ExecutionSpace::scratch_memory_space; + using shared_scalar_t = + Kokkos::View<Scalar*, shmem_space, Kokkos::MemoryUnmanaged>; + unsigned team_shmem_size(int team_size) const { + return shared_scalar_t::shmem_size(team_size * 13); + } + + KOKKOS_INLINE_FUNCTION + void operator()(typename policy_type::member_type team) const { + Scalar value = 0; + shared_scalar_t shared_value(team.team_scratch(0), 1); + + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 131), + [&](int i, Scalar& val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + Kokkos::Sum<Scalar>(value)); + + Kokkos::parallel_reduce( + 
Kokkos::TeamVectorRange(team, 131), + [&](int i, Scalar& val) { + val += i - team.league_rank() + team.league_size() + team.team_size(); + }, + Kokkos::Sum<Scalar>(shared_value(0))); + + team.team_barrier(); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { + Scalar test = 0; + + for (int i = 0; i < 131; ++i) { + test += i - team.league_rank() + team.league_size() + team.team_size(); + } + + if (test != value) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED teamvector_parallel_reduce_reducer %i %i %lf %lf\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(value)); + + flag() = 1; + } + if (test != shared_value(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "FAILED teamvector_parallel_reduce_reducer shared value %i %i %lf " + "%lf\n", + team.league_rank(), team.team_rank(), static_cast<double>(test), + static_cast<double>(shared_value(0))); + + flag() = 1; + } + }); + } +}; + +template <typename Scalar, class ExecutionSpace> +bool test_scalar(int nteams, int team_size, int test) { + Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace> d_flag("flag"); + typename Kokkos::View<int, Kokkos::LayoutLeft, ExecutionSpace>::HostMirror + h_flag("h_flag"); + h_flag() = 0; + Kokkos::deep_copy(d_flag, h_flag); + + Kokkos::TeamPolicy<ExecutionSpace> policy(nteams, team_size, 8); + + // FIXME_OPENMPTARGET - Need to allocate scratch space via set_scratch_space + // for the OPENMPTARGET backend. +#ifdef KOKKOS_ENABLE_OPENMPTARGET + using scratch_t = Kokkos::View<Scalar*, ExecutionSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >; + + int scratch_size = 0; + if (test == 0) { + scratch_size = scratch_t::shmem_size(131); + } else { + // FIXME_OPENMPTARGET - Currently allocating more than one team for nested + // reduction leads to runtime errors of illegal memory access, caused mostly + // due to the OpenMP memory allocation constraints. 
+ policy = Kokkos::TeamPolicy<ExecutionSpace>(1, team_size, 8); + scratch_size = scratch_t::shmem_size(1); + } + + policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); +#endif + + if (test == 0) { + Kokkos::parallel_for( + "Test::TeamVectorFor", policy, + functor_teamvector_for<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 1) { + Kokkos::parallel_for( + "Test::TeamVectorReduce", policy, + functor_teamvector_reduce<Scalar, ExecutionSpace>(d_flag)); + } else if (test == 2) { + Kokkos::parallel_for( + "Test::TeamVectorReduceReducer", + Kokkos::TeamPolicy<ExecutionSpace>(nteams, team_size, 8), + functor_teamvector_reduce_reducer<Scalar, ExecutionSpace>(d_flag)); + } + + Kokkos::deep_copy(h_flag, d_flag); + + return (h_flag() == 0); +} + +template <class ExecutionSpace> +bool Test(int test) { + bool passed = true; + + int team_size = 33; + if (team_size > int(ExecutionSpace::concurrency())) + team_size = int(ExecutionSpace::concurrency()); + passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test); + passed = passed && + test_scalar<long long int, ExecutionSpace>(317, team_size, test); + passed = passed && test_scalar<float, ExecutionSpace>(317, team_size, test); + passed = passed && test_scalar<double, ExecutionSpace>(317, team_size, test); + // FIXME_OPENMPTARGET - Use of custom reducers currently results in runtime + // memory errors. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) + passed = + passed && test_scalar<my_complex, ExecutionSpace>(317, team_size, test); +#endif + + return passed; +} + +} // namespace TestTeamVectorRange + +namespace Test { + +TEST(TEST_CATEGORY, team_teamvector_range) { + ASSERT_TRUE((TestTeamVectorRange::Test<TEST_EXECSPACE>(0))); + ASSERT_TRUE((TestTeamVectorRange::Test<TEST_EXECSPACE>(1))); + // FIXME_OPENMPTARGET - Use of kokkos reducers currently results in runtime + // memory errors. 
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET) + ASSERT_TRUE((TestTeamVectorRange::Test<TEST_EXECSPACE>(2))); +#endif +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a0bc7c4304a040a10bc182e5d23d7c9ba08c4110 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp @@ -0,0 +1,215 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +#define KOKKOS_PRAGMA_UNROLL(a) + +namespace { + +template <class Scalar, class ExecutionSpace> +struct SumPlain { + using execution_space = ExecutionSpace; + using type = typename Kokkos::View<Scalar*, execution_space>; + + type view; + + SumPlain(type view_) : view(view_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int /*i*/, Scalar& val) { val += Scalar(); } +}; + +template <class Scalar, class ExecutionSpace> +struct SumInitJoinFinalValueType { + using execution_space = ExecutionSpace; + using type = typename Kokkos::View<Scalar*, execution_space>; + using value_type = Scalar; + + type view; + + SumInitJoinFinalValueType(type view_) : view(view_) {} + + KOKKOS_INLINE_FUNCTION + void init(value_type& val) const { val = value_type(); } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& val, volatile value_type& src) const { + val += src; + } + + KOKKOS_INLINE_FUNCTION + void operator()(int /*i*/, value_type& val) const { val += value_type(); } +}; + +template <class Scalar, class ExecutionSpace> +struct SumInitJoinFinalValueType2 { + using execution_space = ExecutionSpace; + using type = typename Kokkos::View<Scalar*, execution_space>; + using value_type = Scalar; + + type view; + + SumInitJoinFinalValueType2(type view_) : view(view_) 
{} + + KOKKOS_INLINE_FUNCTION + void init(volatile value_type& val) const { val = value_type(); } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& val, const volatile value_type& src) const { + val += src; + } + + KOKKOS_INLINE_FUNCTION + void operator()(int /*i*/, value_type& val) const { val += value_type(); } +}; + +template <class Scalar, class ExecutionSpace> +struct SumInitJoinFinalValueTypeArray { + using execution_space = ExecutionSpace; + using type = typename Kokkos::View<Scalar*, execution_space>; + using value_type = Scalar[]; + + type view; + int n; + + SumInitJoinFinalValueTypeArray(type view_, int n_) : view(view_), n(n_) {} + + KOKKOS_INLINE_FUNCTION + void init(value_type val) const { + for (int k = 0; k < n; k++) { + val[k] = 0; + } + } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type val, const volatile value_type src) const { + for (int k = 0; k < n; k++) { + val[k] += src[k]; + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i, value_type val) const { + for (int k = 0; k < n; k++) { + val[k] += k * i; + } + } +}; + +template <class Scalar, class ExecutionSpace> +struct SumWrongInitJoinFinalValueType { + using execution_space = ExecutionSpace; + using type = typename Kokkos::View<Scalar*, execution_space>; + using value_type = Scalar; + + type view; + + SumWrongInitJoinFinalValueType(type view_) : view(view_) {} + + KOKKOS_INLINE_FUNCTION + void init(double& val) const { val = double(); } + + KOKKOS_INLINE_FUNCTION + void join(volatile value_type& val, const value_type& src) const { + val += src; + } + + KOKKOS_INLINE_FUNCTION + void operator()(int /*i*/, value_type& val) const { val += value_type(); } +}; + +template <class Scalar, class ExecutionSpace> +void TestTemplateMetaFunctions() { + using type = typename Kokkos::View<Scalar*, ExecutionSpace>; + type a("A", 100); + /* + int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit< SumPlain<Scalar, + ExecutionSpace>, Scalar & >::value; ASSERT_EQ( 
sum_plain_has_init_arg, 0 ); + int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit< + SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value; + ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg, 1 ); + int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit< + SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value; + ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg2, 1 ); + int sum_wronginitjoinfinalvaluetype_has_init_arg = + Kokkos::Impl::FunctorHasInit< SumWrongInitJoinFinalValueType<Scalar, + ExecutionSpace>, Scalar >::value; ASSERT_EQ( + sum_wronginitjoinfinalvaluetype_has_init_arg, 0 ); + + //int sum_initjoinfinalvaluetypearray_has_init_arg = + Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueTypeArray<Scalar, + ExecutionSpace>, Scalar[] >::value; + //ASSERT_EQ( sum_initjoinfinalvaluetypearray_has_init_arg, 1 ); + + //printf( "Values Init: %i %i %i\n", sum_plain_has_init_arg, + sum_initjoinfinalvaluetype_has_init_arg, + sum_wronginitjoinfinalvaluetype_has_init_arg ); + + int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumPlain<Scalar, + ExecutionSpace>, Scalar >::value; ASSERT_EQ( sum_plain_has_join_arg, 0 ); + int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin< + SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value; + ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg, 1 ); + int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin< + SumInitJoinFinalValueType2<Scalar, ExecutionSpace>, Scalar >::value; + ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg2, 1 ); + int sum_wronginitjoinfinalvaluetype_has_join_arg = + Kokkos::Impl::FunctorHasJoin< SumWrongInitJoinFinalValueType<Scalar, + ExecutionSpace>, Scalar >::value; ASSERT_EQ( + sum_wronginitjoinfinalvaluetype_has_join_arg, 0 ); + + //printf( "Values Join: %i %i %i\n", sum_plain_has_join_arg, + sum_initjoinfinalvaluetype_has_join_arg, + 
sum_wronginitjoinfinalvaluetype_has_join_arg ); + */ +} + +} // namespace + +namespace Test { +TEST(TEST_CATEGORY, template_meta_functions) { + TestTemplateMetaFunctions<int, TEST_EXECSPACE>(); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestUniqueToken.hpp b/packages/kokkos/core/unit_test/TestUniqueToken.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4ba48bf73f069c6097a079ce1bcde5fd9452155c --- /dev/null +++ b/packages/kokkos/core/unit_test/TestUniqueToken.hpp @@ -0,0 +1,289 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class Space, Kokkos::Experimental::UniqueTokenScope Scope> +class TestUniqueToken { + public: + using execution_space = typename Space::execution_space; + using view_type = Kokkos::View<int*, execution_space>; + + Kokkos::Experimental::UniqueToken<execution_space, Scope> tokens; + + view_type verify; + view_type counts; + view_type errors; + + struct count_test_start_tag {}; + struct count_test_check_tag {}; + + KOKKOS_INLINE_FUNCTION + void operator()(long) const { + Kokkos::Experimental::AcquireUniqueToken<execution_space, Scope> token_val( + tokens); + const int32_t t = token_val.value(); + + bool ok = true; + + ok = ok && 0 <= t; + ok = ok && t < tokens.size(); + ok = ok && 0 == Kokkos::atomic_fetch_add(&verify(t), 1); + + Kokkos::atomic_fetch_add(&counts(t), 1); + + ok = ok && 1 == Kokkos::atomic_fetch_add(&verify(t), -1); + + if (!ok) { + Kokkos::atomic_fetch_add(&errors(0), 1); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(count_test_start_tag, long) const { + constexpr int R = 10; + int id = tokens.acquire(); + for (int j = 0; j < R; j++) counts(id)++; + tokens.release(id); + } + + KOKKOS_INLINE_FUNCTION + void operator()(count_test_check_tag, long i, int64_t& lsum) const { + lsum += 
counts(i); + } + + TestUniqueToken() + : tokens(execution_space()), + verify("TestUniqueTokenVerify", tokens.size()), + counts("TestUniqueTokenCounts", tokens.size()), + errors("TestUniqueTokenErrors", 1) {} + + static void run() { + using policy = Kokkos::RangePolicy<execution_space>; + + TestUniqueToken self; + + { + const int duplicate = 100; + const long n = duplicate * self.tokens.size(); + + Kokkos::parallel_for(policy(0, n), self); + Kokkos::parallel_for(policy(0, n), self); + Kokkos::parallel_for(policy(0, n), self); + Kokkos::fence(); + } + + typename view_type::HostMirror host_counts = + Kokkos::create_mirror_view(self.counts); + + Kokkos::deep_copy(host_counts, self.counts); + + int32_t max = 0; + + { + const long n = host_counts.extent(0); + for (long i = 0; i < n; ++i) { + if (max < host_counts[i]) max = host_counts[i]; + } + } + + // FIXME_SYCL wrong result on NVIDIA GPUs but correct on host and Intel GPUs +#ifndef KOKKOS_ENABLE_SYCL + // Count test for pull request #3260 + { + constexpr int N = 1000000; + constexpr int R = 10; + int num = self.tokens.size(); + Kokkos::resize(self.counts, num); + Kokkos::deep_copy(self.counts, 0); + Kokkos::parallel_for( + "Start", Kokkos::RangePolicy<Space, count_test_start_tag>(0, N), + self); + int64_t sum = 0; + Kokkos::parallel_reduce( + "Check", Kokkos::RangePolicy<Space, count_test_check_tag>(0, num), + self, sum); + ASSERT_EQ(sum, int64_t(N) * R); + } +#endif + + std::cout << "TestUniqueToken max reuse = " << max << std::endl; + + typename view_type::HostMirror host_errors = + Kokkos::create_mirror_view(self.errors); + + Kokkos::deep_copy(host_errors, self.errors); + + ASSERT_EQ(host_errors(0), 0); + } +}; + +TEST(TEST_CATEGORY, unique_token_global) { + TestUniqueToken<TEST_EXECSPACE, + Kokkos::Experimental::UniqueTokenScope::Global>::run(); +} + +TEST(TEST_CATEGORY, unique_token_instance) { + TestUniqueToken<TEST_EXECSPACE, + Kokkos::Experimental::UniqueTokenScope::Instance>::run(); +} + +template <class 
Space> +class TestAcquireTeamUniqueToken { + public: + using execution_space = typename Space::execution_space; + using view_type = Kokkos::View<int*, execution_space>; + using scratch_view = + Kokkos::View<int, typename execution_space::scratch_memory_space, + Kokkos::MemoryUnmanaged>; + using team_policy_type = Kokkos::TeamPolicy<execution_space>; + using team_member_type = typename team_policy_type::member_type; + using tokens_type = Kokkos::Experimental::UniqueToken<execution_space>; + + tokens_type tokens; + + view_type verify; + view_type counts; + view_type errors; + + KOKKOS_INLINE_FUNCTION + void operator()(team_member_type team) const { + Kokkos::Experimental::AcquireTeamUniqueToken<team_policy_type> token_val( + tokens, team); + scratch_view team_rank_0_token_val(team.team_scratch(0)); + const int32_t t = token_val.value(); + + bool ok = true; + + ok = ok && 0 <= t; + ok = ok && t < tokens.size(); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { + ok = ok && 0 == Kokkos::atomic_fetch_add(&verify(t), 1); + + Kokkos::atomic_fetch_add(&counts(t), 1); + + ok = ok && 1 == Kokkos::atomic_fetch_add(&verify(t), -1); + }); + + if (team.team_rank() == 0) { + team_rank_0_token_val() = t; + } + team.team_barrier(); + ok = ok && team_rank_0_token_val() == t; + + if (!ok) { + Kokkos::atomic_fetch_add(&errors(0), 1); + } + } + + TestAcquireTeamUniqueToken(int team_size) + : tokens(execution_space::concurrency() / team_size, execution_space()), + verify("TestAcquireTeamUniqueTokenVerify", tokens.size()), + counts("TestAcquireTeamUniqueTokenCounts", tokens.size()), + errors("TestAcquireTeamUniqueTokenErrors", 1) {} + + static void run() { + const int max_team_size = team_policy_type(1, 1).team_size_max( + TestAcquireTeamUniqueToken(1), Kokkos::ParallelForTag()); + const int team_size = std::min(2, max_team_size); + TestAcquireTeamUniqueToken self(team_size); + + { + const int duplicate = 100; + // FIXME_SYCL The number of workgroups on CUDA devices can not be larger + 
// than 65535 +#ifdef KOKKOS_ENABLE_SYCL + const long n = std::min(65535, duplicate * self.tokens.size()); +#else + const long n = duplicate * self.tokens.size(); +#endif + + team_policy_type team_policy(n, team_size); + team_policy.set_scratch_size( + 0, Kokkos::PerTeam(Kokkos::Experimental::AcquireTeamUniqueToken< + team_policy_type>::shmem_size() + + scratch_view::shmem_size())); + + Kokkos::parallel_for(team_policy, self); + Kokkos::fence(); + } + + typename view_type::HostMirror host_counts = + Kokkos::create_mirror_view(self.counts); + + Kokkos::deep_copy(host_counts, self.counts); + + int32_t max = 0; + + { + const long n = host_counts.extent(0); + for (long i = 0; i < n; ++i) { + if (max < host_counts[i]) max = host_counts[i]; + } + } + + std::cout << "TestAcquireTeamUniqueToken max reuse = " << max << std::endl; + + typename view_type::HostMirror host_errors = + Kokkos::create_mirror_view(self.errors); + + Kokkos::deep_copy(host_errors, self.errors); + + ASSERT_EQ(host_errors(0), 0); + } +}; + +TEST(TEST_CATEGORY, acquire_team_unique_token) { + // FIXME_OPENMPTARGET - Not yet implemented. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) + TestAcquireTeamUniqueToken<TEST_EXECSPACE>::run(); +#endif +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestUtilities.hpp b/packages/kokkos/core/unit_test/TestUtilities.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1d3e19da105161e0b71c733ad2bb1232add1d8aa --- /dev/null +++ b/packages/kokkos/core/unit_test/TestUtilities.hpp @@ -0,0 +1,91 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +void test_is_specialization_of() { + using Kokkos::Impl::is_specialization_of; + static_assert(is_specialization_of<Kokkos::pair<float, int>, Kokkos::pair>{}, + ""); + static_assert(!is_specialization_of<Kokkos::View<int*>, Kokkos::pair>{}, ""); + static_assert(is_specialization_of<Kokkos::View<int*>, Kokkos::View>{}, ""); + // NOTE Not removing cv-qualifiers + static_assert(!is_specialization_of<Kokkos::View<int*> const, Kokkos::View>{}, + ""); + // NOTE Would not compile because Kokkos::Array takes a non-type template + // parameter + // static_assert(is_specialization_of<Kokkos::Array<int, 4>, Kokkos::Array>{}, + // ""); + // But this is fine of course + static_assert(!is_specialization_of<Kokkos::Array<float, 2>, Kokkos::pair>{}, + ""); +} + +template <std::size_t... Idxs, class... Args> +std::size_t do_comma_emulation_test(std::integer_sequence<std::size_t, Idxs...>, + Args... 
args) { + // Count the bugs, since ASSERT_EQ is a statement and not an expression + std::size_t bugs = 0; + // Ensure in-order evaluation + std::size_t i = 0; + KOKKOS_IMPL_FOLD_COMMA_OPERATOR(bugs += std::size_t(Idxs != i++) /*, ...*/); + // Ensure expansion of multiple packs works + KOKKOS_IMPL_FOLD_COMMA_OPERATOR(bugs += std::size_t(Idxs != args) /*, ...*/); + return bugs; +} + +TEST(utilities, comma_operator_emulation) { + ASSERT_EQ( + 0, do_comma_emulation_test(std::make_index_sequence<5>{}, 0, 1, 2, 3, 4)); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp new file mode 100644 index 0000000000000000000000000000000000000000..570281f9fd66a230e69b9bb924a84a0078e12168 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp @@ -0,0 +1,1517 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> + +namespace Test { + +template <class T, class... P> +size_t allocation_count(const Kokkos::View<T, P...> &view) { + const size_t card = view.size(); + const size_t alloc = view.span(); + + const int memory_span = Kokkos::View<int *>::required_allocation_size(100); + + return (card <= alloc && memory_span == 400) ? 
alloc : 0; +} + +/*--------------------------------------------------------------------------*/ + +template <typename T, class DeviceType> +struct TestViewOperator { + using execution_space = typename DeviceType::execution_space; + + enum { N = 1000 }; + enum { D = 3 }; + + using view_type = Kokkos::View<T * [D], execution_space>; + + const view_type v1; + const view_type v2; + + TestViewOperator() : v1("v1", N), v2("v2", N) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const unsigned i) const { + const unsigned X = 0; + const unsigned Y = 1; + const unsigned Z = 2; + + v2(i, X) = v1(i, X); + v2(i, Y) = v1(i, Y); + v2(i, Z) = v1(i, Z); + } +}; + +/*--------------------------------------------------------------------------*/ + +template <class DataType, class DeviceType, + unsigned Rank = Kokkos::ViewTraits<DataType>::rank> +struct TestViewOperator_LeftAndRight; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 8> { + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { update = 0; } + + using left_view = Kokkos::View<DataType, Kokkos::LayoutLeft, execution_space>; + using right_view = + Kokkos::View<DataType, Kokkos::LayoutRight, execution_space>; + using stride_view = + Kokkos::View<DataType, Kokkos::LayoutStride, execution_space>; + + left_view left; + right_view right; + stride_view left_stride; + stride_view right_stride; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight() + : left("left"), + right("right"), + left_stride(left), + right_stride(right), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + 
+ void testit() { + int error_flag = 0; + + Kokkos::parallel_reduce(1, *this, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type &update) const { + long offset = -1; + + for (unsigned i7 = 0; i7 < unsigned(left.extent(7)); ++i7) + for (unsigned i6 = 0; i6 < unsigned(left.extent(6)); ++i6) + for (unsigned i5 = 0; i5 < unsigned(left.extent(5)); ++i5) + for (unsigned i4 = 0; i4 < unsigned(left.extent(4)); ++i4) + for (unsigned i3 = 0; i3 < unsigned(left.extent(3)); ++i3) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1, i2, i3, i4, i5, i6, i7) - + &left(0, 0, 0, 0, 0, 0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + + if (&left(i0, i1, i2, i3, i4, i5, i6, i7) != + &left_stride(i0, i1, i2, i3, i4, i5, i6, i7)) { + update |= 4; + } + } + + offset = -1; + + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) + for (unsigned i3 = 0; i3 < unsigned(right.extent(3)); ++i3) + for (unsigned i4 = 0; i4 < unsigned(right.extent(4)); ++i4) + for (unsigned i5 = 0; i5 < unsigned(right.extent(5)); ++i5) + for (unsigned i6 = 0; i6 < unsigned(right.extent(6)); ++i6) + for (unsigned i7 = 0; i7 < unsigned(right.extent(7)); ++i7) { + const long j = &right(i0, i1, i2, i3, i4, i5, i6, i7) - + &right(0, 0, 0, 0, 0, 0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + + if (&right(i0, i1, i2, i3, i4, i5, i6, i7) != + &right_stride(i0, i1, i2, i3, i4, i5, i6, i7)) { + update |= 8; + } + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> { + using execution_space = typename 
DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { update = 0; } + + using left_view = Kokkos::View<DataType, Kokkos::LayoutLeft, execution_space>; + using right_view = + Kokkos::View<DataType, Kokkos::LayoutRight, execution_space>; + + left_view left; + right_view right; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight() + : left("left"), + right("right"), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + void testit() { + int error_flag = 0; + + Kokkos::parallel_reduce(1, *this, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type &update) const { + long offset = -1; + + for (unsigned i6 = 0; i6 < unsigned(left.extent(6)); ++i6) + for (unsigned i5 = 0; i5 < unsigned(left.extent(5)); ++i5) + for (unsigned i4 = 0; i4 < unsigned(left.extent(4)); ++i4) + for (unsigned i3 = 0; i3 < unsigned(left.extent(3)); ++i3) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1, i2, i3, i4, i5, i6) - + &left(0, 0, 0, 0, 0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + } + + offset = -1; + + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) + for (unsigned i3 = 0; i3 < unsigned(right.extent(3)); ++i3) + for (unsigned i4 = 0; i4 < unsigned(right.extent(4)); ++i4) + for (unsigned i5 = 0; i5 < 
unsigned(right.extent(5)); ++i5) + for (unsigned i6 = 0; i6 < unsigned(right.extent(6)); ++i6) { + const long j = &right(i0, i1, i2, i3, i4, i5, i6) - + &right(0, 0, 0, 0, 0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> { + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { update = 0; } + + using left_view = Kokkos::View<DataType, Kokkos::LayoutLeft, execution_space>; + using right_view = + Kokkos::View<DataType, Kokkos::LayoutRight, execution_space>; + + left_view left; + right_view right; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight() + : left("left"), + right("right"), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + void testit() { + int error_flag = 0; + + Kokkos::parallel_reduce(1, *this, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type &update) const { + long offset = -1; + + for (unsigned i5 = 0; i5 < unsigned(left.extent(5)); ++i5) + for (unsigned i4 = 0; i4 < unsigned(left.extent(4)); ++i4) + for (unsigned i3 = 0; i3 < unsigned(left.extent(3)); ++i3) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = + &left(i0, i1, i2, i3, i4, i5) - &left(0, 0, 0, 0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + } + + offset = -1; + + for (unsigned 
i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) + for (unsigned i3 = 0; i3 < unsigned(right.extent(3)); ++i3) + for (unsigned i4 = 0; i4 < unsigned(right.extent(4)); ++i4) + for (unsigned i5 = 0; i5 < unsigned(right.extent(5)); ++i5) { + const long j = + &right(i0, i1, i2, i3, i4, i5) - &right(0, 0, 0, 0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> { + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { update = 0; } + + using left_view = Kokkos::View<DataType, Kokkos::LayoutLeft, execution_space>; + using right_view = + Kokkos::View<DataType, Kokkos::LayoutRight, execution_space>; + using stride_view = + Kokkos::View<DataType, Kokkos::LayoutStride, execution_space>; + + left_view left; + right_view right; + stride_view left_stride; + stride_view right_stride; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight() + : left("left"), + right("right"), + left_stride(left), + right_stride(right), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + void testit() { + int error_flag = 0; + + Kokkos::parallel_reduce(1, *this, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type &update) const { + long offset = -1; + + for (unsigned i4 = 0; i4 < unsigned(left.extent(4)); ++i4) + for (unsigned i3 = 0; i3 < unsigned(left.extent(3)); 
++i3) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1, i2, i3, i4) - &left(0, 0, 0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + + if (&left(i0, i1, i2, i3, i4) != + &left_stride(i0, i1, i2, i3, i4)) { + update |= 4; + } + } + + offset = -1; + + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) + for (unsigned i3 = 0; i3 < unsigned(right.extent(3)); ++i3) + for (unsigned i4 = 0; i4 < unsigned(right.extent(4)); ++i4) { + const long j = &right(i0, i1, i2, i3, i4) - &right(0, 0, 0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + + if (&right(i0, i1, i2, i3, i4) != + &right_stride(i0, i1, i2, i3, i4)) { + update |= 8; + } + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> { + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { update = 0; } + + using left_view = Kokkos::View<DataType, Kokkos::LayoutLeft, execution_space>; + using right_view = + Kokkos::View<DataType, Kokkos::LayoutRight, execution_space>; + + left_view left; + right_view right; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight() + : left("left"), + right("right"), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + void testit() { + int error_flag = 0; + + 
Kokkos::parallel_reduce(1, *this, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type &update) const { + long offset = -1; + + for (unsigned i3 = 0; i3 < unsigned(left.extent(3)); ++i3) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1, i2, i3) - &left(0, 0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + } + + offset = -1; + + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) + for (unsigned i3 = 0; i3 < unsigned(right.extent(3)); ++i3) { + const long j = &right(i0, i1, i2, i3) - &right(0, 0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> { + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { update = 0; } + + using left_view = Kokkos::View<DataType, Kokkos::LayoutLeft, execution_space>; + using right_view = + Kokkos::View<DataType, Kokkos::LayoutRight, execution_space>; + using stride_view = + Kokkos::View<DataType, Kokkos::LayoutStride, execution_space>; + + left_view left; + right_view right; + stride_view left_stride; + stride_view right_stride; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight() + : left(std::string("left")), + 
right(std::string("right")), + left_stride(left), + right_stride(right), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + void testit() { + int error_flag = 0; + + Kokkos::parallel_reduce(1, *this, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type &update) const { + long offset = -1; + + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1, i2) - &left(0, 0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + + if (&left(i0, i1, i2) != &left_stride(i0, i1, i2)) { + update |= 4; + } + } + + offset = -1; + + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(right.extent(2)); ++i2) { + const long j = &right(i0, i1, i2) - &right(0, 0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + + if (&right(i0, i1, i2) != &right_stride(i0, i1, i2)) { + update |= 8; + } + } + + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i2 = 0; i2 < unsigned(left.extent(2)); ++i2) { + if (&left(i0, i1, i2) != &left.access(i0, i1, i2, 0, 0, 0, 0, 0)) { + update |= 3; + } + if (&right(i0, i1, i2) != &right.access(i0, i1, i2, 0, 0, 0, 0, 0)) { + update |= 3; + } + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> { + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile 
value_type &input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { update = 0; } + + using left_view = Kokkos::View<DataType, Kokkos::LayoutLeft, execution_space>; + using right_view = + Kokkos::View<DataType, Kokkos::LayoutRight, execution_space>; + + left_view left; + right_view right; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight() + : left("left"), + right("right"), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + void testit() { + int error_flag = 0; + + Kokkos::parallel_reduce(1, *this, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type &update) const { + long offset = -1; + + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + const long j = &left(i0, i1) - &left(0, 0); + if (j <= offset || left_alloc <= j) { + update |= 1; + } + offset = j; + } + + offset = -1; + + for (unsigned i0 = 0; i0 < unsigned(right.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(right.extent(1)); ++i1) { + const long j = &right(i0, i1) - &right(0, 0); + if (j <= offset || right_alloc <= j) { + update |= 2; + } + offset = j; + } + + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) + for (unsigned i1 = 0; i1 < unsigned(left.extent(1)); ++i1) { + if (&left(i0, i1) != &left.access(i0, i1, 0, 0, 0, 0, 0, 0)) { + update |= 3; + } + if (&right(i0, i1) != &right.access(i0, i1, 0, 0, 0, 0, 0, 0)) { + update |= 3; + } + } + } +}; + +template <class DataType, class DeviceType> +struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> { + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using size_type = typename execution_space::size_type; + + using value_type = int; + + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const 
volatile value_type &input) { + update |= input; + } + + KOKKOS_INLINE_FUNCTION + static void init(value_type &update) { update = 0; } + + using left_view = Kokkos::View<DataType, Kokkos::LayoutLeft, execution_space>; + using right_view = + Kokkos::View<DataType, Kokkos::LayoutRight, execution_space>; + using stride_view = + Kokkos::View<DataType, Kokkos::LayoutStride, execution_space>; + + left_view left; + right_view right; + stride_view left_stride; + stride_view right_stride; + long left_alloc; + long right_alloc; + + TestViewOperator_LeftAndRight() + : left("left"), + right("right"), + left_stride(left), + right_stride(right), + left_alloc(allocation_count(left)), + right_alloc(allocation_count(right)) {} + + void testit() { + TestViewOperator_LeftAndRight driver; + + int error_flag = 0; + + Kokkos::parallel_reduce(1, *this, error_flag); + + ASSERT_EQ(error_flag, 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type, value_type &update) const { + for (unsigned i0 = 0; i0 < unsigned(left.extent(0)); ++i0) { + if (&left(i0) != &left.access(i0, 0, 0, 0, 0, 0, 0, 0)) { + update |= 3; + } + if (&right(i0) != &right.access(i0, 0, 0, 0, 0, 0, 0, 0)) { + update |= 3; + } + if (&left(i0) != &left_stride(i0)) { + update |= 4; + } + if (&right(i0) != &right_stride(i0)) { + update |= 8; + } + } + } +}; + +template <class Layout, class DeviceType> +struct TestViewMirror { + template <class MemoryTraits> + void static test_mirror() { + Kokkos::View<double *, Layout, Kokkos::HostSpace> a_org("A", 1000); + Kokkos::View<double *, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org; + auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(), a_h); + auto a_d = Kokkos::create_mirror(DeviceType(), a_h); + + int equal_ptr_h_h2 = (a_h.data() == a_h2.data()) ? 1 : 0; + int equal_ptr_h_d = (a_h.data() == a_d.data()) ? 1 : 0; + int equal_ptr_h2_d = (a_h2.data() == a_d.data()) ? 
1 : 0; + + ASSERT_EQ(equal_ptr_h_h2, 0); + ASSERT_EQ(equal_ptr_h_d, 0); + ASSERT_EQ(equal_ptr_h2_d, 0); + + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + } + + template <class MemoryTraits> + void static test_mirror_view() { + Kokkos::View<double *, Layout, Kokkos::HostSpace> a_org("A", 1000); + Kokkos::View<double *, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org; + auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), a_h); + auto a_d = Kokkos::create_mirror_view(DeviceType(), a_h); + + int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0; + int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0; + int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; + + int is_same_memspace = + std::is_same<Kokkos::HostSpace, + typename DeviceType::memory_space>::value + ? 1 + : 0; + ASSERT_EQ(equal_ptr_h_h2, 1); + ASSERT_EQ(equal_ptr_h_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); + + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + } + + template <class MemoryTraits> + void static test_mirror_copy() { + Kokkos::View<double *, Layout, Kokkos::HostSpace> a_org("A", 10); + a_org(5) = 42.0; + Kokkos::View<double *, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org; + auto a_h2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_h); + auto a_d = Kokkos::create_mirror_view_and_copy(DeviceType(), a_h); + auto a_h3 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_d); + + int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0; + int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0; + int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; + int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; + + int is_same_memspace = + std::is_same<Kokkos::HostSpace, + typename DeviceType::memory_space>::value + ? 
1 + : 0; + ASSERT_EQ(equal_ptr_h_h2, 1); + ASSERT_EQ(equal_ptr_h_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); + ASSERT_EQ(equal_ptr_h3_d, is_same_memspace); + + ASSERT_EQ(a_h.extent(0), a_h3.extent(0)); + ASSERT_EQ(a_h.extent(0), a_h2.extent(0)); + ASSERT_EQ(a_h.extent(0), a_d.extent(0)); + ASSERT_EQ(a_org(5), a_h3(5)); + } + + template <typename View> + static typename View::const_type view_const_cast(View const &v) { + return v; + } + + static void test_allocated() { + using ExecutionSpace = typename DeviceType::execution_space; + using dynamic_view = Kokkos::View<int *, ExecutionSpace>; + using static_view = Kokkos::View<int[5], ExecutionSpace>; + using unmanaged_view = + Kokkos::View<int *, ExecutionSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >; + int const N = 100; + + dynamic_view d1; + static_view s1; + unmanaged_view u1; + ASSERT_FALSE(d1.is_allocated()); + ASSERT_FALSE(s1.is_allocated()); + ASSERT_FALSE(u1.is_allocated()); + + d1 = dynamic_view("d1", N); + dynamic_view d2(d1); + dynamic_view d3("d3", N); + ASSERT_TRUE(d1.is_allocated()); + ASSERT_TRUE(d2.is_allocated()); + ASSERT_TRUE(d3.is_allocated()); + + s1 = static_view("s1"); + static_view s2(s1); + static_view s3("s3"); + ASSERT_TRUE(s1.is_allocated()); + ASSERT_TRUE(s2.is_allocated()); + ASSERT_TRUE(s3.is_allocated()); + + u1 = unmanaged_view(d1.data(), N); + unmanaged_view u2(u1); + unmanaged_view u3(d1.data(), N); + ASSERT_TRUE(u1.is_allocated()); + ASSERT_TRUE(u2.is_allocated()); + ASSERT_TRUE(u3.is_allocated()); + } + + static void test_mirror_copy_const_data_type() { + using ExecutionSpace = typename DeviceType::execution_space; + int const N = 100; + Kokkos::View<int *, ExecutionSpace> v("v", N); + Kokkos::deep_copy(v, 255); + auto v_m1 = Kokkos::create_mirror_view_and_copy( + Kokkos::DefaultHostExecutionSpace(), view_const_cast(v)); + auto v_m2 = Kokkos::create_mirror_view_and_copy(ExecutionSpace(), + view_const_cast(v)); + } + + template <class MemoryTraits, 
class Space> + struct CopyUnInit { + using mirror_view_type = typename Kokkos::Impl::MirrorViewType< + Space, double *, Layout, Kokkos::HostSpace, MemoryTraits>::view_type; + + mirror_view_type a_d; + + KOKKOS_INLINE_FUNCTION + CopyUnInit(mirror_view_type &a_d_) : a_d(a_d_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Space::size_type i) const { + a_d(i) = (double)(10 - i); + } + }; + + template <class MemoryTraits> + void static test_mirror_no_initialize() { + Kokkos::View<double *, Layout, Kokkos::HostSpace> a_org("A", 10); + Kokkos::View<double *, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org; + + for (int i = 0; i < 10; i++) { + a_h(i) = (double)i; + } + auto a_d = Kokkos::create_mirror_view(DeviceType(), a_h, + Kokkos::WithoutInitializing); + + int equal_ptr_h_d = (a_h.data() == a_d.data()) ? 1 : 0; + constexpr int is_same_memspace = + std::is_same<Kokkos::HostSpace, + typename DeviceType::memory_space>::value + ? 1 + : 0; + + ASSERT_EQ(equal_ptr_h_d, is_same_memspace); + + Kokkos::parallel_for( + Kokkos::RangePolicy<typename DeviceType::execution_space>(0, int(10)), + CopyUnInit<MemoryTraits, DeviceType>(a_d)); + + Kokkos::deep_copy(a_h, a_d); + + for (int i = 0; i < 10; i++) { + ASSERT_EQ(a_h(i), (double)(10 - i)); + } + } + + void static testit() { + test_mirror<Kokkos::MemoryTraits<0> >(); + test_mirror<Kokkos::MemoryTraits<Kokkos::Unmanaged> >(); + test_mirror_view<Kokkos::MemoryTraits<0> >(); + test_mirror_view<Kokkos::MemoryTraits<Kokkos::Unmanaged> >(); + test_mirror_copy<Kokkos::MemoryTraits<0> >(); + test_mirror_copy<Kokkos::MemoryTraits<Kokkos::Unmanaged> >(); + test_mirror_copy_const_data_type(); + test_allocated(); + test_mirror_no_initialize<Kokkos::MemoryTraits<0> >(); + test_mirror_no_initialize<Kokkos::MemoryTraits<Kokkos::Unmanaged> >(); + } +}; + +/*--------------------------------------------------------------------------*/ + +template <typename T, class DeviceType> +class TestViewAPI { + public: + using device = 
DeviceType; + + enum { N0 = 1000, N1 = 3, N2 = 5, N3 = 7 }; + + using dView0 = Kokkos::View<T, device>; + using dView1 = Kokkos::View<T *, device>; + using dView2 = Kokkos::View<T * [N1], device>; + using dView3 = Kokkos::View<T * [N1][N2], device>; + using dView4 = Kokkos::View<T * [N1][N2][N3], device>; + using const_dView4 = Kokkos::View<const T * [N1][N2][N3], device>; + using dView4_unmanaged = + Kokkos::View<T ****, device, Kokkos::MemoryUnmanaged>; + using host = typename dView0::host_mirror_space; + + static void run_test_view_operator_a() { + { + TestViewOperator<T, device> f; + Kokkos::parallel_for(int(N0), f); + Kokkos::fence(); + } +#ifndef KOKKOS_ENABLE_OPENMPTARGET + TestViewOperator_LeftAndRight<int[2][3][4][2][3][4], device> f6; + f6.testit(); + TestViewOperator_LeftAndRight<int[2][3][4][2][3], device> f5; + f5.testit(); + TestViewOperator_LeftAndRight<int[2][3][4][2], device> f4; + f4.testit(); + TestViewOperator_LeftAndRight<int[2][3][4], device> f3; + f3.testit(); + TestViewOperator_LeftAndRight<int[2][3], device> f2; + f2.testit(); + TestViewOperator_LeftAndRight<int[2], device> f1; + f1.testit(); +#endif + } + + static void run_test_view_operator_b() { +#ifndef KOKKOS_ENABLE_OPENMPTARGET + TestViewOperator_LeftAndRight<int[2][3][4][2][3][4][2], device> f7; + f7.testit(); +#endif + } + + static void run_test_view_operator_c() { +#ifndef KOKKOS_ENABLE_OPENMPTARGET + TestViewOperator_LeftAndRight<int[2][3][4][2][3][4][2][3], device> f8; + f8.testit(); +#endif + } + + static void run_test_mirror() { + using view_type = Kokkos::View<int, host>; + using mirror_type = typename view_type::HostMirror; + + static_assert(std::is_same<typename view_type::memory_space, + typename mirror_type::memory_space>::value, + ""); + + view_type a("a"); + mirror_type am = Kokkos::create_mirror_view(a); + mirror_type ax = Kokkos::create_mirror(a); + ASSERT_EQ(&a(), &am()); + + TestViewMirror<Kokkos::LayoutLeft, device>::testit(); + TestViewMirror<Kokkos::LayoutRight, 
device>::testit(); + } + + static void run_test_scalar() { + using hView0 = typename dView0::HostMirror; + + dView0 dx, dy; + hView0 hx, hy; + + dx = dView0("dx"); + dy = dView0("dy"); + + hx = Kokkos::create_mirror(dx); + hy = Kokkos::create_mirror(dy); + + hx() = 1; + + Kokkos::deep_copy(dx, hx); + Kokkos::deep_copy(dy, dx); + Kokkos::deep_copy(hy, dy); +#ifndef KOKKOS_ENABLE_OPENMPTARGET + ASSERT_EQ(hx(), hy()); +#endif + } + + static void run_test() { + // mfh 14 Feb 2014: This test doesn't actually create instances of + // these types. In order to avoid "unused type alias" + // warnings, we declare empty instances of these types, with the + // usual "(void)" marker to avoid compiler warnings for unused + // variables. + + using hView0 = typename dView0::HostMirror; + using hView1 = typename dView1::HostMirror; + using hView2 = typename dView2::HostMirror; + using hView3 = typename dView3::HostMirror; + using hView4 = typename dView4::HostMirror; + + { + hView0 thing; + (void)thing; + } + { + hView1 thing; + (void)thing; + } + { + hView2 thing; + (void)thing; + } + { + hView3 thing; + (void)thing; + } + { + hView4 thing; + (void)thing; + } + + dView4 dx, dy, dz; + hView4 hx, hy, hz; + + ASSERT_TRUE(dx.data() == nullptr); + ASSERT_TRUE(dy.data() == nullptr); + ASSERT_TRUE(dz.data() == nullptr); + ASSERT_TRUE(hx.data() == nullptr); + ASSERT_TRUE(hy.data() == nullptr); + ASSERT_TRUE(hz.data() == nullptr); + ASSERT_EQ(dx.extent(0), 0u); + ASSERT_EQ(dy.extent(0), 0u); + ASSERT_EQ(dz.extent(0), 0u); + ASSERT_EQ(hx.extent(0), 0u); + ASSERT_EQ(hy.extent(0), 0u); + ASSERT_EQ(hz.extent(0), 0u); + ASSERT_EQ(dx.extent(1), unsigned(N1)); + ASSERT_EQ(dy.extent(1), unsigned(N1)); + ASSERT_EQ(dz.extent(1), unsigned(N1)); + ASSERT_EQ(hx.extent(1), unsigned(N1)); + ASSERT_EQ(hy.extent(1), unsigned(N1)); + ASSERT_EQ(hz.extent(1), unsigned(N1)); + + dx = dView4("dx", N0); + dy = dView4("dy", N0); + + ASSERT_EQ(dx.use_count(), size_t(1)); + + dView4_unmanaged unmanaged_dx = dx; + 
ASSERT_EQ(dx.use_count(), size_t(1)); + + dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged( + dx.data(), dx.extent(0), dx.extent(1), dx.extent(2), dx.extent(3)); + + { + // Destruction of this view should be harmless. + + const_dView4 unmanaged_from_ptr_const_dx(dx.data(), dx.extent(0)); + } + + const_dView4 const_dx = dx; + ASSERT_EQ(dx.use_count(), size_t(2)); + + { + const_dView4 const_dx2; + const_dx2 = const_dx; + ASSERT_EQ(dx.use_count(), size_t(3)); + + const_dx2 = dy; + ASSERT_EQ(dx.use_count(), size_t(2)); + + const_dView4 const_dx3(dx); + ASSERT_EQ(dx.use_count(), size_t(3)); + + dView4_unmanaged dx4_unmanaged(dx); + ASSERT_EQ(dx.use_count(), size_t(3)); + } + + ASSERT_EQ(dx.use_count(), size_t(2)); + + ASSERT_FALSE(dx.data() == nullptr); + ASSERT_FALSE(const_dx.data() == nullptr); + ASSERT_FALSE(unmanaged_dx.data() == nullptr); + ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr); + ASSERT_FALSE(dy.data() == nullptr); + ASSERT_NE(dx, dy); + + ASSERT_EQ(dx.extent(0), unsigned(N0)); + ASSERT_EQ(dx.extent(1), unsigned(N1)); + ASSERT_EQ(dx.extent(2), unsigned(N2)); + ASSERT_EQ(dx.extent(3), unsigned(N3)); + + ASSERT_EQ(dy.extent(0), unsigned(N0)); + ASSERT_EQ(dy.extent(1), unsigned(N1)); + ASSERT_EQ(dy.extent(2), unsigned(N2)); + ASSERT_EQ(dy.extent(3), unsigned(N3)); + + ASSERT_EQ(unmanaged_from_ptr_dx.span(), + unsigned(N0) * unsigned(N1) * unsigned(N2) * unsigned(N3)); +#ifdef KOKKOS_ENABLE_OPENMPTARGET + return; +#endif + hx = Kokkos::create_mirror(dx); + hy = Kokkos::create_mirror(dy); + + // T v1 = hx(); // Generates compile error as intended. + // T v2 = hx( 0, 0 ); // Generates compile error as intended. + // hx( 0, 0 ) = v2; // Generates compile error as intended. 
+ + // Testing with asynchronous deep copy with respect to device + { + size_t count = 0; + + for (size_t ip = 0; ip < N0; ++ip) + for (size_t i1 = 0; i1 < hx.extent(1); ++i1) + for (size_t i2 = 0; i2 < hx.extent(2); ++i2) + for (size_t i3 = 0; i3 < hx.extent(3); ++i3) { + hx(ip, i1, i2, i3) = ++count; + } + + Kokkos::deep_copy(typename hView4::execution_space(), dx, hx); + Kokkos::deep_copy(typename hView4::execution_space(), dy, dx); + Kokkos::deep_copy(typename hView4::execution_space(), hy, dy); + typename dView4::execution_space().fence(); + + for (size_t ip = 0; ip < N0; ++ip) + for (size_t i1 = 0; i1 < N1; ++i1) + for (size_t i2 = 0; i2 < N2; ++i2) + for (size_t i3 = 0; i3 < N3; ++i3) { + ASSERT_EQ(hx(ip, i1, i2, i3), hy(ip, i1, i2, i3)); + } + + Kokkos::deep_copy(typename hView4::execution_space(), dx, T(0)); + Kokkos::deep_copy(typename hView4::execution_space(), hx, dx); + typename dView4::execution_space().fence(); + + for (size_t ip = 0; ip < N0; ++ip) + for (size_t i1 = 0; i1 < N1; ++i1) + for (size_t i2 = 0; i2 < N2; ++i2) + for (size_t i3 = 0; i3 < N3; ++i3) { + ASSERT_EQ(hx(ip, i1, i2, i3), T(0)); + } + } + + // Testing with asynchronous deep copy with respect to host. 
+ { + size_t count = 0; + + for (size_t ip = 0; ip < N0; ++ip) + for (size_t i1 = 0; i1 < hx.extent(1); ++i1) + for (size_t i2 = 0; i2 < hx.extent(2); ++i2) + for (size_t i3 = 0; i3 < hx.extent(3); ++i3) { + hx(ip, i1, i2, i3) = ++count; + } + + Kokkos::deep_copy(typename dView4::execution_space(), dx, hx); + Kokkos::deep_copy(typename dView4::execution_space(), dy, dx); + Kokkos::deep_copy(typename dView4::execution_space(), hy, dy); + typename dView4::execution_space().fence(); + + for (size_t ip = 0; ip < N0; ++ip) + for (size_t i1 = 0; i1 < N1; ++i1) + for (size_t i2 = 0; i2 < N2; ++i2) + for (size_t i3 = 0; i3 < N3; ++i3) { + ASSERT_EQ(hx(ip, i1, i2, i3), hy(ip, i1, i2, i3)); + } + + Kokkos::deep_copy(typename dView4::execution_space(), dx, T(0)); + Kokkos::deep_copy(typename dView4::execution_space(), hx, dx); + typename dView4::execution_space().fence(); + + for (size_t ip = 0; ip < N0; ++ip) + for (size_t i1 = 0; i1 < N1; ++i1) + for (size_t i2 = 0; i2 < N2; ++i2) + for (size_t i3 = 0; i3 < N3; ++i3) { + ASSERT_EQ(hx(ip, i1, i2, i3), T(0)); + } + } + + // Testing with synchronous deep copy. 
+ { + size_t count = 0; + + for (size_t ip = 0; ip < N0; ++ip) + for (size_t i1 = 0; i1 < hx.extent(1); ++i1) + for (size_t i2 = 0; i2 < hx.extent(2); ++i2) + for (size_t i3 = 0; i3 < hx.extent(3); ++i3) { + hx(ip, i1, i2, i3) = ++count; + } + + Kokkos::deep_copy(dx, hx); + Kokkos::deep_copy(dy, dx); + Kokkos::deep_copy(hy, dy); + + for (size_t ip = 0; ip < N0; ++ip) + for (size_t i1 = 0; i1 < N1; ++i1) + for (size_t i2 = 0; i2 < N2; ++i2) + for (size_t i3 = 0; i3 < N3; ++i3) { + ASSERT_EQ(hx(ip, i1, i2, i3), hy(ip, i1, i2, i3)); + } + + Kokkos::deep_copy(dx, T(0)); + Kokkos::deep_copy(hx, dx); + + for (size_t ip = 0; ip < N0; ++ip) + for (size_t i1 = 0; i1 < N1; ++i1) + for (size_t i2 = 0; i2 < N2; ++i2) + for (size_t i3 = 0; i3 < N3; ++i3) { + ASSERT_EQ(hx(ip, i1, i2, i3), T(0)); + } + } + + dz = dx; + ASSERT_EQ(dx, dz); + ASSERT_NE(dy, dz); + + dz = dy; + ASSERT_EQ(dy, dz); + ASSERT_NE(dx, dz); + + dx = dView4(); + ASSERT_TRUE(dx.data() == nullptr); + ASSERT_FALSE(dy.data() == nullptr); + ASSERT_FALSE(dz.data() == nullptr); + + dy = dView4(); + ASSERT_TRUE(dx.data() == nullptr); + ASSERT_TRUE(dy.data() == nullptr); + ASSERT_FALSE(dz.data() == nullptr); + + dz = dView4(); + ASSERT_TRUE(dx.data() == nullptr); + ASSERT_TRUE(dy.data() == nullptr); + ASSERT_TRUE(dz.data() == nullptr); + } + + static void run_test_deep_copy_empty() { + // Check Deep Copy of LayoutLeft to LayoutRight + { + Kokkos::View<double *, Kokkos::LayoutLeft> dll("dll", 10); + Kokkos::View<double *, Kokkos::LayoutRight, Kokkos::HostSpace> hlr("hlr", + 10); + Kokkos::deep_copy(dll, hlr); + Kokkos::deep_copy(hlr, dll); + } + + // Check Deep Copy of two empty 1D views + { + Kokkos::View<double *> d; + Kokkos::View<double *, Kokkos::HostSpace> h; + Kokkos::deep_copy(d, h); + Kokkos::deep_copy(h, d); + } + + // Check Deep Copy of two empty 2D views + { + Kokkos::View<double * [3], Kokkos::LayoutRight> d; + Kokkos::View<double * [3], Kokkos::LayoutRight, Kokkos::HostSpace> h; + Kokkos::deep_copy(d, h); 
+ Kokkos::deep_copy(h, d); + } + } + + using DataType = T[2]; + + static void check_auto_conversion_to_const( + const Kokkos::View<const DataType, device> &arg_const, + const Kokkos::View<DataType, device> &arg) { + ASSERT_TRUE(arg_const == arg); + } + + static void run_test_const() { + using typeX = Kokkos::View<DataType, device>; + using const_typeX = Kokkos::View<const DataType, device>; + using const_typeR = + Kokkos::View<const DataType, device, Kokkos::MemoryRandomAccess>; + + typeX x("X"); + const_typeX xc = x; + const_typeR xr = x; + + ASSERT_TRUE(xc == x); + ASSERT_TRUE(x == xc); + + // For CUDA the constant random access View does not return + // an lvalue reference due to retrieving through texture cache + // therefore not allowed to query the underlying pointer. +#if defined(KOKKOS_ENABLE_CUDA) + if (!std::is_same<typename device::execution_space, Kokkos::Cuda>::value) +#endif + { + ASSERT_TRUE(x.data() == xr.data()); + } + + // typeX xf = xc; // Setting non-const from const must not compile. 
+ + check_auto_conversion_to_const(x, x); + } + + static void run_test_subview() { + using sView = Kokkos::View<const T, device>; + + dView0 d0("d0"); + dView1 d1("d1", N0); + dView2 d2("d2", N0); + dView3 d3("d3", N0); + dView4 d4("d4", N0); + + sView s0 = d0; + sView s1 = Kokkos::subview(d1, 1); + sView s2 = Kokkos::subview(d2, 1, 1); + sView s3 = Kokkos::subview(d3, 1, 1, 1); + sView s4 = Kokkos::subview(d4, 1, 1, 1, 1); + } + + static void run_test_subview_strided() { + using view_left_4 = Kokkos::View<int ****, Kokkos::LayoutLeft, host>; + using view_right_4 = Kokkos::View<int ****, Kokkos::LayoutRight, host>; + using view_left_2 = Kokkos::View<int **, Kokkos::LayoutLeft, host>; + using view_right_2 = Kokkos::View<int **, Kokkos::LayoutRight, host>; + + using view_stride_1 = Kokkos::View<int *, Kokkos::LayoutStride, host>; + using view_stride_2 = Kokkos::View<int **, Kokkos::LayoutStride, host>; + + view_left_2 xl2("xl2", 100, 200); + view_right_2 xr2("xr2", 100, 200); + view_stride_1 yl1 = Kokkos::subview(xl2, 0, Kokkos::ALL()); + view_stride_1 yl2 = Kokkos::subview(xl2, 1, Kokkos::ALL()); + view_stride_1 yr1 = Kokkos::subview(xr2, 0, Kokkos::ALL()); + view_stride_1 yr2 = Kokkos::subview(xr2, 1, Kokkos::ALL()); + + ASSERT_EQ(yl1.extent(0), xl2.extent(1)); + ASSERT_EQ(yl2.extent(0), xl2.extent(1)); + ASSERT_EQ(yr1.extent(0), xr2.extent(1)); + ASSERT_EQ(yr2.extent(0), xr2.extent(1)); + + ASSERT_EQ(&yl1(0) - &xl2(0, 0), 0); + ASSERT_EQ(&yl2(0) - &xl2(1, 0), 0); + ASSERT_EQ(&yr1(0) - &xr2(0, 0), 0); + ASSERT_EQ(&yr2(0) - &xr2(1, 0), 0); + + view_left_4 xl4("xl4", 10, 20, 30, 40); + view_right_4 xr4("xr4", 10, 20, 30, 40); + + view_stride_2 yl4 = + Kokkos::subview(xl4, 1, Kokkos::ALL(), 2, Kokkos::ALL()); + view_stride_2 yr4 = + Kokkos::subview(xr4, 1, Kokkos::ALL(), 2, Kokkos::ALL()); + + ASSERT_EQ(yl4.extent(0), xl4.extent(1)); + ASSERT_EQ(yl4.extent(1), xl4.extent(3)); + ASSERT_EQ(yr4.extent(0), xr4.extent(1)); + ASSERT_EQ(yr4.extent(1), xr4.extent(3)); + + 
ASSERT_EQ(&yl4(4, 4) - &xl4(1, 4, 2, 4), 0); + ASSERT_EQ(&yr4(4, 4) - &xr4(1, 4, 2, 4), 0); + } + + static void run_test_vector() { + static const unsigned Length = 1000, Count = 8; + + using vector_type = Kokkos::View<T *, Kokkos::LayoutLeft, host>; + using multivector_type = Kokkos::View<T **, Kokkos::LayoutLeft, host>; + + using vector_right_type = Kokkos::View<T *, Kokkos::LayoutRight, host>; + using multivector_right_type = + Kokkos::View<T **, Kokkos::LayoutRight, host>; + + using const_vector_right_type = + Kokkos::View<const T *, Kokkos::LayoutRight, host>; + using const_vector_type = Kokkos::View<const T *, Kokkos::LayoutLeft, host>; + using const_multivector_type = + Kokkos::View<const T **, Kokkos::LayoutLeft, host>; + + multivector_type mv = multivector_type("mv", Length, Count); + multivector_right_type mv_right = + multivector_right_type("mv", Length, Count); + + vector_type v1 = Kokkos::subview(mv, Kokkos::ALL(), 0); + vector_type v2 = Kokkos::subview(mv, Kokkos::ALL(), 1); + vector_type v3 = Kokkos::subview(mv, Kokkos::ALL(), 2); + + vector_type rv1 = Kokkos::subview(mv_right, 0, Kokkos::ALL()); + vector_type rv2 = Kokkos::subview(mv_right, 1, Kokkos::ALL()); + vector_type rv3 = Kokkos::subview(mv_right, 2, Kokkos::ALL()); + + multivector_type mv1 = + Kokkos::subview(mv, std::make_pair(1, 998), std::make_pair(2, 5)); + + multivector_right_type mvr1 = + Kokkos::subview(mv_right, std::make_pair(1, 998), std::make_pair(2, 5)); + + const_vector_type cv1 = Kokkos::subview(mv, Kokkos::ALL(), 0); + const_vector_type cv2 = Kokkos::subview(mv, Kokkos::ALL(), 1); + const_vector_type cv3 = Kokkos::subview(mv, Kokkos::ALL(), 2); + + vector_right_type vr1 = Kokkos::subview(mv, Kokkos::ALL(), 0); + vector_right_type vr2 = Kokkos::subview(mv, Kokkos::ALL(), 1); + vector_right_type vr3 = Kokkos::subview(mv, Kokkos::ALL(), 2); + + const_vector_right_type cvr1 = Kokkos::subview(mv, Kokkos::ALL(), 0); + const_vector_right_type cvr2 = Kokkos::subview(mv, Kokkos::ALL(), 
1); + const_vector_right_type cvr3 = Kokkos::subview(mv, Kokkos::ALL(), 2); + + ASSERT_TRUE(&v1[0] == &v1(0)); + ASSERT_TRUE(&v1[0] == &mv(0, 0)); + ASSERT_TRUE(&v2[0] == &mv(0, 1)); + ASSERT_TRUE(&v3[0] == &mv(0, 2)); + + ASSERT_TRUE(&cv1[0] == &mv(0, 0)); + ASSERT_TRUE(&cv2[0] == &mv(0, 1)); + ASSERT_TRUE(&cv3[0] == &mv(0, 2)); + + ASSERT_TRUE(&vr1[0] == &mv(0, 0)); + ASSERT_TRUE(&vr2[0] == &mv(0, 1)); + ASSERT_TRUE(&vr3[0] == &mv(0, 2)); + + ASSERT_TRUE(&cvr1[0] == &mv(0, 0)); + ASSERT_TRUE(&cvr2[0] == &mv(0, 1)); + ASSERT_TRUE(&cvr3[0] == &mv(0, 2)); + + ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2)); + ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3)); + ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4)); + ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2)); + ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3)); + ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4)); + + const_vector_type c_cv1(v1); + typename vector_type::const_type c_cv2(v2); + typename const_vector_type::const_type c_ccv2(v2); + + const_multivector_type cmv(mv); + typename multivector_type::const_type cmvX(cmv); + typename const_multivector_type::const_type ccmvX(cmv); + } + + static void run_test_error() { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same<typename dView1::memory_space, + Kokkos::Experimental::OpenMPTargetSpace>::value) + return; +#endif + auto alloc_size = std::numeric_limits<size_t>::max() - 42; + try { + auto should_always_fail = dView1("hello_world_failure", alloc_size); + } catch (std::runtime_error const &error) { + // TODO once we remove the conversion to std::runtime_error, catch the + // appropriate Kokkos error here + std::string msg = error.what(); + ASSERT_PRED_FORMAT2(::testing::IsSubstring, "hello_world_failure", msg); + ASSERT_PRED_FORMAT2(::testing::IsSubstring, + typename device::memory_space{}.name(), msg); + // Can't figure out how to make assertions either/or, so we'll just use + // an if statement here for now. 
Test failure message will be a bit + // misleading, but developers should figure out what's going on pretty + // quickly. + if (msg.find("is not a valid size") != std::string::npos) { + ASSERT_PRED_FORMAT2(::testing::IsSubstring, "is not a valid size", msg); + } else +#ifdef KOKKOS_ENABLE_SYCL + if (msg.find("insufficient memory") != std::string::npos) +#endif + { + ASSERT_PRED_FORMAT2(::testing::IsSubstring, "insufficient memory", msg); + } + // SYCL cannot tell the reason why a memory allocation failed +#ifdef KOKKOS_ENABLE_SYCL + else { + // Otherwise, there has to be some sort of "unknown error" error + ASSERT_PRED_FORMAT2(::testing::IsSubstring, + "because of an unknown error.", msg); + } +#endif + } + } +}; + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewAPI_a.hpp b/packages/kokkos/core/unit_test/TestViewAPI_a.hpp new file mode 100644 index 0000000000000000000000000000000000000000..048ba51c17001a0a65027922c7132eb239d5b8ea --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewAPI_a.hpp @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestViewAPI.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_api_a) { + TestViewAPI<double, TEST_EXECSPACE>::run_test(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewAPI_b.hpp b/packages/kokkos/core/unit_test/TestViewAPI_b.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ad9069e397138957dd00d6326bee392876f22aec --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewAPI_b.hpp @@ -0,0 +1,55 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestViewAPI.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_api_b) { + TestViewAPI<double, TEST_EXECSPACE>::run_test_view_operator_a(); + TestViewAPI<double, TEST_EXECSPACE>::run_test_mirror(); + TestViewAPI<double, TEST_EXECSPACE>::run_test_scalar(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewAPI_c.hpp b/packages/kokkos/core/unit_test/TestViewAPI_c.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a70792dc623b63bb8aa1a84fec93ca413ffa94a1 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewAPI_c.hpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestViewAPI.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_api_c) { + TestViewAPI<double, TEST_EXECSPACE>::run_test_deep_copy_empty(); + TestViewAPI<double, TEST_EXECSPACE>::run_test_view_operator_b(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewAPI_d.hpp b/packages/kokkos/core/unit_test/TestViewAPI_d.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ecb65804b7900d773e0018f4763a0a3d60aa566b --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewAPI_d.hpp @@ -0,0 +1,61 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestViewAPI.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_api_d) { + TestViewAPI<double, TEST_EXECSPACE>::run_test_const(); + TestViewAPI<double, TEST_EXECSPACE>::run_test_subview(); + TestViewAPI<double, TEST_EXECSPACE>::run_test_subview_strided(); + TestViewAPI<double, TEST_EXECSPACE>::run_test_vector(); + TestViewAPI<double, TEST_EXECSPACE>::run_test_view_operator_c(); +} + +TEST(TEST_CATEGORY, view_allocation_error) { + TestViewAPI<double, TEST_EXECSPACE>::run_test_error(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a5dc6cf29a467bd576bd96bca52f90b3db26324b --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp @@ -0,0 +1,244 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> + +namespace Test { + +TEST(TEST_CATEGORY, view_remap) { + enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 }; + +#ifdef KOKKOS_ENABLE_CUDA +#define EXECSPACE \ + std::conditional<std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value, \ + Kokkos::CudaHostPinnedSpace, TEST_EXECSPACE>::type +#else +#ifdef KOKKOS_ENABLE_HIP +#define EXECSPACE \ + std::conditional< \ + std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value, \ + Kokkos::Experimental::HIPHostPinnedSpace, TEST_EXECSPACE>::type +#else +#if defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_SYCL) +#define EXECSPACE Kokkos::HostSpace +#else +#define EXECSPACE TEST_EXECSPACE +#endif +#endif +#endif + + using output_type = + Kokkos::View<double * [N1][N2][N3], Kokkos::LayoutRight, EXECSPACE>; + + 
using input_type = + Kokkos::View<int* * [N2][N3], Kokkos::LayoutLeft, EXECSPACE>; + + using diff_type = + Kokkos::View<int * [N0][N2][N3], Kokkos::LayoutLeft, EXECSPACE>; + + output_type output("output", N0); + input_type input("input", N0, N1); + diff_type diff("diff", N0); + + Kokkos::fence(); + int value = 0; + + for (size_t i3 = 0; i3 < N3; ++i3) + for (size_t i2 = 0; i2 < N2; ++i2) + for (size_t i1 = 0; i1 < N1; ++i1) + for (size_t i0 = 0; i0 < N0; ++i0) { + input(i0, i1, i2, i3) = ++value; + } + + Kokkos::fence(); + // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape. + Kokkos::deep_copy(output, input); + Kokkos::fence(); + + value = 0; + + for (size_t i3 = 0; i3 < N3; ++i3) + for (size_t i2 = 0; i2 < N2; ++i2) + for (size_t i1 = 0; i1 < N1; ++i1) + for (size_t i0 = 0; i0 < N0; ++i0) { + ++value; + ASSERT_EQ(value, ((int)output(i0, i1, i2, i3))); + } +} + +TEST(TEST_CATEGORY, view_mirror_nonconst) { + Kokkos::View<int*, TEST_EXECSPACE> d_view("d_view", 10); + Kokkos::View<const int*, TEST_EXECSPACE> d_view_const = d_view; + auto h_view = Kokkos::create_mirror(d_view_const); + Kokkos::deep_copy(h_view, d_view_const); + auto h_view2 = Kokkos::create_mirror(Kokkos::HostSpace(), d_view_const); + Kokkos::deep_copy(h_view2, d_view_const); +} + +template <typename DataType, typename... Extents> +void test_left_stride(Extents... extents) { + using view_type = + Kokkos::View<DataType, Kokkos::LayoutLeft, Kokkos::HostSpace>; + view_type view("view", extents...); + size_t expected_stride = 1; + size_t all_strides[view_type::rank + 1]; + view.stride(all_strides); + for (int i = 0; i < view_type::rank; ++i) { + ASSERT_EQ(view.stride(i), expected_stride); + ASSERT_EQ(all_strides[i], expected_stride); + expected_stride *= view.extent(i); + } +} + +template <typename DataType, typename... Extents> +void test_right_stride(Extents... 
extents) { + using view_type = + Kokkos::View<DataType, Kokkos::LayoutRight, Kokkos::HostSpace>; + view_type view("view", extents...); + size_t expected_stride = 1; + size_t all_strides[view_type::rank + 1]; + view.stride(all_strides); + for (int ri = 0; ri < view_type::rank; ++ri) { + auto i = view_type::rank - 1 - ri; + ASSERT_EQ(view.stride(i), expected_stride); + ASSERT_EQ(all_strides[i], expected_stride); + expected_stride *= view.extent(i); + } +} + +template <typename DataType, typename... Extents> +void test_stride(Extents... extents) { + test_right_stride<DataType>(extents...); + test_left_stride<DataType>(extents...); +} + +TEST(TEST_CATEGORY, view_stride_method) { + test_stride<double[3]>(); + test_stride<double*>(3); + test_stride<double[3][7][13]>(); + test_stride<double***>(3, 7, 13); + // factorial(8) = 40320 + test_stride<double[1][2][3][4][5][6][7][8]>(); + test_stride<double********>(1, 2, 3, 4, 5, 6, 7, 8); +} + +inline void test_anonymous_space() { + /* apparently TEST_EXECSPACE is sometimes a memory space. 
*/ + using ExecSpace = TEST_EXECSPACE::execution_space; + int host_array[10]; + Kokkos::View<int[10], Kokkos::AnonymousSpace> host_anon_stat_view(host_array); + Kokkos::View<int*, Kokkos::AnonymousSpace> host_anon_dyn_view(host_array, 10); + Kokkos::View<int*, Kokkos::HostSpace> host_view("host_view", 10); + Kokkos::View<int*, Kokkos::AnonymousSpace> host_anon_assign_view = host_view; + for (int i = 0; i < 10; ++i) { + host_anon_stat_view(i) = host_anon_dyn_view(i) = 142; + host_anon_assign_view(i) = 142; + } + Kokkos::View<int**, Kokkos::LayoutRight, ExecSpace> d_view("d_view", 100, 10); +#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace, int>(0, 100), KOKKOS_LAMBDA(int i) { + int* ptr = &(d_view(i, 0)); + Kokkos::View<int[10], Kokkos::AnonymousSpace> d_anon_stat_view(ptr); + Kokkos::View<int*, Kokkos::AnonymousSpace> d_anon_dyn_view(ptr, 10); + auto sub = Kokkos::subview(d_view, i, Kokkos::ALL()); + Kokkos::View<int*, Kokkos::AnonymousSpace> d_anon_assign_view = sub; + for (int j = 0; j < 10; ++j) { + d_anon_stat_view(j) = 50; + d_anon_assign_view(j) += 50; + d_anon_dyn_view(j) += 42; + } + }); + Kokkos::fence(); +#endif +} + +TEST(TEST_CATEGORY, anonymous_space) { test_anonymous_space(); } + +template <class ExecSpace> +struct TestViewOverloadResolution { + // Overload based on value_type and rank + static int foo(Kokkos::View<const double**, ExecSpace> /*a*/) { return 1; } + static int foo(Kokkos::View<const int**, ExecSpace> /*a*/) { return 2; } + static int foo(Kokkos::View<const double***, ExecSpace> /*a*/) { return 3; } + + // Overload based on compile time dimensions + static int bar(Kokkos::View<double * [3], ExecSpace> /*a*/) { return 4; } + static int bar(Kokkos::View<double * [4], ExecSpace> /*a*/) { return 5; } + + static void test_function_overload() { + Kokkos::View<double**, typename ExecSpace::execution_space::array_layout, + ExecSpace> + a("A", 10, 3); + int data_type_1 = foo(a); + int data_type_3 
= + foo(Kokkos::View<const double**, + typename ExecSpace::execution_space::array_layout, + ExecSpace>(a)); + Kokkos::View<double***, typename ExecSpace::execution_space::array_layout, + ExecSpace> + b("B", 10, 3, 4); + int data_type_2 = foo(b); + Kokkos::View<double * [3], + typename ExecSpace::execution_space::array_layout, ExecSpace> + c(a); + int static_extent = bar(c); + ASSERT_EQ(1, data_type_1); + ASSERT_EQ(3, data_type_2); + ASSERT_EQ(1, data_type_3); + ASSERT_EQ(4, static_extent); + } +}; + +TEST(TEST_CATEGORY, view_overload_resolution) { + TestViewOverloadResolution<TEST_EXECSPACE>::test_function_overload(); +} +} // namespace Test + +#include <TestViewIsAssignable.hpp> diff --git a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e25cb9e39ca6fd4c3cd45ef2b60b404ed82c03e7 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp @@ -0,0 +1,328 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace { + +template <class ViewType> +struct CheckResult { + using value_type = typename ViewType::non_const_value_type; + ViewType v; + value_type value; + CheckResult(ViewType v_, value_type value_) : v(v_), value(value_){}; + KOKKOS_FUNCTION + void operator()(const int i, int& lsum) const { + for (int j = 0; j < static_cast<int>(v.extent(1)); j++) { + if (v.access(i, j) != value) lsum++; + } + } +}; + +template <class ViewType> +bool run_check(ViewType v, typename ViewType::value_type value) { + using exec_space = typename ViewType::memory_space::execution_space; + int errors = 0; + Kokkos::fence(); + Kokkos::parallel_reduce(Kokkos::RangePolicy<exec_space>(0, v.extent(0)), + CheckResult<ViewType>(v, value), errors); + return errors == 0; +} + +} // namespace + 
+TEST(TEST_CATEGORY, view_copy_tests) { + int N = 10000; + int M = 10; + + Kokkos::View<int**, Kokkos::LayoutRight, TEST_EXECSPACE> defaulted; + Kokkos::View<int**, Kokkos::LayoutRight, TEST_EXECSPACE> a("A", N, M); + Kokkos::View<int**, Kokkos::LayoutRight, TEST_EXECSPACE> b("B", N, M); + auto h_a = Kokkos::create_mirror(a); + auto h_b = Kokkos::create_mirror(b); + auto m_a = Kokkos::create_mirror_view(a); + auto s_a = Kokkos::subview(a, Kokkos::ALL, 1); + auto s_b = Kokkos::subview(b, Kokkos::ALL, 1); + auto hs_a = Kokkos::subview(h_a, Kokkos::ALL, 1); + auto hs_b = Kokkos::subview(h_b, Kokkos::ALL, 1); + auto dev = typename TEST_EXECSPACE::execution_space(); + auto host = Kokkos::DefaultHostExecutionSpace(); + + constexpr bool DevExecCanAccessHost = + Kokkos::Impl::SpaceAccessibility<typename TEST_EXECSPACE::execution_space, + Kokkos::HostSpace>::accessible; + + constexpr bool HostExecCanAccessDev = Kokkos::Impl::SpaceAccessibility< + typename Kokkos::HostSpace::execution_space, + typename TEST_EXECSPACE::memory_space>::accessible; + + // Contiguous copies + { Kokkos::deep_copy(defaulted, defaulted); } + { + Kokkos::deep_copy(a, 1); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(a, a); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(m_a, a); + ASSERT_TRUE(run_check(m_a, 1)); + } + { + Kokkos::deep_copy(m_a, 2); + ASSERT_TRUE(run_check(m_a, 2)); + } + { + Kokkos::deep_copy(a, m_a); + ASSERT_TRUE(run_check(a, 2)); + } + { + Kokkos::deep_copy(b, 3); + ASSERT_TRUE(run_check(b, 3)); + } + { + Kokkos::deep_copy(h_a, 4); + ASSERT_TRUE(run_check(h_a, 4)); + } + { + Kokkos::deep_copy(a, b); + ASSERT_TRUE(run_check(a, 3)); + } + { + Kokkos::deep_copy(h_b, h_a); + ASSERT_TRUE(run_check(h_b, 4)); + } + { + Kokkos::deep_copy(h_a, a); + ASSERT_TRUE(run_check(h_a, 3)); + } + { + Kokkos::deep_copy(b, h_b); + ASSERT_TRUE(run_check(b, 4)); + } + // Non contiguous copies + { + Kokkos::deep_copy(s_a, 5); + ASSERT_TRUE(run_check(s_a, 5)); + } + { + 
Kokkos::deep_copy(hs_a, 6); + ASSERT_TRUE(run_check(hs_a, 6)); + } + { + Kokkos::deep_copy(s_b, s_a); + ASSERT_TRUE(run_check(s_b, 5)); + } + { + Kokkos::deep_copy(hs_b, hs_a); + ASSERT_TRUE(run_check(hs_b, 6)); + } + if (DevExecCanAccessHost || HostExecCanAccessDev) { + { + Kokkos::deep_copy(hs_b, s_b); + ASSERT_TRUE(run_check(hs_b, 5)); + } + { + Kokkos::deep_copy(s_a, hs_a); + ASSERT_TRUE(run_check(s_a, 6)); + } + } + + // Contiguous copies + { Kokkos::deep_copy(dev, defaulted, defaulted); } + { + Kokkos::deep_copy(dev, a, 1); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(dev, a, a); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(dev, m_a, a); + ASSERT_TRUE(run_check(m_a, 1)); + } + { + Kokkos::deep_copy(dev, m_a, 2); + ASSERT_TRUE(run_check(m_a, 2)); + } + { + Kokkos::deep_copy(dev, a, m_a); + ASSERT_TRUE(run_check(a, 2)); + } + { + Kokkos::deep_copy(dev, b, 3); + ASSERT_TRUE(run_check(b, 3)); + } + { + Kokkos::deep_copy(dev, h_a, 4); + ASSERT_TRUE(run_check(h_a, 4)); + } + { + Kokkos::deep_copy(dev, a, b); + ASSERT_TRUE(run_check(a, 3)); + } + { + Kokkos::deep_copy(dev, h_b, h_a); + ASSERT_TRUE(run_check(h_b, 4)); + } + { + Kokkos::deep_copy(dev, h_a, a); + ASSERT_TRUE(run_check(h_a, 3)); + } + { + Kokkos::deep_copy(dev, b, h_b); + ASSERT_TRUE(run_check(b, 4)); + } + // Non contiguous copies + { + Kokkos::deep_copy(dev, s_a, 5); + ASSERT_TRUE(run_check(s_a, 5)); + } + { + Kokkos::deep_copy(dev, hs_a, 6); + ASSERT_TRUE(run_check(hs_a, 6)); + } + { + Kokkos::deep_copy(dev, s_b, s_a); + ASSERT_TRUE(run_check(s_b, 5)); + } + { + Kokkos::deep_copy(dev, hs_b, hs_a); + ASSERT_TRUE(run_check(hs_b, 6)); + } + if (DevExecCanAccessHost || HostExecCanAccessDev) { + { + Kokkos::deep_copy(dev, hs_b, s_b); + ASSERT_TRUE(run_check(hs_b, 5)); + } + { + Kokkos::deep_copy(dev, s_a, hs_a); + ASSERT_TRUE(run_check(s_a, 6)); + } + } + + // Contiguous copies + { Kokkos::deep_copy(host, defaulted, defaulted); } + { + Kokkos::deep_copy(host, a, 1); + 
ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(host, a, a); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(host, m_a, a); + ASSERT_TRUE(run_check(m_a, 1)); + } + { + Kokkos::deep_copy(host, m_a, 2); + ASSERT_TRUE(run_check(m_a, 2)); + } + { + Kokkos::deep_copy(host, a, m_a); + ASSERT_TRUE(run_check(a, 2)); + } + { + Kokkos::deep_copy(host, b, 3); + ASSERT_TRUE(run_check(b, 3)); + } + { + Kokkos::deep_copy(host, h_a, 4); + ASSERT_TRUE(run_check(h_a, 4)); + } + { + Kokkos::deep_copy(host, a, b); + ASSERT_TRUE(run_check(a, 3)); + } + { + Kokkos::deep_copy(host, h_b, h_a); + ASSERT_TRUE(run_check(h_b, 4)); + } + { + Kokkos::deep_copy(host, h_a, a); + ASSERT_TRUE(run_check(h_a, 3)); + } + { + Kokkos::deep_copy(host, b, h_b); + ASSERT_TRUE(run_check(b, 4)); + } + // Non contiguous copies + { + Kokkos::deep_copy(host, s_a, 5); + ASSERT_TRUE(run_check(s_a, 5)); + } + { + Kokkos::deep_copy(host, hs_a, 6); + ASSERT_TRUE(run_check(hs_a, 6)); + } + { + Kokkos::deep_copy(host, s_b, s_a); + ASSERT_TRUE(run_check(s_b, 5)); + } + { + Kokkos::deep_copy(host, hs_b, hs_a); + ASSERT_TRUE(run_check(hs_b, 6)); + } + if (DevExecCanAccessHost || HostExecCanAccessDev) { + { + Kokkos::deep_copy(host, hs_b, s_b); + ASSERT_TRUE(run_check(hs_b, 5)); + } + { + Kokkos::deep_copy(host, s_a, hs_a); + ASSERT_TRUE(run_check(s_a, 6)); + } + } +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewCopy_b.hpp b/packages/kokkos/core/unit_test/TestViewCopy_b.hpp new file mode 100644 index 0000000000000000000000000000000000000000..79647caa90a11ed93f13e32aa49226b12ebd8707 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewCopy_b.hpp @@ -0,0 +1,268 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace { + +template <class ViewType> +struct CheckResult { + using value_type = typename ViewType::non_const_value_type; + ViewType v; + value_type value; + CheckResult(ViewType v_, value_type value_) : v(v_), value(value_){}; + KOKKOS_FUNCTION + void operator()(const int i, int& lsum) const { + for (int j = 0; j < static_cast<int>(v.extent(1)); j++) { + if (v.access(i, j) != value) lsum++; + } + } +}; + +template <class ViewType> +bool run_check(ViewType v, typename ViewType::value_type value) { + using exec_space = typename ViewType::memory_space::execution_space; + int errors = 0; + Kokkos::fence(); + Kokkos::parallel_reduce(Kokkos::RangePolicy<exec_space>(0, v.extent(0)), + CheckResult<ViewType>(v, value), errors); + return errors == 0; +} + +} // namespace + +TEST(TEST_CATEGORY, view_copy_tests_rank_0) { + Kokkos::View<int, TEST_EXECSPACE> defaulted; + Kokkos::View<int, TEST_EXECSPACE> a("A"); + Kokkos::View<int, TEST_EXECSPACE> b("B"); + auto h_a = Kokkos::create_mirror(a); + auto h_b = Kokkos::create_mirror(b); + auto m_a = Kokkos::create_mirror_view(a); + auto dev = typename TEST_EXECSPACE::execution_space(); + auto host = Kokkos::DefaultHostExecutionSpace(); + + // No execution space + { Kokkos::deep_copy(defaulted, defaulted); } + { + Kokkos::deep_copy(a, 1); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(a, a); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(m_a, a); + ASSERT_TRUE(run_check(m_a, 1)); + } + { + Kokkos::deep_copy(m_a, 2); + ASSERT_TRUE(run_check(m_a, 2)); + } + { + Kokkos::deep_copy(a, m_a); + ASSERT_TRUE(run_check(a, 2)); + } + { + Kokkos::deep_copy(b, 3); + ASSERT_TRUE(run_check(b, 3)); + } + { + Kokkos::deep_copy(h_a, 4); + ASSERT_TRUE(run_check(h_a, 4)); + } + { + Kokkos::deep_copy(a, 
b); + ASSERT_TRUE(run_check(a, 3)); + } + { + Kokkos::deep_copy(h_b, h_a); + ASSERT_TRUE(run_check(h_b, 4)); + } + { + Kokkos::deep_copy(h_a, a); + ASSERT_TRUE(run_check(h_a, 3)); + } + { + Kokkos::deep_copy(b, h_b); + ASSERT_TRUE(run_check(b, 4)); + } + + // Device + { Kokkos::deep_copy(dev, defaulted, defaulted); } + { + Kokkos::deep_copy(dev, a, 1); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(dev, a, a); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(dev, m_a, a); + ASSERT_TRUE(run_check(m_a, 1)); + } + { + Kokkos::deep_copy(dev, m_a, 2); + ASSERT_TRUE(run_check(m_a, 2)); + } + { + Kokkos::deep_copy(dev, a, m_a); + ASSERT_TRUE(run_check(a, 2)); + } + { + Kokkos::deep_copy(dev, b, 3); + ASSERT_TRUE(run_check(b, 3)); + } + { + Kokkos::deep_copy(dev, h_a, 4); + ASSERT_TRUE(run_check(h_a, 4)); + } + { + Kokkos::deep_copy(dev, a, b); + ASSERT_TRUE(run_check(a, 3)); + } + { + Kokkos::deep_copy(dev, h_b, h_a); + ASSERT_TRUE(run_check(h_b, 4)); + } + { + Kokkos::deep_copy(dev, h_a, a); + ASSERT_TRUE(run_check(h_a, 3)); + } + { + Kokkos::deep_copy(dev, b, h_b); + ASSERT_TRUE(run_check(b, 4)); + } + + // Host + { Kokkos::deep_copy(host, defaulted, defaulted); } + { + Kokkos::deep_copy(host, a, 1); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(host, a, a); + ASSERT_TRUE(run_check(a, 1)); + } + { + Kokkos::deep_copy(host, m_a, a); + ASSERT_TRUE(run_check(m_a, 1)); + } + { + Kokkos::deep_copy(host, m_a, 2); + ASSERT_TRUE(run_check(m_a, 2)); + } + { + Kokkos::deep_copy(host, a, m_a); + ASSERT_TRUE(run_check(a, 2)); + } + { + Kokkos::deep_copy(host, b, 3); + ASSERT_TRUE(run_check(b, 3)); + } + { + Kokkos::deep_copy(host, h_a, 4); + ASSERT_TRUE(run_check(h_a, 4)); + } + { + Kokkos::deep_copy(host, a, b); + ASSERT_TRUE(run_check(a, 3)); + } + { + Kokkos::deep_copy(host, h_b, h_a); + ASSERT_TRUE(run_check(h_b, 4)); + } + { + Kokkos::deep_copy(host, h_a, a); + ASSERT_TRUE(run_check(h_a, 3)); + } + { + Kokkos::deep_copy(host, b, h_b); + 
ASSERT_TRUE(run_check(b, 4)); + } +} + +TEST(TEST_CATEGORY, view_copy_degenerated) { + Kokkos::View<int*, TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + v_um_def_1; + Kokkos::View<int*, TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + v_um_1(reinterpret_cast<int*>(-1), 0); + Kokkos::View<int*, TEST_EXECSPACE> v_m_def_1; + Kokkos::View<int*, TEST_EXECSPACE> v_m_1("v_m_1", 0); + + Kokkos::View<int*, TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + v_um_def_2; + Kokkos::View<int*, TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + v_um_2(reinterpret_cast<int*>(-1), 0); + Kokkos::View<int*, TEST_EXECSPACE> v_m_def_2; + Kokkos::View<int*, TEST_EXECSPACE> v_m_2("v_m_2", 0); + + Kokkos::deep_copy(v_um_def_1, v_um_def_2); + Kokkos::deep_copy(v_um_def_1, v_um_2); + Kokkos::deep_copy(v_um_def_1, v_m_def_2); + Kokkos::deep_copy(v_um_def_1, v_m_2); + + Kokkos::deep_copy(v_um_1, v_um_def_2); + Kokkos::deep_copy(v_um_1, v_um_2); + Kokkos::deep_copy(v_um_1, v_m_def_2); + Kokkos::deep_copy(v_um_1, v_m_2); + + Kokkos::deep_copy(v_m_def_1, v_um_def_2); + Kokkos::deep_copy(v_m_def_1, v_um_2); + Kokkos::deep_copy(v_m_def_1, v_m_def_2); + Kokkos::deep_copy(v_m_def_1, v_m_2); + + Kokkos::deep_copy(v_m_1, v_um_def_2); + Kokkos::deep_copy(v_m_1, v_um_2); + Kokkos::deep_copy(v_m_1, v_m_def_2); + Kokkos::deep_copy(v_m_1, v_m_2); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp b/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d6b3c33ebe1872c6c740d13e46c46ac7256ad59c --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp @@ -0,0 +1,166 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <type_traits> +#include <typeinfo> + +namespace Test { + +namespace { + +template <typename ExecSpace> +struct TestViewCtorProp_EmbeddedDim { + using ViewIntType = typename Kokkos::View<int**, ExecSpace>; + using ViewDoubleType = typename Kokkos::View<double*, ExecSpace>; + + // Cuda 7.0 has issues with using a lambda in parallel_for to initialize the + // view - replace with this functor + template <class ViewType> + struct Functor { + ViewType v; + + Functor(const ViewType& v_) : v(v_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { v(i) = i; } + }; + + static void test_vcpt(const int N0, const int N1) { + // Create views to test + { + using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType; + using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType; + + VIT vi1("vi1", N0, N1); + VDT vd1("vd1", N0); + + // TEST: Test for common type between two views, one with type double, + // other with type int Deduce common value_type and construct a view with + // that type + { + // Two views + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1); + using CommonViewValueType = + typename decltype(view_alloc_arg)::value_type; + using CVT = typename Kokkos::View<CommonViewValueType*, ExecSpace>; + using HostCVT = typename CVT::HostMirror; + + // Construct View using the common type; for case of specialization, an + // 'embedded_dim' would be stored by view_alloc_arg + CVT cv1(Kokkos::view_alloc("cv1", view_alloc_arg), N0 * N1); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N0 * N1), + Functor<CVT>(cv1)); + + HostCVT hcv1 = Kokkos::create_mirror_view(cv1); + Kokkos::deep_copy(hcv1, cv1); + + ASSERT_EQ((std::is_same<CommonViewValueType, double>::value), true); + ASSERT_EQ( + (std::is_same<typename 
decltype(view_alloc_arg)::scalar_array_type, + CommonViewValueType>::value), + true); +#if 0 + // debug output + for ( int i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + } + + printf( " Common value type view: %s \n", typeid( CVT() ).name() ); + printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); + if ( std::is_same< CommonViewValueType, double >::value == true ) { + printf("Proper common value_type\n"); + } + else { + printf("WRONG common value_type\n"); + } + // end debug output +#endif + } + + { + // Single view + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1); + using CommonViewValueType = + typename decltype(view_alloc_arg)::value_type; + using CVT = typename Kokkos::View<CommonViewValueType*, ExecSpace>; + using HostCVT = typename CVT::HostMirror; + + // Construct View using the common type; for case of specialization, an + // 'embedded_dim' would be stored by view_alloc_arg + CVT cv1(Kokkos::view_alloc("cv1", view_alloc_arg), N0 * N1); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N0 * N1), + Functor<CVT>(cv1)); + + HostCVT hcv1 = Kokkos::create_mirror_view(cv1); + Kokkos::deep_copy(hcv1, cv1); + + ASSERT_EQ((std::is_same<CommonViewValueType, int>::value), true); + } + } + + } // end test_vcpt + +}; // end struct + +} // namespace + +TEST(TEST_CATEGORY, viewctorprop_embedded_dim) { + TestViewCtorProp_EmbeddedDim<TEST_EXECSPACE>::test_vcpt(2, 3); +} + +TEST(TEST_CATEGORY, + viewctorpop_view_allocate_without_initializing_backward_compatility) { + using deprecated_view_alloc = Kokkos::ViewAllocateWithoutInitializing; + Kokkos::View<int**, TEST_EXECSPACE> v(deprecated_view_alloc("v"), 5, 7); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp b/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fcf9f75f37a22019e5e5e0713104e2fb7fed30ee --- /dev/null +++ 
#include <Kokkos_Core.hpp>

namespace Test {
namespace Impl {

// Verifies Kokkos' view-assignability metafunctions for one concrete
// (destination, source) View pair.  `test()` compares the compile-time
// answer (is_always_assignable) and the run-time answer (is_assignable)
// against the caller's expectations, and — when assignment is expected to
// succeed — actually performs the assignment to make sure it does not throw.
template <class ViewTypeDst, class ViewTypeSrc>
struct TestAssignability {
  // The ViewMapping specialization that decides assignability for this pair.
  using mapping_type =
      Kokkos::Impl::ViewMapping<typename ViewTypeDst::traits,
                                typename ViewTypeSrc::traits,
                                typename ViewTypeDst::specialize>;

  // Overload selected when the mapping reports the pair assignable:
  // perform the real assignment.
  template <class MappingType>
  static void try_assign(
      ViewTypeDst& dst, ViewTypeSrc& src,
      typename std::enable_if<MappingType::is_assignable>::type* = nullptr) {
    dst = src;
  }

  // Overload selected when the mapping reports the pair NOT assignable.
  // Reaching this overload from a `sometimes == true` expectation is a test
  // failure, surfaced via ASSERT_NO_THROW at the call site.
  template <class MappingType>
  static void try_assign(
      ViewTypeDst&, ViewTypeSrc&,
      typename std::enable_if<!MappingType::is_assignable>::type* = nullptr) {
    Kokkos::Impl::throw_runtime_exception(
        "TestAssignability::try_assign: Unexpected call path");
  }

  // always    - expected value of is_always_assignable (compile-time)
  // sometimes - expected value of is_assignable(dst, src) (run-time)
  // dims...   - run-time extents used to construct the source view
  template <class... Dimensions>
  static void test(bool always, bool sometimes, Dimensions... dims) {
    ViewTypeDst dst;
    ViewTypeSrc src("SRC", dims...);

    bool is_always_assignable =
        Kokkos::is_always_assignable<ViewTypeDst, ViewTypeSrc>::value;
    bool is_assignable = Kokkos::is_assignable(dst, src);

    // Print out if there is an error with typeid so you can just filter the
    // output with c++filt -t to see which assignment causes the error.
    if (is_always_assignable != always || is_assignable != sometimes)
      printf(
          "is_always_assignable: %i (%i), is_assignable: %i (%i) [ %s ] to [ "
          "%s ]\n",
          is_always_assignable ? 1 : 0, always ? 1 : 0, is_assignable ? 1 : 0,
          sometimes ? 1 : 0, typeid(ViewTypeSrc).name(),
          typeid(ViewTypeDst).name());
    if (sometimes) {
      ASSERT_NO_THROW(try_assign<mapping_type>(dst, src));
    }
    ASSERT_EQ(always, is_always_assignable);
    ASSERT_EQ(sometimes, is_assignable);
  }
};

}  // namespace Impl

// Truth table for View-to-View assignability across extents, value types,
// layouts, and memory spaces.  First bool = always assignable (compile time),
// second bool = assignable for these particular run-time extents.
TEST(TEST_CATEGORY, view_is_assignable) {
  using namespace Kokkos;
  // NOTE(review): despite the "_exec" suffix these aliases are MEMORY spaces
  // (the memory_space of the host / device execution space).
  using h_exec = typename DefaultHostExecutionSpace::memory_space;
  using d_exec = typename TEST_EXECSPACE::memory_space;
  using left   = LayoutLeft;
  using right  = LayoutRight;
  using stride = LayoutStride;
  // Static/Dynamic Extents
  Impl::TestAssignability<View<int*, left, d_exec>,
                          View<int*, left, d_exec>>::test(true, true, 10);
  // dynamic -> static extent: only assignable when run-time extents match
  Impl::TestAssignability<View<int[10], left, d_exec>,
                          View<int*, left, d_exec>>::test(false, true, 10);
  Impl::TestAssignability<View<int[5], left, d_exec>,
                          View<int*, left, d_exec>>::test(false, false, 10);
  // static -> dynamic extent: always fine
  Impl::TestAssignability<View<int*, left, d_exec>,
                          View<int[10], left, d_exec>>::test(true, true);
  Impl::TestAssignability<View<int[10], left, d_exec>,
                          View<int[10], left, d_exec>>::test(true, true);
  Impl::TestAssignability<View<int[5], left, d_exec>,
                          View<int[10], left, d_exec>>::test(false, false);
  Impl::TestAssignability<View<int**, left, d_exec>,
                          View<int**, left, d_exec>>::test(true, true, 10, 10);
  Impl::TestAssignability<View<int * [10], left, d_exec>,
                          View<int**, left, d_exec>>::test(false, true, 10, 10);
  Impl::TestAssignability<View<int * [5], left, d_exec>,
                          View<int**, left, d_exec>>::test(false, false, 10,
                                                           10);
  Impl::TestAssignability<View<int**, left, d_exec>,
                          View<int * [10], left, d_exec>>::test(true, true, 10);
  Impl::TestAssignability<View<int * [10], left, d_exec>,
                          View<int * [10], left, d_exec>>::test(true, true, 10);
  Impl::TestAssignability<View<int * [5], left, d_exec>,
                          View<int * [10], left, d_exec>>::test(false, false,
                                                                10);

  // Mismatch value_type
  Impl::TestAssignability<View<int*, left, d_exec>,
                          View<double*, left, d_exec>>::test(false, false, 10);

  // Layout assignment
  // rank-1 left/right are layout-identical, so this is always assignable
  Impl::TestAssignability<View<int*, left, d_exec>,
                          View<int*, right, d_exec>>::test(true, true, 10);

  // This could be made possible (due to the degenerate nature of the views) but
  // we do not allow this yet
  // TestAssignability<View<int**,left,d_exec>,View<int**,right,d_exec>>::test(false,true,10,1);
  Impl::TestAssignability<View<int**, left, d_exec>,
                          View<int**, right, d_exec>>::test(false, false, 10,
                                                            2);
  // LayoutStride destinations accept either concrete layout
  Impl::TestAssignability<View<int**, stride, d_exec>,
                          View<int**, right, d_exec>>::test(true, true, 10, 2);
  Impl::TestAssignability<View<int**, stride, d_exec>,
                          View<int**, left, d_exec>>::test(true, true, 10, 2);

  // Space Assignment
  // Whether host/device views are mutually assignable depends on the backend
  // (e.g. UVM / host-accessible device memory), so derive the expectation
  // from the MemorySpaceAccess trait rather than hard-coding it.
  bool expected = Kokkos::Impl::MemorySpaceAccess<d_exec, h_exec>::assignable;
  Impl::TestAssignability<View<int*, left, d_exec>,
                          View<int*, left, h_exec>>::test(expected, expected,
                                                          10);
  expected = Kokkos::Impl::MemorySpaceAccess<h_exec, d_exec>::assignable;
  Impl::TestAssignability<View<int*, left, h_exec>,
                          View<int*, left, d_exec>>::test(expected, expected,
                                                          10);

  // reference type and const-qualified types
  using SomeViewType = View<int*, left, d_exec>;
#if defined(KOKKOS_ENABLE_CXX17)
  static_assert(is_always_assignable_v<SomeViewType, SomeViewType>);
  static_assert(is_always_assignable_v<SomeViewType, SomeViewType&>);
  static_assert(is_always_assignable_v<SomeViewType, SomeViewType const>);
  static_assert(is_always_assignable_v<SomeViewType, SomeViewType const&>);
  static_assert(is_always_assignable_v<SomeViewType&, SomeViewType>);
  static_assert(is_always_assignable_v<SomeViewType&, SomeViewType&>);
  static_assert(is_always_assignable_v<SomeViewType&, SomeViewType const>);
  static_assert(is_always_assignable_v<SomeViewType&, SomeViewType const&>);
#else
  static_assert(is_always_assignable<SomeViewType, SomeViewType>::value, "");
  static_assert(is_always_assignable<SomeViewType, SomeViewType&>::value, "");
  static_assert(is_always_assignable<SomeViewType, SomeViewType const>::value,
                "");
  static_assert(is_always_assignable<SomeViewType, SomeViewType const&>::value,
                "");
  static_assert(is_always_assignable<SomeViewType&, SomeViewType>::value, "");
  static_assert(is_always_assignable<SomeViewType&, SomeViewType&>::value, "");
  static_assert(is_always_assignable<SomeViewType&, SomeViewType const>::value,
                "");
  static_assert(is_always_assignable<SomeViewType&, SomeViewType const&>::value,
                "");
#endif
}
}  // namespace Test
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> +#include <time.h> + +#include <Kokkos_Core.hpp> + +namespace Test { + +TEST(TEST_CATEGORY, view_layoutstride_left_to_layoutleft_assignment) { + using exec_space = TEST_EXECSPACE; + + auto t = time(nullptr); + srand(t); // Use current time as seed for random generator + printf("view_layoutstride_left_to_layoutleft_assignment: srand(%lu)\n", + static_cast<unsigned long>(t)); + + { // Assignment of rank-1 LayoutLeft = LayoutStride + int ndims = 1; + int dims[] = {10}; + int order[] = {0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*, Kokkos::LayoutStride, exec_space> src("LayoutStride", + layout); + + Kokkos::View<double*, Kokkos::LayoutStride, exec_space>::HostMirror h_src = + Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double*, Kokkos::LayoutLeft, exec_space> 
dst = src; + + Kokkos::View<double*, Kokkos::LayoutLeft, exec_space>::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-2 LayoutLeft = LayoutStride + int ndims = 2; + int dims[] = {10, 9}; + int order[] = {0, 1}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double**, Kokkos::LayoutStride, exec_space> src("LayoutStride", + layout); + + Kokkos::View<double**, Kokkos::LayoutStride, exec_space>::HostMirror h_src = + Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double**, Kokkos::LayoutLeft, exec_space> dst = src; + + Kokkos::View<double**, Kokkos::LayoutLeft, exec_space>::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-3 LayoutLeft = LayoutStride + int ndims = 3; + int dims[] = {10, 9, 8}; + int order[] = {0, 1, 2}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double***, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double***, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double***, Kokkos::LayoutLeft, exec_space> dst = src; + + 
Kokkos::View<double***, Kokkos::LayoutLeft, exec_space>::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-4 LayoutLeft = LayoutStride + int ndims = 4; + int dims[] = {10, 9, 8, 7}; + int order[] = {0, 1, 2, 3}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double****, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double****, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double****, Kokkos::LayoutLeft, exec_space> dst = src; + + Kokkos::View<double****, Kokkos::LayoutLeft, exec_space>::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-5 LayoutLeft = LayoutStride + int ndims = 5; + int dims[] = {10, 9, 8, 7, 6}; + int order[] = {0, 1, 2, 3, 4}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*****, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double*****, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double*****, Kokkos::LayoutLeft, exec_space> 
dst = src; + + Kokkos::View<double*****, Kokkos::LayoutLeft, exec_space>::HostMirror + h_dst = Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-6 LayoutLeft = LayoutStride + int ndims = 6; + int dims[] = {10, 9, 8, 7, 6, 5}; + int order[] = {0, 1, 2, 3, 4, 5}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double******, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double******, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double******, Kokkos::LayoutLeft, exec_space> dst = src; + + Kokkos::View<double******, Kokkos::LayoutLeft, exec_space>::HostMirror + h_dst = Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-7 LayoutLeft = LayoutStride + int ndims = 7; + int dims[] = {10, 9, 8, 7, 6, 5, 4}; + int order[] = {0, 1, 2, 3, 4, 5, 6}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*******, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double*******, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + 
Kokkos::View<double*******, Kokkos::LayoutLeft, exec_space> dst = src; + + Kokkos::View<double*******, Kokkos::LayoutLeft, exec_space>::HostMirror + h_dst = Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-8 LayoutLeft = LayoutStride + int ndims = 8; + int dims[] = {10, 9, 8, 7, 6, 5, 4, 3}; + int order[] = {0, 1, 2, 3, 4, 5, 6, 7}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double********, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double********, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double********, Kokkos::LayoutLeft, exec_space> dst = src; + + Kokkos::View<double********, Kokkos::LayoutLeft, exec_space>::HostMirror + h_dst = Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } +} + +TEST(TEST_CATEGORY, view_layoutstride_right_to_layoutright_assignment) { + using exec_space = TEST_EXECSPACE; + + auto t = time(nullptr); + srand(t); // Use current time as seed for random generator + printf("view_layoutstride_right_to_layoutright_assignment: srand(%lu)\n", + static_cast<unsigned long>(t)); + + { // Assignment of rank-1 LayoutRight = LayoutStride + int ndims = 1; + int dims[] = {10}; + int order[] = {0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, 
dims); + Kokkos::View<double*, Kokkos::LayoutStride, exec_space> src("LayoutStride", + layout); + + Kokkos::View<double*, Kokkos::LayoutStride, exec_space>::HostMirror h_src = + Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double*, Kokkos::LayoutRight, exec_space> dst = src; + + Kokkos::View<double*, Kokkos::LayoutRight, exec_space>::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-2 LayoutRight = LayoutStride + int ndims = 2; + int dims[] = {10, 9}; + int order[] = {1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double**, Kokkos::LayoutStride, exec_space> src("LayoutStride", + layout); + + Kokkos::View<double**, Kokkos::LayoutStride, exec_space>::HostMirror h_src = + Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double**, Kokkos::LayoutRight, exec_space> dst = src; + + Kokkos::View<double**, Kokkos::LayoutRight, exec_space>::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-3 LayoutRight = LayoutStride + int ndims = 3; + int dims[] = {10, 9, 8}; + int order[] = {2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + 
Kokkos::View<double***, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double***, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double***, Kokkos::LayoutRight, exec_space> dst = src; + + Kokkos::View<double***, Kokkos::LayoutRight, exec_space>::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-4 LayoutRight = LayoutStride + int ndims = 4; + int dims[] = {10, 9, 8, 7}; + int order[] = {3, 2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double****, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double****, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double****, Kokkos::LayoutRight, exec_space> dst = src; + + Kokkos::View<double****, Kokkos::LayoutRight, exec_space>::HostMirror + h_dst = Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-5 LayoutRight = LayoutStride + int ndims = 5; + int dims[] = {10, 9, 8, 7, 6}; + int order[] = {4, 3, 2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, 
order, dims); + Kokkos::View<double*****, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double*****, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double*****, Kokkos::LayoutRight, exec_space> dst = src; + + Kokkos::View<double*****, Kokkos::LayoutRight, exec_space>::HostMirror + h_dst = Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-6 LayoutRight = LayoutStride + int ndims = 6; + int dims[] = {10, 9, 8, 7, 6, 5}; + int order[] = {5, 4, 3, 2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double******, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double******, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double******, Kokkos::LayoutRight, exec_space> dst = src; + + Kokkos::View<double******, Kokkos::LayoutRight, exec_space>::HostMirror + h_dst = Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-7 LayoutRight = LayoutStride + int ndims = 7; + int dims[] = {10, 9, 8, 7, 6, 5, 4}; + int order[] = {6, 5, 4, 3, 2, 1, 0}; + Kokkos::LayoutStride 
layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*******, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double*******, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double*******, Kokkos::LayoutRight, exec_space> dst = src; + + Kokkos::View<double*******, Kokkos::LayoutRight, exec_space>::HostMirror + h_dst = Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } + { // Assignment of rank-8 LayoutRight = LayoutStride + int ndims = 8; + int dims[] = {10, 9, 8, 7, 6, 5, 4, 3}; + int order[] = {7, 6, 5, 4, 3, 2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double********, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double********, Kokkos::LayoutStride, exec_space>::HostMirror + h_src = Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double********, Kokkos::LayoutRight, exec_space> dst = src; + + Kokkos::View<double********, Kokkos::LayoutRight, exec_space>::HostMirror + h_dst = Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } +} + +TEST(TEST_CATEGORY_DEATH, view_layoutstride_right_to_layoutleft_assignment) { + using 
exec_space = TEST_EXECSPACE; + + auto t = time(nullptr); + srand(t); // Use current time as seed for random generator + printf("view_layoutstride_right_to_layoutleft_assignment: srand(%lu)\n", + static_cast<unsigned long>(t)); + + { // Assignment of rank-1 LayoutLeft = LayoutStride (LayoutRight compatible) + int ndims = 1; + int dims[] = {10}; + int order[] = {0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*, Kokkos::LayoutStride, exec_space> src("LayoutStride", + layout); + + Kokkos::View<double*, Kokkos::LayoutStride, exec_space>::HostMirror h_src = + Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double*, Kokkos::LayoutLeft, exec_space> dst; + + dst = src; + + Kokkos::View<double*, Kokkos::LayoutLeft, exec_space>::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } +// WORKAROUND OPENMPTARGET : death tests don't seem to work ... 
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) + return; +#endif + { // Assignment of rank-2 LayoutLeft = LayoutStride (LayoutRight compatible) + int ndims = 2; + int dims[] = {10, 9}; + int order[] = {1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double**, Kokkos::LayoutStride, exec_space> src("LayoutStride", + layout); + + Kokkos::View<double**, Kokkos::LayoutLeft, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-3 LayoutLeft = LayoutStride (LayoutRight compatible) + int ndims = 3; + int dims[] = {10, 9, 8}; + int order[] = {2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double***, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double***, Kokkos::LayoutLeft, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-4 LayoutLeft = LayoutStride (LayoutRight compatible) + int ndims = 4; + int dims[] = {10, 9, 8, 7}; + int order[] = {3, 2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double****, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double****, Kokkos::LayoutLeft, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-5 LayoutLeft = LayoutStride (LayoutRight compatible) + int ndims = 5; + int dims[] = {10, 9, 8, 7, 6}; + int order[] = {4, 3, 2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*****, 
Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double*****, Kokkos::LayoutLeft, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-6 LayoutLeft = LayoutStride (LayoutRight compatible) + int ndims = 6; + int dims[] = {10, 9, 8, 7, 6, 5}; + int order[] = {5, 4, 3, 2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double******, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double******, Kokkos::LayoutLeft, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-7 LayoutLeft = LayoutStride (LayoutRight compatible) + int ndims = 7; + int dims[] = {10, 9, 8, 7, 6, 5, 4}; + int order[] = {6, 5, 4, 3, 2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*******, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double*******, Kokkos::LayoutLeft, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-8 LayoutLeft = LayoutStride (LayoutRight compatible) + int ndims = 8; + int dims[] = {10, 9, 8, 7, 6, 5, 4, 3}; + int order[] = {7, 6, 5, 4, 3, 2, 1, 0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double********, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double********, Kokkos::LayoutLeft, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have 
compatible layouts"); + } +} + +TEST(TEST_CATEGORY_DEATH, view_layoutstride_left_to_layoutright_assignment) { + using exec_space = TEST_EXECSPACE; + + auto t = time(nullptr); + srand(t); // Use current time as seed for random generator + printf("view_layoutstride_left_to_layoutright_assignment: srand(%lu)\n", + static_cast<unsigned long>(t)); + + { // Assignment of rank-1 LayoutRight = LayoutStride (LayoutLeft compatible) + int ndims = 1; + int dims[] = {10}; + int order[] = {0}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*, Kokkos::LayoutStride, exec_space> src("LayoutStride", + layout); + + Kokkos::View<double*, Kokkos::LayoutStride, exec_space>::HostMirror h_src = + Kokkos::create_mirror_view(src); + + for (size_t i = 0; i < src.span(); i++) + h_src.data()[i] = (double)rand() / RAND_MAX * (100); + + Kokkos::deep_copy(src, h_src); + + Kokkos::View<double*, Kokkos::LayoutRight, exec_space> dst; + + dst = src; + + Kokkos::View<double*, Kokkos::LayoutRight, exec_space>::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + + Kokkos::deep_copy(h_dst, dst); + + bool test = true; + for (size_t i = 0; i < src.span(); i++) { + if (h_src.data()[i] != h_dst.data()[i]) { + test = false; + break; + } + } + ASSERT_EQ(dst.span(), src.span()); + ASSERT_EQ(test, true); + } +// WORKAROUND OPENMPTARGET : death tests don't seem to work ... 
+#if defined(KOKKOS_ENABLE_OPENMPTARGET) + return; +#endif + { // Assignment of rank-2 LayoutRight = LayoutStride (LayoutLeft compatible) + int ndims = 2; + int dims[] = {10, 9}; + int order[] = {0, 1}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double**, Kokkos::LayoutStride, exec_space> src("LayoutStride", + layout); + + Kokkos::View<double**, Kokkos::LayoutRight, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-3 LayoutRight = LayoutStride (LayoutLeft compatible) + int ndims = 3; + int dims[] = {10, 9, 8}; + int order[] = {0, 1, 2}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double***, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double***, Kokkos::LayoutRight, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-4 LayoutRight = LayoutStride (LayoutLeft compatible) + int ndims = 4; + int dims[] = {10, 9, 8, 7}; + int order[] = {0, 1, 2, 3}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double****, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double****, Kokkos::LayoutRight, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-5 LayoutRight = LayoutStride (LayoutLeft compatible) + int ndims = 5; + int dims[] = {10, 9, 8, 7, 6}; + int order[] = {0, 1, 2, 3, 4}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*****, 
Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double*****, Kokkos::LayoutRight, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-6 LayoutRight = LayoutStride (LayoutLeft compatible) + int ndims = 6; + int dims[] = {10, 9, 8, 7, 6, 5}; + int order[] = {0, 1, 2, 3, 4, 5}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double******, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double******, Kokkos::LayoutRight, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-7 LayoutRight = LayoutStride (LayoutLeft compatible) + int ndims = 7; + int dims[] = {10, 9, 8, 7, 6, 5, 4}; + int order[] = {0, 1, 2, 3, 4, 5, 6}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double*******, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double*******, Kokkos::LayoutRight, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have compatible layouts"); + } + { // Assignment of rank-8 LayoutRight = LayoutStride (LayoutLeft compatible) + int ndims = 8; + int dims[] = {10, 9, 8, 7, 6, 5, 4, 3}; + int order[] = {0, 1, 2, 3, 4, 5, 6, 7}; + Kokkos::LayoutStride layout = + Kokkos::LayoutStride::order_dimensions(ndims, order, dims); + Kokkos::View<double********, Kokkos::LayoutStride, exec_space> src( + "LayoutStride", layout); + + Kokkos::View<double********, Kokkos::LayoutRight, exec_space> dst; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ dst = src; }, + "View assignment must have 
compatible layouts"); + } +} + +} // namespace Test + +#include <TestIrregularLayout.hpp> diff --git a/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp b/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2510a1244664c4b902160faf5f93d022e48ed7aa --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewLayoutTiled.hpp @@ -0,0 +1,1778 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdio> + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_ViewLayoutTiled.hpp> + +#include <type_traits> +#include <typeinfo> + +namespace Test { + +namespace { + +template <typename ExecSpace> +struct TestViewLayoutTiled { + using Scalar = double; + + static constexpr int T0 = 2; + static constexpr int T1 = 4; + static constexpr int T2 = 4; + static constexpr int T3 = 2; + static constexpr int T4 = 2; + static constexpr int T5 = 2; + static constexpr int T6 = 2; + static constexpr int T7 = 2; + + // Rank 2 + using LayoutLL_2D_2x4 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, + Kokkos::Iterate::Left, T0, T1>; + using LayoutRL_2D_2x4 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, + Kokkos::Iterate::Left, T0, T1>; + using LayoutLR_2D_2x4 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, + Kokkos::Iterate::Right, T0, T1>; + using LayoutRR_2D_2x4 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, + Kokkos::Iterate::Right, T0, T1>; + + // Rank 3 + using LayoutLL_3D_2x4x4 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, + Kokkos::Iterate::Left, T0, T1, T2>; + using LayoutRL_3D_2x4x4 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, + Kokkos::Iterate::Left, T0, T1, T2>; + 
using LayoutLR_3D_2x4x4 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, + Kokkos::Iterate::Right, T0, T1, T2>; + using LayoutRR_3D_2x4x4 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, + Kokkos::Iterate::Right, T0, T1, T2>; + + // Rank 4 + using LayoutLL_4D_2x4x4x2 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, + Kokkos::Iterate::Left, T0, T1, T2, T3>; + using LayoutRL_4D_2x4x4x2 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, + Kokkos::Iterate::Left, T0, T1, T2, T3>; + using LayoutLR_4D_2x4x4x2 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, + Kokkos::Iterate::Right, T0, T1, T2, T3>; + using LayoutRR_4D_2x4x4x2 = + Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, + Kokkos::Iterate::Right, T0, T1, T2, T3>; + +#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + static void test_view_layout_tiled_2d(const int, const int) { +#else + static void test_view_layout_tiled_2d(const int N0, const int N1) { + const int FT = T0 * T1; + + const int NT0 = int(std::ceil(N0 / T0)); + const int NT1 = int(std::ceil(N1 / T1)); + // Test create_mirror_view, deep_copy + // Create LL View + { + using ViewType = + typename Kokkos::View<Scalar**, LayoutLL_2D_2x4, ExecSpace>; + ViewType v("v", N0, N1); + + typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); + + // Initialize host-view + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + hv(ti * T0 + i, tj * T1 + j) = + (ti + tj * NT0) * FT + (i + j * T0); + } + } + } + } + + // copy to device + Kokkos::deep_copy(v, hv); + + Kokkos::MDRangePolicy< + Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, + ExecSpace> + mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 2 LL", mdrangepolicy, + KOKKOS_LAMBDA(const int ti, const int tj) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { 
+ if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { + v(ti * T0 + i, tj * T1 + j) += 1; + } + } + } + }); + + Kokkos::deep_copy(hv, v); + + long counter_subview = 0; + long counter_inc = 0; + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(hv, ti, tj); + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { + ++counter_subview; + } + if (tile_subview(i, j) != + ((ti + tj * NT0) * FT + (i + j * T0) + 1)) { + ++counter_inc; + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } + + // Create RL View + { + using ViewType = + typename Kokkos::View<Scalar**, LayoutRL_2D_2x4, ExecSpace>; + Kokkos::View<Scalar**, LayoutRL_2D_2x4, ExecSpace> v("v", N0, N1); + + typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); + + // Initialize host-view + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + hv(ti * T0 + i, tj * T1 + j) = + (ti * NT1 + tj) * FT + (i + j * T0); + } + } + } + } + + // copy to device + Kokkos::deep_copy(v, hv); + + Kokkos::MDRangePolicy< + Kokkos::Rank<2, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, + ExecSpace> + mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 2 RL", mdrangepolicy, + KOKKOS_LAMBDA(const int ti, const int tj) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { + v(ti * T0 + i, tj * T1 + j) += 1; + } + } + } + }); + + Kokkos::deep_copy(hv, v); + + long counter_subview = 0; + long counter_inc = 0; + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + auto tile_subview = Kokkos::tile_subview(hv, ti, tj); + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j) != hv(ti * T0 + i, tj * 
T1 + j)) { + ++counter_subview; + } + if (tile_subview(i, j) != + ((ti * NT1 + tj) * FT + (i + j * T0) + 1)) { + ++counter_inc; + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope + + // Create LR View + { + using ViewType = + typename Kokkos::View<Scalar**, LayoutLR_2D_2x4, ExecSpace>; + Kokkos::View<Scalar**, LayoutLR_2D_2x4, ExecSpace> v("v", N0, N1); + + typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); + + // Initialize host-view + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + hv(ti * T0 + i, tj * T1 + j) = + (ti + tj * NT0) * FT + (i * T1 + j); + } + } + } + } + + // copy to device + Kokkos::deep_copy(v, hv); + + Kokkos::MDRangePolicy< + Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, + ExecSpace> + mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 2 LR", mdrangepolicy, + KOKKOS_LAMBDA(const int ti, const int tj) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { + v(ti * T0 + i, tj * T1 + j) += 1; + } + } + } + }); + + Kokkos::deep_copy(hv, v); + + long counter_subview = 0; + long counter_inc = 0; + + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(hv, ti, tj); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { + ++counter_subview; + } + if (tile_subview(i, j) != + ((ti + tj * NT0) * FT + (i * T1 + j) + 1)) { + ++counter_inc; + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope + + // Create RR View + { + using ViewType = + typename Kokkos::View<Scalar**, LayoutRR_2D_2x4, ExecSpace>; + Kokkos::View<Scalar**, LayoutRR_2D_2x4, ExecSpace> v("v", N0, N1); + + 
typename ViewType::HostMirror hv = Kokkos::create_mirror_view(v); + + // Initialize host-view + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + hv(ti * T0 + i, tj * T1 + j) = + (ti * NT1 + tj) * FT + (i * T1 + j); + } + } + } + } + + // copy to device + Kokkos::deep_copy(v, hv); + + Kokkos::MDRangePolicy< + Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, + ExecSpace> + mdrangepolicy({0, 0}, {NT0, NT1}, {T0, T1}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 2 LR", mdrangepolicy, + KOKKOS_LAMBDA(const int ti, const int tj) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if ((ti * T0 + i < N0) && (tj * T1 + j < N1)) { + v(ti * T0 + i, tj * T1 + j) += 1; + } + } + } + }); + + Kokkos::deep_copy(hv, v); + + long counter_subview = 0; + long counter_inc = 0; + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + auto tile_subview = Kokkos::tile_subview(hv, ti, tj); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + if (tile_subview(i, j) != hv(ti * T0 + i, tj * T1 + j)) { + ++counter_subview; + } + if (tile_subview(i, j) != + ((ti * NT1 + tj) * FT + (i * T1 + j) + 1)) { + ++counter_inc; + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope +#endif + } // end test_view_layout_tiled_2d + +#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + static void test_view_layout_tiled_3d(const int, const int, const int) { +#else + static void test_view_layout_tiled_3d(const int N0, const int N1, + const int N2) { + const int FT = T0 * T1 * T2; + + const int NT0 = int(std::ceil(N0 / T0)); + const int NT1 = int(std::ceil(N1 / T1)); + const int NT2 = int(std::ceil(N2 / T2)); + + // Create LL View + { + using ViewType = Kokkos::View<Scalar***, LayoutLL_3D_2x4x4, ExecSpace>; + Kokkos::View<Scalar***, LayoutLL_3D_2x4x4, ExecSpace> dv("dv", N0, N1, + 
N2); + + typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); + + // Initialize on host + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = + (ti + tj * NT0 + tk * N0 * N1) * FT + + (i + j * T0 + k * T0 * T1); + } + } + } + } + } + } + + // copy to device + Kokkos::deep_copy(dv, v); + + Kokkos::MDRangePolicy< + Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, + ExecSpace> + mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 3 LL", mdrangepolicy, + KOKKOS_LAMBDA(const int i, const int j, const int k) { + dv(i, j, k) += 1; + }); + + Kokkos::deep_copy(v, dv); + + long counter_subview = 0; + long counter_inc = 0; + + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j, k) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { + ++counter_subview; + } + if (tile_subview(i, j, k) != + ((ti + tj * NT0 + tk * N0 * N1) * FT + + (i + j * T0 + k * T0 * T1) + 1)) { + ++counter_inc; + } + } + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope + + // Create RL View + { + using ViewType = Kokkos::View<Scalar***, LayoutRL_3D_2x4x4, ExecSpace>; + Kokkos::View<Scalar***, LayoutRL_3D_2x4x4, ExecSpace> dv("dv", N0, N1, + N2); + + typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); + + // Initialize on host + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 
0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = + (ti * NT1 * NT2 + tj * NT2 + tk) * FT + + (i + j * T0 + k * T0 * T1); + } + } + } + } + } + } + + // copy to device + Kokkos::deep_copy(dv, v); + + Kokkos::MDRangePolicy< + Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, + ExecSpace> + mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 3 RL", mdrangepolicy, + KOKKOS_LAMBDA(const int i, const int j, const int k) { + dv(i, j, k) += 1; + }); + + Kokkos::deep_copy(v, dv); + + long counter_subview = 0; + long counter_inc = 0; + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j, k) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { + ++counter_subview; + } + if (tile_subview(i, j, k) != + ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + + (i + j * T0 + k * T0 * T1) + 1)) { + ++counter_inc; + } + } + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope + + // Create LR View + { + using ViewType = Kokkos::View<Scalar***, LayoutLR_3D_2x4x4, ExecSpace>; + Kokkos::View<Scalar***, LayoutLR_3D_2x4x4, ExecSpace> dv("dv", N0, N1, + N2); + + typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); + + // Initialize on host + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = + (ti + tj * NT0 + tk * NT0 * NT1) * FT + + (i * T1 * T2 + j * T2 + k); + } + } + } + } + } + } + + // copy to device + Kokkos::deep_copy(dv, v); + + Kokkos::MDRangePolicy< + Kokkos::Rank<3, Kokkos::Iterate::Left, 
Kokkos::Iterate::Right>, + ExecSpace> + mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 3 LR", mdrangepolicy, + KOKKOS_LAMBDA(const int i, const int j, const int k) { + dv(i, j, k) += 1; + }); + + Kokkos::deep_copy(v, dv); + + long counter_subview = 0; + long counter_inc = 0; + + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + if (tile_subview(i, j, k) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { + ++counter_subview; + } + if (tile_subview(i, j, k) != + ((ti + tj * NT0 + tk * NT0 * NT1) * FT + + (i * T1 * T2 + j * T2 + k) + 1)) { + ++counter_inc; + } + } + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope + + // Create RR View + { + using ViewType = Kokkos::View<Scalar***, LayoutRR_3D_2x4x4, ExecSpace>; + Kokkos::View<Scalar***, LayoutRR_3D_2x4x4, ExecSpace> dv("dv", N0, N1, + N2); + + typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); + + // Initialize on host + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = + (ti * NT1 * NT2 + tj * NT2 + tk) * FT + + (i * T1 * T2 + j * T2 + k); + } + } + } + } + } + } + + // copy to device + Kokkos::deep_copy(dv, v); + + Kokkos::MDRangePolicy< + Kokkos::Rank<3, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, + ExecSpace> + mdrangepolicy({0, 0, 0}, {N0, N1, N2}, {T0, T1, T2}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 3 RR", mdrangepolicy, + KOKKOS_LAMBDA(const int i, const int j, const int k) { + dv(i, j, k) += 1; + }); + + 
Kokkos::deep_copy(v, dv); + + long counter_subview = 0; + long counter_inc = 0; + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + if (tile_subview(i, j, k) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { + ++counter_subview; + } + if (tile_subview(i, j, k) != + ((ti * NT1 * NT2 + tj * NT2 + tk) * FT + + (i * T1 * T2 + j * T2 + k) + 1)) { + ++counter_inc; + } + } + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope +#endif + } // end test_view_layout_tiled_3d + +#if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + static void test_view_layout_tiled_4d(const int, const int, const int, + const int){ +#else + static void test_view_layout_tiled_4d(const int N0, const int N1, + const int N2, const int N3) { + const int FT = T0 * T1 * T2 * T3; + + const int NT0 = int(std::ceil(N0 / T0)); + const int NT1 = int(std::ceil(N1 / T1)); + const int NT2 = int(std::ceil(N2 / T2)); + const int NT3 = int(std::ceil(N3 / T3)); + + // Create LL View + { + using ViewType = Kokkos::View<Scalar****, LayoutLL_4D_2x4x4x2, ExecSpace>; + Kokkos::View<Scalar****, LayoutLL_4D_2x4x4x2, ExecSpace> dv("dv", N0, N1, + N2, N3); + + typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); + + // Initialize on host + for (int tl = 0; tl < NT3; ++tl) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int l = 0; l < T3; ++l) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = + (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * + FT + + (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); + } + } + } + } + } + } + } + } + + // copy to device + 
Kokkos::deep_copy(dv, v); + + Kokkos::MDRangePolicy< + Kokkos::Rank<4, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, + ExecSpace> + mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 4 LL", mdrangepolicy, + KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { + dv(i, j, k, l) += 1; + }); + + Kokkos::deep_copy(v, dv); + + long counter_subview = 0; + long counter_inc = 0; + + for (int tl = 0; tl < NT3; ++tl) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); + for (int l = 0; l < T3; ++l) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j, k, l) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l)) { + ++counter_subview; + } + if (tile_subview(i, j, k, l) != + ((ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * + FT + + (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { + ++counter_inc; + } + } + } + } + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope + + // Create RL View + { + using ViewType = Kokkos::View<Scalar****, LayoutRL_4D_2x4x4x2, ExecSpace>; + Kokkos::View<Scalar****, LayoutRL_4D_2x4x4x2, ExecSpace> dv("dv", N0, N1, + N2, N3); + + typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); + + // Initialize on host + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tl = 0; tl < NT3; ++tl) { + for (int l = 0; l < T3; ++l) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = + (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * + FT + + (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); + } + } + } + } + } + } + } + } + 
+ // copy to device + Kokkos::deep_copy(dv, v); + + Kokkos::MDRangePolicy< + Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Left>, + ExecSpace> + mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 4 RL", mdrangepolicy, + KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { + dv(i, j, k, l) += 1; + }); + + Kokkos::deep_copy(v, dv); + + long counter_subview = 0; + long counter_inc = 0; + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tl = 0; tl < NT3; ++tl) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); + for (int l = 0; l < T3; ++l) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j, k, l) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l)) { + ++counter_subview; + } + if (tile_subview(i, j, k, l) != + ((ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + + tl) * + FT + + (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + 1)) { + ++counter_inc; + } + } + } + } + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope + + // Create LR View + { + using ViewType = Kokkos::View<Scalar****, LayoutLR_4D_2x4x4x2, ExecSpace>; + Kokkos::View<Scalar****, LayoutLR_4D_2x4x4x2, ExecSpace> dv("dv", N0, N1, + N2, N3); + + typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); + + // Initialize on host + for (int tl = 0; tl < NT3; ++tl) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + for (int l = 0; l < T3; ++l) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = + (ti + tj * NT0 + tk * NT0 * NT1 + + tl * NT0 * NT1 * NT2) * + FT + + (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); + 
} + } + } + } + } + } + } + } + + // copy to device + Kokkos::deep_copy(dv, v); + + Kokkos::MDRangePolicy< + Kokkos::Rank<4, Kokkos::Iterate::Left, Kokkos::Iterate::Right>, + ExecSpace> + mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 4 LR", mdrangepolicy, + KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { + dv(i, j, k, l) += 1; + }); + + Kokkos::deep_copy(v, dv); + + long counter_subview = 0; + long counter_inc = 0; + + for (int tl = 0; tl < NT3; ++tl) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + for (int l = 0; l < T3; ++l) { + if (tile_subview(i, j, k, l) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l)) { + ++counter_subview; + } + if (tile_subview(i, j, k, l) != + ((ti + tj * NT0 + tk * NT0 * NT1 + + tl * NT0 * NT1 * NT2) * + FT + + (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { + ++counter_inc; + } + } + } + } + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope + + // Create RR View + { + using ViewType = Kokkos::View<Scalar****, LayoutRR_4D_2x4x4x2, ExecSpace>; + Kokkos::View<Scalar****, LayoutRR_4D_2x4x4x2, ExecSpace> dv("dv", N0, N1, + N2, N3); + + typename ViewType::HostMirror v = Kokkos::create_mirror_view(dv); + + // Initialize on host + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tl = 0; tl < NT3; ++tl) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + for (int l = 0; l < T3; ++l) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = + (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + + tl) * + FT + + (i * T1 * T2 
* T3 + j * T2 * T3 + k * T3 + l); + } + } + } + } + } + } + } + } + + // copy to device + Kokkos::deep_copy(dv, v); + + Kokkos::MDRangePolicy< + Kokkos::Rank<4, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, + ExecSpace> + mdrangepolicy({0, 0, 0, 0}, {N0, N1, N2, N3}, {T0, T1, T2, T3}); + + // iterate by tile + Kokkos::parallel_for( + "ViewTile rank 4 RR", mdrangepolicy, + KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { + dv(i, j, k, l) += 1; + }); + + Kokkos::deep_copy(v, dv); + + long counter_subview = 0; + long counter_inc = 0; + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tl = 0; tl < NT3; ++tl) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + for (int l = 0; l < T3; ++l) { + if (tile_subview(i, j, k, l) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l)) { + ++counter_subview; + } + if (tile_subview(i, j, k, l) != + ((ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + + tl) * + FT + + (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + 1)) { + ++counter_inc; + } + } + } + } + } + } + } + } + } + ASSERT_EQ(counter_subview, long(0)); + ASSERT_EQ(counter_inc, long(0)); + } // end scope +#endif + } // end test_view_layout_tiled_4d + + static void test_view_layout_tiled_subtile_2d(const int N0, const int N1) { + const int FT = T0 * T1; + + const int NT0 = int(std::ceil(N0 / T0)); + const int NT1 = int(std::ceil(N1 / T1)); + + // Counter to check for errors at the end + long counter[4] = {0}; + + // Create LL View + { + Kokkos::View<Scalar**, LayoutLL_2D_2x4, Kokkos::HostSpace> v("v", N0, N1); + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j) = (ti + tj * NT0) * FT + (i + j * T0); + } + } + } + } + + for (int tj = 0; tj < NT1; 
++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj); + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { + ++counter[0]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j + << std::endl; + std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," + << j << " v = " << v(ti * T0 + i, tj * T1 + j) + << " flat idx = " + << (ti + tj * NT0) * FT + (i + j * T0) << std::endl; + std::cout << "subview_tile output = " << tile_subview(i, j) + << std::endl; +#endif + } + } + } + } + } // end scope + + // Create RL View + { + Kokkos::View<Scalar**, LayoutRL_2D_2x4, Kokkos::HostSpace> v("v", N0, N1); + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i + j * T0); + } + } + } + } + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj); + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { + ++counter[1]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j + << std::endl; + std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," + << j << " v = " << v(ti * T0 + i, tj * T1 + j) + << " flat idx = " + << (ti * NT1 + tj) * FT + (i + j * T0) << std::endl; + std::cout << "subview_tile output = " << tile_subview(i, j) + << std::endl; +#endif + } + } + } + } + } // end scope + + // Create LR View + { + Kokkos::View<Scalar**, LayoutLR_2D_2x4, Kokkos::HostSpace> v("v", N0, N1); + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + v(ti * T0 + i, tj * T1 + j) = (ti + 
tj * NT0) * FT + (i * T1 + j); + } + } + } + } + + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { + ++counter[2]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j + << std::endl; + std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," + << j << " v = " << v(ti * T0 + i, tj * T1 + j) + << " flat idx = " + << (ti + tj * NT0) * FT + (i * T1 + j) << std::endl; + std::cout << "subview_tile output = " << tile_subview(i, j) + << std::endl; +#endif + } + } + } + } + } // end scope + + // Create RR View + { + Kokkos::View<Scalar**, LayoutRR_2D_2x4, Kokkos::HostSpace> v("v", N0, N1); + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + v(ti * T0 + i, tj * T1 + j) = (ti * NT1 + tj) * FT + (i * T1 + j); + } + } + } + } + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + if (tile_subview(i, j) != v(ti * T0 + i, tj * T1 + j)) { + ++counter[3]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1 = " << ti * T0 + i << "," << tj * T1 + j + << std::endl; + std::cout << "ti,tj,i,j: " << ti << "," << tj << "," << i << "," + << j << " v = " << v(ti * T0 + i, tj * T1 + j) + << " flat idx = " + << (ti * NT1 + tj) * FT + (i * T1 + j) << std::endl; + std::cout << "subview_tile output = " << tile_subview(i, j) + << std::endl; + std::cout << "subview tile rank = " << Kokkos::rank(tile_subview) + << std::endl; +#endif + } + } + } + } + } // end scope + +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "subview_tile vs view errors:\n" + << " LL: " << counter[0] 
<< " RL: " << counter[1] + << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; +#endif + + ASSERT_EQ(counter[0], long(0)); + ASSERT_EQ(counter[1], long(0)); + ASSERT_EQ(counter[2], long(0)); + ASSERT_EQ(counter[3], long(0)); + } // end test_view_layout_tiled_subtile_2d + + static void test_view_layout_tiled_subtile_3d(const int N0, const int N1, + const int N2) { + const int FT = T0 * T1 * T2; + + const int NT0 = int(std::ceil(N0 / T0)); + const int NT1 = int(std::ceil(N1 / T1)); + const int NT2 = int(std::ceil(N2 / T2)); + + // Counter to check for errors at the end + long counter[4] = {0}; + // Create LL View + { + Kokkos::View<Scalar***, LayoutLL_3D_2x4x4, Kokkos::HostSpace> v("v", N0, + N1, N2); + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = + (ti + tj * NT0 + tk * N0 * N1) * FT + + (i + j * T0 + k * T0 * T1); + } + } + } + } + } + } + + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j, k) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { + ++counter[0]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," + << tj * T1 + j << "," << tk * T2 + k << std::endl; + std::cout + << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk + << "," << i << "," << j << "," << k + << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) + << " flat idx = " + << (ti + tj * NT0 + tk * N0 * N1) * FT + + (i + j * T0 + k * T0 * T1) + << std::endl; + std::cout << "subview_tile output = " << tile_subview(i, j, k) + << std::endl; + std::cout + << "subview tile rank = " << 
Kokkos::rank(tile_subview) + << std::endl; +#endif + } + } + } + } + } + } + } // end scope + + // Create RL View + { + Kokkos::View<Scalar***, LayoutRL_3D_2x4x4, Kokkos::HostSpace> v("v", N0, + N1, N2); + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = + (ti * NT1 * NT2 + tj * NT2 + tk) * FT + + (i + j * T0 + k * T0 * T1); + } + } + } + } + } + } + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j, k) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { + ++counter[1]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," + << tj * T1 + j << "," << tk * T2 + k << std::endl; + std::cout + << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk + << "," << i << "," << j << "," << k + << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) + << " flat idx = " + << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + + (i + j * T0 + k * T0 * T1) + << std::endl; + std::cout << "subview_tile output = " << tile_subview(i, j, k) + << std::endl; +#endif + } + } + } + } + } + } + } // end scope + + // Create LR View + { + Kokkos::View<Scalar***, LayoutLR_3D_2x4x4, Kokkos::HostSpace> v("v", N0, + N1, N2); + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = + (ti + tj * NT0 + tk * NT0 * NT1) * FT + + (i * T1 * T2 + j * T2 + k); + } + } + } + } + } + } + + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; 
tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + if (tile_subview(i, j, k) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { + ++counter[2]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," + << tj * T1 + j << "," << tk * T2 + k << std::endl; + std::cout + << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << tk + << "," << i << "," << j << "," << k + << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) + << " flat idx = " + << (ti + tj * NT0 + tk * NT0 * NT1) * FT + + (i * T1 * T2 + j * T2 + k) + << std::endl; + std::cout << "subview_tile output = " << tile_subview(i, j, k) + << std::endl; + std::cout + << "subview tile rank = " << Kokkos::rank(tile_subview) + << std::endl; +#endif + } + } + } + } + } + } + } // end scope + + // Create RR View + { + Kokkos::View<Scalar***, LayoutRR_3D_2x4x4, Kokkos::HostSpace> v("v", N0, + N1, N2); + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k) = + (ti * NT1 * NT2 + tj * NT2 + tk) * FT + + (i * T1 * T2 + j * T2 + k); + } + } + } + } + } + } + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + if (tile_subview(i, j, k) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k)) { + ++counter[3]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1,idx2 = " << ti * T0 + i << "," + << tj * T1 + j << "," << tk * T2 + k << std::endl; + std::cout + << "ti,tj,tk,i,j,k: " << ti << "," << tj << "," << 
tk + << "," << i << "," << j << "," << k + << " v = " << v(ti * T0 + i, tj * T1 + j, tk * T2 + k) + << " flat idx = " + << (ti * NT1 * NT2 + tj * NT2 + tk) * FT + + (i * T1 * T2 + j * T2 + k) + << std::endl; + std::cout << "subview_tile output = " << tile_subview(i, j, k) + << std::endl; + std::cout + << "subview tile rank = " << Kokkos::rank(tile_subview) + << std::endl; +#endif + } + } + } + } + } + } + } // end scope + +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "subview_tile vs view errors:\n" + << " LL: " << counter[0] << " RL: " << counter[1] + << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; +#endif + + ASSERT_EQ(counter[0], long(0)); + ASSERT_EQ(counter[1], long(0)); + ASSERT_EQ(counter[2], long(0)); + ASSERT_EQ(counter[3], long(0)); + + } // end test_view_layout_tiled_subtile_3d + + static void test_view_layout_tiled_subtile_4d(const int N0, const int N1, + const int N2, const int N3) { + const int FT = T0 * T1 * T2 * T3; + + const int NT0 = int(std::ceil(N0 / T0)); + const int NT1 = int(std::ceil(N1 / T1)); + const int NT2 = int(std::ceil(N2 / T2)); + const int NT3 = int(std::ceil(N3 / T3)); + + // Counter to check for errors at the end + long counter[4] = {0}; + // Create LL View + { + Kokkos::View<Scalar****, LayoutLL_4D_2x4x4x2, Kokkos::HostSpace> v( + "v", N0, N1, N2, N3); + for (int tl = 0; tl < NT3; ++tl) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int l = 0; l < T3; ++l) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = + (ti + tj * NT0 + tk * N0 * N1 + tl * N0 * N1 * N2) * + FT + + (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); + } + } + } + } + } + } + } + } + + for (int tl = 0; tl < NT3; ++tl) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = 
Kokkos::tile_subview(v, ti, tj, tk, tl); + for (int l = 0; l < T3; ++l) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + if (tile_subview(i, j, k, l) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l)) { + ++counter[0]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i + << "," << tj * T1 + j << "," << tk * T2 + k + << "," << tl * T3 + l << std::endl; + std::cout + << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk + << "," << tl << "," + << " i,j,k,l: " << i << "," << j << "," << k << "," + << l << " v = " + << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l) + << " flat idx = " + << (ti + tj * NT0 + tk * N0 * N1 + + tl * N0 * N1 * N2) * + FT + + (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + << std::endl; + std::cout << "subview_tile output = " + << tile_subview(i, j, k, l) << std::endl; + std::cout << "subview tile rank = " + << Kokkos::rank(tile_subview) << std::endl; +#endif + } + } + } + } + } + } + } + } + } // end scope + + // Create RL View + { + Kokkos::View<Scalar****, LayoutRL_4D_2x4x4x2, Kokkos::HostSpace> v( + "v", N0, N1, N2, N3); + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tl = 0; tl < NT3; ++tl) { + for (int l = 0; l < T3; ++l) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i < T0; ++i) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = + (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + tl) * + FT + + (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2); + } + } + } + } + } + } + } + } + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tl = 0; tl < NT3; ++tl) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); + for (int l = 0; l < T3; ++l) { + for (int k = 0; k < T2; ++k) { + for (int j = 0; j < T1; ++j) { + for (int i = 0; i 
< T0; ++i) { + if (tile_subview(i, j, k, l) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l)) { + ++counter[1]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i + << "," << tj * T1 + j << "," << tk * T2 + k + << "," << tl * T3 + l << std::endl; + std::cout + << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk + << "," << tl << "," + << " i,j,k,l: " << i << "," << j << "," << k << "," + << l << " v = " + << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l) + << " flat idx = " + << (ti * NT1 * NT2 * N3 + tj * NT2 * N3 + tk * N3 + + tl) * FT + + (i + j * T0 + k * T0 * T1 + l * T0 * T1 * T2) + << std::endl; + std::cout << "subview_tile output = " + << tile_subview(i, j, k, l) << std::endl; + std::cout << "subview tile rank = " + << Kokkos::rank(tile_subview) << std::endl; +#endif + } + } + } + } + } + } + } + } + } // end scope + + // Create LR View + { + Kokkos::View<Scalar****, LayoutLR_4D_2x4x4x2, Kokkos::HostSpace> v( + "v", N0, N1, N2, N3); + for (int tl = 0; tl < NT3; ++tl) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + for (int l = 0; l < T3; ++l) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = + (ti + tj * NT0 + tk * NT0 * NT1 + + tl * NT0 * NT1 * NT2) * + FT + + (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); + } + } + } + } + } + } + } + } + + for (int tl = 0; tl < NT3; ++tl) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tj = 0; tj < NT1; ++tj) { + for (int ti = 0; ti < NT0; ++ti) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + for (int l = 0; l < T3; ++l) { + if (tile_subview(i, j, k, l) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l)) { + ++counter[2]; + } +#ifdef 
KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i + << "," << tj * T1 + j << "," << tk * T2 + k + << "," << tl * T3 + l << std::endl; + std::cout + << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk + << "," << tl << "," + << " i,j,k,l: " << i << "," << j << "," << k << "," + << l << " v = " + << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l) + << " flat idx = " + << (ti + tj * NT0 + tk * NT0 * NT1 + + tl * NT0 * NT1 * NT2) * + FT + + (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + << std::endl; + std::cout << "subview_tile output = " + << tile_subview(i, j, k, l) << std::endl; + std::cout << "subview tile rank = " + << Kokkos::rank(tile_subview) << std::endl; +#endif + } + } + } + } + } + } + } + } + } // end scope + + // Create RR View + { + Kokkos::View<Scalar****, LayoutRR_4D_2x4x4x2, Kokkos::HostSpace> v( + "v", N0, N1, N2, N3); + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tl = 0; tl < NT3; ++tl) { + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + for (int l = 0; l < T3; ++l) { + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, tl * T3 + l) = + (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + + tl) * + FT + + (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l); + } + } + } + } + } + } + } + } + + for (int ti = 0; ti < NT0; ++ti) { + for (int tj = 0; tj < NT1; ++tj) { + for (int tk = 0; tk < NT2; ++tk) { + for (int tl = 0; tl < NT3; ++tl) { + auto tile_subview = Kokkos::tile_subview(v, ti, tj, tk, tl); + for (int i = 0; i < T0; ++i) { + for (int j = 0; j < T1; ++j) { + for (int k = 0; k < T2; ++k) { + for (int l = 0; l < T3; ++l) { + if (tile_subview(i, j, k, l) != + v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l)) { + ++counter[3]; + } +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "idx0,idx1,idx2,idx3 = " << ti * T0 + i + << "," << tj * T1 + j << "," << tk * T2 + k + << "," << tl 
* T3 + l << std::endl; + std::cout + << "ti,tj,tk,tl: " << ti << "," << tj << "," << tk + << "," << tl << "," + << " i,j,k,l: " << i << "," << j << "," << k << "," + << l << " v = " + << v(ti * T0 + i, tj * T1 + j, tk * T2 + k, + tl * T3 + l) + << " flat idx = " + << (ti * NT1 * NT2 * NT3 + tj * NT2 * NT3 + tk * NT3 + + tl) * FT + + (i * T1 * T2 * T3 + j * T2 * T3 + k * T3 + l) + << std::endl; + std::cout << "subview_tile output = " + << tile_subview(i, j, k, l) << std::endl; + std::cout << "subview tile rank = " + << Kokkos::rank(tile_subview) << std::endl; +#endif + } + } + } + } + } + } + } + } + } // end scope + +#ifdef KOKKOS_VERBOSE_LAYOUTTILED_OUTPUT + std::cout << "subview_tile vs view errors:\n" + << " LL: " << counter[0] << " RL: " << counter[1] + << " LR: " << counter[2] << " RR: " << counter[3] << std::endl; +#endif + + ASSERT_EQ(counter[0], long(0)); + ASSERT_EQ(counter[1], long(0)); + ASSERT_EQ(counter[2], long(0)); + ASSERT_EQ(counter[3], long(0)); + + } // end test_view_layout_tiled_subtile_4d + +}; // end TestViewLayoutTiled struct + +} // namespace + +TEST(TEST_CATEGORY, view_layouttiled) { + // These two examples are iterating by tile, then within a tile - not by + // extents If N# is not a power of two, but want to iterate by tile then + // within a tile, need to check that mapped index is within extent + TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_2d(4, 12); + TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_3d(4, 12, 16); + TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_4d(4, 12, 16, 12); +} +TEST(TEST_CATEGORY, view_layouttiled_subtile) { + // These two examples are iterating by tile, then within a tile - not by + // extents If N# is not a power of two, but want to iterate by tile then + // within a tile, need to check that mapped index is within extent + TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_subtile_2d(4, 12); + 
TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_subtile_3d(4, 12, + 16); + TestViewLayoutTiled<TEST_EXECSPACE>::test_view_layout_tiled_subtile_4d( + 4, 12, 16, 12); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fdbda099176c79410c1be6599546f09aba3269dc --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp @@ -0,0 +1,1328 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class Space> +void test_view_mapping() { + using ExecSpace = typename Space::execution_space; + + using dim_0 = Kokkos::Impl::ViewDimension<>; + using dim_s2 = Kokkos::Impl::ViewDimension<2>; + using dim_s2_s3 = Kokkos::Impl::ViewDimension<2, 3>; + using dim_s2_s3_s4 = Kokkos::Impl::ViewDimension<2, 3, 4>; + + using dim_s0 = Kokkos::Impl::ViewDimension<0>; + using dim_s0_s3 = Kokkos::Impl::ViewDimension<0, 3>; + using dim_s0_s3_s4 = Kokkos::Impl::ViewDimension<0, 3, 4>; + + using dim_s0_s0 = Kokkos::Impl::ViewDimension<0, 0>; + using dim_s0_s0_s4 = Kokkos::Impl::ViewDimension<0, 0, 4>; + + using dim_s0_s0_s0 = Kokkos::Impl::ViewDimension<0, 0, 0>; + using dim_s0_s0_s0_s0 = Kokkos::Impl::ViewDimension<0, 0, 0, 0>; + using dim_s0_s0_s0_s0_s0 = Kokkos::Impl::ViewDimension<0, 0, 0, 0, 0>; + using dim_s0_s0_s0_s0_s0_s0 = Kokkos::Impl::ViewDimension<0, 0, 0, 0, 0, 0>; + using dim_s0_s0_s0_s0_s0_s0_s0 = + Kokkos::Impl::ViewDimension<0, 0, 0, 0, 0, 0, 0>; + using dim_s0_s0_s0_s0_s0_s0_s0_s0 = + Kokkos::Impl::ViewDimension<0, 0, 0, 0, 0, 0, 0, 0>; + +// Fully static dimensions should not be larger than an int. 
+#ifndef _WIN32 // For some reason on Windows the first test here fails with + // size being 7 bytes on windows??? + ASSERT_LE(sizeof(dim_0), sizeof(int)); + ASSERT_LE(sizeof(dim_s2), sizeof(int)); + ASSERT_LE(sizeof(dim_s2_s3), sizeof(int)); + ASSERT_LE(sizeof(dim_s2_s3_s4), sizeof(int)); + + // Rank 1 is size_t. + ASSERT_EQ(sizeof(dim_s0), sizeof(size_t)); + ASSERT_EQ(sizeof(dim_s0_s3), sizeof(size_t)); + ASSERT_EQ(sizeof(dim_s0_s3_s4), sizeof(size_t)); + + // Allow for padding. + ASSERT_LE(sizeof(dim_s0_s0), 2 * sizeof(size_t)); + ASSERT_LE(sizeof(dim_s0_s0_s4), 2 * sizeof(size_t)); + + ASSERT_LE(sizeof(dim_s0_s0_s0), 4 * sizeof(size_t)); + ASSERT_EQ(sizeof(dim_s0_s0_s0_s0), 4 * sizeof(unsigned)); + ASSERT_LE(sizeof(dim_s0_s0_s0_s0_s0), 6 * sizeof(unsigned)); + ASSERT_EQ(sizeof(dim_s0_s0_s0_s0_s0_s0), 6 * sizeof(unsigned)); + ASSERT_LE(sizeof(dim_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); + ASSERT_EQ(sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); +#endif + static_assert(int(dim_0::rank) == int(0), ""); + static_assert(int(dim_0::rank_dynamic) == int(0), ""); + static_assert(int(dim_0::ArgN0) == 1, ""); + static_assert(int(dim_0::ArgN1) == 1, ""); + static_assert(int(dim_0::ArgN2) == 1, ""); + + static_assert(int(dim_s2::rank) == int(1), ""); + static_assert(int(dim_s2::rank_dynamic) == int(0), ""); + static_assert(int(dim_s2::ArgN0) == 2, ""); + static_assert(int(dim_s2::ArgN1) == 1, ""); + + static_assert(int(dim_s2_s3::rank) == int(2), ""); + static_assert(int(dim_s2_s3::rank_dynamic) == int(0), ""); + static_assert(int(dim_s2_s3::ArgN0) == 2, ""); + static_assert(int(dim_s2_s3::ArgN1) == 3, ""); + static_assert(int(dim_s2_s3::ArgN2) == 1, ""); + + static_assert(int(dim_s2_s3_s4::rank) == int(3), ""); + static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0), ""); + static_assert(int(dim_s2_s3_s4::ArgN0) == 2, ""); + static_assert(int(dim_s2_s3_s4::ArgN1) == 3, ""); + static_assert(int(dim_s2_s3_s4::ArgN2) == 4, ""); + 
static_assert(int(dim_s2_s3_s4::ArgN3) == 1, ""); + + static_assert(int(dim_s0::rank) == int(1), ""); + static_assert(int(dim_s0::rank_dynamic) == int(1), ""); + + static_assert(int(dim_s0_s3::rank) == int(2), ""); + static_assert(int(dim_s0_s3::rank_dynamic) == int(1), ""); + static_assert(int(dim_s0_s3::ArgN0) == 0, ""); + static_assert(int(dim_s0_s3::ArgN1) == 3, ""); + + static_assert(int(dim_s0_s3_s4::rank) == int(3), ""); + static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1), ""); + static_assert(int(dim_s0_s3_s4::ArgN0) == 0, ""); + static_assert(int(dim_s0_s3_s4::ArgN1) == 3, ""); + static_assert(int(dim_s0_s3_s4::ArgN2) == 4, ""); + + static_assert(int(dim_s0_s0_s4::rank) == int(3), ""); + static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2), ""); + static_assert(int(dim_s0_s0_s4::ArgN0) == 0, ""); + static_assert(int(dim_s0_s0_s4::ArgN1) == 0, ""); + static_assert(int(dim_s0_s0_s4::ArgN2) == 4, ""); + + static_assert(int(dim_s0_s0_s0::rank) == int(3), ""); + static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3), ""); + + static_assert(int(dim_s0_s0_s0_s0::rank) == int(4), ""); + static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4), ""); + + static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5), ""); + static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5), ""); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6), ""); + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6), ""); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7), ""); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7), ""); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8), ""); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8), ""); + + dim_s0 d1(2, 3, 4, 5, 6, 7, 8, 9); + dim_s0_s0 d2(2, 3, 4, 5, 6, 7, 8, 9); + dim_s0_s0_s0 d3(2, 3, 4, 5, 6, 7, 8, 9); + dim_s0_s0_s0_s0 d4(2, 3, 4, 5, 6, 7, 8, 9); + + ASSERT_EQ(d1.N0, 2); + ASSERT_EQ(d2.N0, 2); + ASSERT_EQ(d3.N0, 2); + 
ASSERT_EQ(d4.N0, 2); + + ASSERT_EQ(d1.N1, 1); + ASSERT_EQ(d2.N1, 3); + ASSERT_EQ(d3.N1, 3); + ASSERT_EQ(d4.N1, 3); + + ASSERT_EQ(d1.N2, 1); + ASSERT_EQ(d2.N2, 1); + ASSERT_EQ(d3.N2, 4); + ASSERT_EQ(d4.N2, 4); + + ASSERT_EQ(d1.N3, 1); + ASSERT_EQ(d2.N3, 1); + ASSERT_EQ(d3.N3, 1); + ASSERT_EQ(d4.N3, 5); + + //---------------------------------------- + + using stride_s0_s0_s0 = + Kokkos::Impl::ViewOffset<dim_s0_s0_s0, Kokkos::LayoutStride>; + + //---------------------------------------- + // Static dimension. + { + using left_s2_s3_s4 = + Kokkos::Impl::ViewOffset<dim_s2_s3_s4, Kokkos::LayoutLeft>; + + ASSERT_EQ(sizeof(left_s2_s3_s4), sizeof(dim_s2_s3_s4)); + + left_s2_s3_s4 off3; + + stride_s0_s0_s0 stride3(off3); + + ASSERT_EQ(off3.stride_0(), 1); + ASSERT_EQ(off3.stride_1(), 2); + ASSERT_EQ(off3.stride_2(), 6); + ASSERT_EQ(off3.span(), 24); + + ASSERT_EQ(off3.stride_0(), stride3.stride_0()); + ASSERT_EQ(off3.stride_1(), stride3.stride_1()); + ASSERT_EQ(off3.stride_2(), stride3.stride_2()); + ASSERT_EQ(off3.span(), stride3.span()); + + int offset = 0; + + for (int k = 0; k < 4; ++k) + for (int j = 0; j < 3; ++j) + for (int i = 0; i < 2; ++i, ++offset) { + ASSERT_EQ(off3(i, j, k), offset); + ASSERT_EQ(stride3(i, j, k), off3(i, j, k)); + } + } + + //---------------------------------------- + // Small dimension is unpadded. 
+ { + using left_s0_s0_s4 = + Kokkos::Impl::ViewOffset<dim_s0_s0_s4, Kokkos::LayoutLeft>; + + left_s0_s0_s4 dyn_off3(std::integral_constant<unsigned, sizeof(int)>(), + Kokkos::LayoutLeft(2, 3, 0, 0, 0, 0, 0, 0)); + + stride_s0_s0_s0 stride3(dyn_off3); + + ASSERT_EQ(dyn_off3.m_dim.rank, 3); + ASSERT_EQ(dyn_off3.m_dim.N0, 2); + ASSERT_EQ(dyn_off3.m_dim.N1, 3); + ASSERT_EQ(dyn_off3.m_dim.N2, 4); + ASSERT_EQ(dyn_off3.m_dim.N3, 1); + ASSERT_EQ(dyn_off3.size(), 2 * 3 * 4); + + const Kokkos::LayoutLeft layout = dyn_off3.layout(); + + ASSERT_EQ(layout.dimension[0], 2); + ASSERT_EQ(layout.dimension[1], 3); + ASSERT_EQ(layout.dimension[2], 4); + ASSERT_EQ(layout.dimension[3], 1); + ASSERT_EQ(layout.dimension[4], 1); + ASSERT_EQ(layout.dimension[5], 1); + ASSERT_EQ(layout.dimension[6], 1); + ASSERT_EQ(layout.dimension[7], 1); + + ASSERT_EQ(stride3.m_dim.rank, 3); + ASSERT_EQ(stride3.m_dim.N0, 2); + ASSERT_EQ(stride3.m_dim.N1, 3); + ASSERT_EQ(stride3.m_dim.N2, 4); + ASSERT_EQ(stride3.m_dim.N3, 1); + ASSERT_EQ(stride3.size(), 2 * 3 * 4); + + int offset = 0; + + for (int k = 0; k < 4; ++k) + for (int j = 0; j < 3; ++j) + for (int i = 0; i < 2; ++i, ++offset) { + ASSERT_EQ(offset, dyn_off3(i, j, k)); + ASSERT_EQ(stride3(i, j, k), dyn_off3(i, j, k)); + } + + ASSERT_EQ(dyn_off3.span(), offset); + ASSERT_EQ(stride3.span(), dyn_off3.span()); + } + + //---------------------------------------- + // Large dimension is likely padded. 
+ { + constexpr int N0 = 2000; + constexpr int N1 = 300; + + using left_s0_s0_s4 = + Kokkos::Impl::ViewOffset<dim_s0_s0_s4, Kokkos::LayoutLeft>; + + left_s0_s0_s4 dyn_off3(std::integral_constant<unsigned, sizeof(int)>(), + Kokkos::LayoutLeft(N0, N1, 0, 0, 0, 0, 0, 0)); + + stride_s0_s0_s0 stride3(dyn_off3); + + ASSERT_EQ(dyn_off3.m_dim.rank, 3); + ASSERT_EQ(dyn_off3.m_dim.N0, N0); + ASSERT_EQ(dyn_off3.m_dim.N1, N1); + ASSERT_EQ(dyn_off3.m_dim.N2, 4); + ASSERT_EQ(dyn_off3.m_dim.N3, 1); + ASSERT_EQ(dyn_off3.size(), N0 * N1 * 4); + + ASSERT_EQ(stride3.m_dim.rank, 3); + ASSERT_EQ(stride3.m_dim.N0, N0); + ASSERT_EQ(stride3.m_dim.N1, N1); + ASSERT_EQ(stride3.m_dim.N2, 4); + ASSERT_EQ(stride3.m_dim.N3, 1); + ASSERT_EQ(stride3.size(), N0 * N1 * 4); + ASSERT_EQ(stride3.span(), dyn_off3.span()); + + int offset = 0; + + for (int k = 0; k < 4; ++k) + for (int j = 0; j < N1; ++j) + for (int i = 0; i < N0; ++i) { + ASSERT_LE(offset, dyn_off3(i, j, k)); + ASSERT_EQ(stride3(i, j, k), dyn_off3(i, j, k)); + offset = dyn_off3(i, j, k) + 1; + } + + ASSERT_LE(offset, dyn_off3.span()); + } + + //---------------------------------------- + // Static dimension. 
+ { + using right_s2_s3_s4 = + Kokkos::Impl::ViewOffset<dim_s2_s3_s4, Kokkos::LayoutRight>; + + ASSERT_EQ(sizeof(right_s2_s3_s4), sizeof(dim_s2_s3_s4)); + + right_s2_s3_s4 off3; + + stride_s0_s0_s0 stride3(off3); + + ASSERT_EQ(off3.stride_0(), 12); + ASSERT_EQ(off3.stride_1(), 4); + ASSERT_EQ(off3.stride_2(), 1); + + ASSERT_EQ(off3.dimension_0(), stride3.dimension_0()); + ASSERT_EQ(off3.dimension_1(), stride3.dimension_1()); + ASSERT_EQ(off3.dimension_2(), stride3.dimension_2()); + ASSERT_EQ(off3.stride_0(), stride3.stride_0()); + ASSERT_EQ(off3.stride_1(), stride3.stride_1()); + ASSERT_EQ(off3.stride_2(), stride3.stride_2()); + ASSERT_EQ(off3.span(), stride3.span()); + + int offset = 0; + + for (int i = 0; i < 2; ++i) + for (int j = 0; j < 3; ++j) + for (int k = 0; k < 4; ++k, ++offset) { + ASSERT_EQ(off3(i, j, k), offset); + ASSERT_EQ(off3(i, j, k), stride3(i, j, k)); + } + + ASSERT_EQ(off3.span(), offset); + } + + //---------------------------------------- + // Small dimension is unpadded. 
+ { + using right_s0_s0_s4 = + Kokkos::Impl::ViewOffset<dim_s0_s0_s4, Kokkos::LayoutRight>; + + right_s0_s0_s4 dyn_off3(std::integral_constant<unsigned, sizeof(int)>(), + Kokkos::LayoutRight(2, 3, 0, 0, 0, 0, 0, 0)); + + stride_s0_s0_s0 stride3(dyn_off3); + + ASSERT_EQ(dyn_off3.m_dim.rank, 3); + ASSERT_EQ(dyn_off3.m_dim.N0, 2); + ASSERT_EQ(dyn_off3.m_dim.N1, 3); + ASSERT_EQ(dyn_off3.m_dim.N2, 4); + ASSERT_EQ(dyn_off3.m_dim.N3, 1); + ASSERT_EQ(dyn_off3.size(), 2 * 3 * 4); + + ASSERT_EQ(dyn_off3.dimension_0(), stride3.dimension_0()); + ASSERT_EQ(dyn_off3.dimension_1(), stride3.dimension_1()); + ASSERT_EQ(dyn_off3.dimension_2(), stride3.dimension_2()); + ASSERT_EQ(dyn_off3.stride_0(), stride3.stride_0()); + ASSERT_EQ(dyn_off3.stride_1(), stride3.stride_1()); + ASSERT_EQ(dyn_off3.stride_2(), stride3.stride_2()); + ASSERT_EQ(dyn_off3.span(), stride3.span()); + + int offset = 0; + + for (int i = 0; i < 2; ++i) + for (int j = 0; j < 3; ++j) + for (int k = 0; k < 4; ++k, ++offset) { + ASSERT_EQ(offset, dyn_off3(i, j, k)); + ASSERT_EQ(dyn_off3(i, j, k), stride3(i, j, k)); + } + + ASSERT_EQ(dyn_off3.span(), offset); + } + + //---------------------------------------- + // Large dimension is likely padded. 
+ { + constexpr int N0 = 2000; + constexpr int N1 = 300; + + using right_s0_s0_s4 = + Kokkos::Impl::ViewOffset<dim_s0_s0_s4, Kokkos::LayoutRight>; + + right_s0_s0_s4 dyn_off3(std::integral_constant<unsigned, sizeof(int)>(), + Kokkos::LayoutRight(N0, N1, 0, 0, 0, 0, 0, 0)); + + stride_s0_s0_s0 stride3(dyn_off3); + + ASSERT_EQ(dyn_off3.m_dim.rank, 3); + ASSERT_EQ(dyn_off3.m_dim.N0, N0); + ASSERT_EQ(dyn_off3.m_dim.N1, N1); + ASSERT_EQ(dyn_off3.m_dim.N2, 4); + ASSERT_EQ(dyn_off3.m_dim.N3, 1); + ASSERT_EQ(dyn_off3.size(), N0 * N1 * 4); + + ASSERT_EQ(dyn_off3.dimension_0(), stride3.dimension_0()); + ASSERT_EQ(dyn_off3.dimension_1(), stride3.dimension_1()); + ASSERT_EQ(dyn_off3.dimension_2(), stride3.dimension_2()); + ASSERT_EQ(dyn_off3.stride_0(), stride3.stride_0()); + ASSERT_EQ(dyn_off3.stride_1(), stride3.stride_1()); + ASSERT_EQ(dyn_off3.stride_2(), stride3.stride_2()); + ASSERT_EQ(dyn_off3.span(), stride3.span()); + + int offset = 0; + + for (int i = 0; i < N0; ++i) + for (int j = 0; j < N1; ++j) + for (int k = 0; k < 4; ++k) { + ASSERT_LE(offset, dyn_off3(i, j, k)); + ASSERT_EQ(dyn_off3(i, j, k), stride3(i, j, k)); + offset = dyn_off3(i, j, k) + 1; + } + + ASSERT_LE(offset, dyn_off3.span()); + } + + //---------------------------------------- + // Subview. 
+ { + // Mapping rank 4 to rank 3 + using SubviewExtents = Kokkos::Impl::SubviewExtents<4, 3>; + + constexpr int N0 = 1000; + constexpr int N1 = 2000; + constexpr int N2 = 3000; + constexpr int N3 = 4000; + + Kokkos::Impl::ViewDimension<N0, N1, N2, N3> dim; + + SubviewExtents tmp(dim, N0 / 2, Kokkos::ALL, + std::pair<int, int>(N2 / 4, 10 + N2 / 4), + Kokkos::pair<int, int>(N3 / 4, 20 + N3 / 4)); + + ASSERT_EQ(tmp.domain_offset(0), N0 / 2); + ASSERT_EQ(tmp.domain_offset(1), 0); + ASSERT_EQ(tmp.domain_offset(2), N2 / 4); + ASSERT_EQ(tmp.domain_offset(3), N3 / 4); + + ASSERT_EQ(tmp.range_index(0), 1); + ASSERT_EQ(tmp.range_index(1), 2); + ASSERT_EQ(tmp.range_index(2), 3); + + ASSERT_EQ(tmp.range_extent(0), N1); + ASSERT_EQ(tmp.range_extent(1), 10); + ASSERT_EQ(tmp.range_extent(2), 20); + } + + { + constexpr int N0 = 2000; + constexpr int N1 = 300; + + constexpr int sub_N0 = 1000; + constexpr int sub_N1 = 200; + constexpr int sub_N2 = 4; + + using left_s0_s0_s4 = + Kokkos::Impl::ViewOffset<dim_s0_s0_s4, Kokkos::LayoutLeft>; + + left_s0_s0_s4 dyn_off3(std::integral_constant<unsigned, sizeof(int)>(), + Kokkos::LayoutLeft(N0, N1, 0, 0, 0, 0, 0, 0)); + + Kokkos::Impl::SubviewExtents<3, 3> sub( + dyn_off3.m_dim, Kokkos::pair<int, int>(0, sub_N0), + Kokkos::pair<int, int>(0, sub_N1), Kokkos::pair<int, int>(0, sub_N2)); + + stride_s0_s0_s0 stride3(dyn_off3, sub); + + ASSERT_EQ(stride3.dimension_0(), sub_N0); + ASSERT_EQ(stride3.dimension_1(), sub_N1); + ASSERT_EQ(stride3.dimension_2(), sub_N2); + ASSERT_EQ(stride3.size(), sub_N0 * sub_N1 * sub_N2); + + ASSERT_EQ(dyn_off3.stride_0(), stride3.stride_0()); + ASSERT_EQ(dyn_off3.stride_1(), stride3.stride_1()); + ASSERT_EQ(dyn_off3.stride_2(), stride3.stride_2()); + ASSERT_GE(dyn_off3.span(), stride3.span()); + + for (int k = 0; k < sub_N2; ++k) + for (int j = 0; j < sub_N1; ++j) + for (int i = 0; i < sub_N0; ++i) { + ASSERT_EQ(stride3(i, j, k), dyn_off3(i, j, k)); + } + } + + { + constexpr int N0 = 2000; + constexpr int N1 = 300; 
+ + constexpr int sub_N0 = 1000; + constexpr int sub_N1 = 200; + constexpr int sub_N2 = 4; + + using right_s0_s0_s4 = + Kokkos::Impl::ViewOffset<dim_s0_s0_s4, Kokkos::LayoutRight>; + + right_s0_s0_s4 dyn_off3(std::integral_constant<unsigned, sizeof(int)>(), + Kokkos::LayoutRight(N0, N1, 0, 0, 0, 0, 0, 0)); + + Kokkos::Impl::SubviewExtents<3, 3> sub( + dyn_off3.m_dim, Kokkos::pair<int, int>(0, sub_N0), + Kokkos::pair<int, int>(0, sub_N1), Kokkos::pair<int, int>(0, sub_N2)); + + stride_s0_s0_s0 stride3(dyn_off3, sub); + + ASSERT_EQ(stride3.dimension_0(), sub_N0); + ASSERT_EQ(stride3.dimension_1(), sub_N1); + ASSERT_EQ(stride3.dimension_2(), sub_N2); + ASSERT_EQ(stride3.size(), sub_N0 * sub_N1 * sub_N2); + + ASSERT_EQ(dyn_off3.stride_0(), stride3.stride_0()); + ASSERT_EQ(dyn_off3.stride_1(), stride3.stride_1()); + ASSERT_EQ(dyn_off3.stride_2(), stride3.stride_2()); + ASSERT_GE(dyn_off3.span(), stride3.span()); + + for (int i = 0; i < sub_N0; ++i) + for (int j = 0; j < sub_N1; ++j) + for (int k = 0; k < sub_N2; ++k) { + ASSERT_EQ(stride3(i, j, k), dyn_off3(i, j, k)); + } + } + + //---------------------------------------- + // View data analysis. 
+ { + using namespace Kokkos::Impl; + + static_assert(rank_dynamic<>::value == 0, ""); + static_assert(rank_dynamic<1>::value == 0, ""); + static_assert(rank_dynamic<0>::value == 1, ""); + static_assert(rank_dynamic<0, 1>::value == 1, ""); + static_assert(rank_dynamic<0, 0, 1>::value == 2, ""); + } + + { + using namespace Kokkos::Impl; + + using a_int_r1 = ViewArrayAnalysis<int[]>; + using a_int_r5 = ViewArrayAnalysis<int* * [4][5][6]>; + using a_const_int_r1 = ViewArrayAnalysis<const int[]>; + using a_const_int_r5 = ViewArrayAnalysis<const int* * [4][5][6]>; + + static_assert(a_int_r1::dimension::rank == 1, ""); + static_assert(a_int_r1::dimension::rank_dynamic == 1, ""); + static_assert(a_int_r5::dimension::ArgN0 == 0, ""); + static_assert(a_int_r5::dimension::ArgN1 == 0, ""); + static_assert(a_int_r5::dimension::ArgN2 == 4, ""); + static_assert(a_int_r5::dimension::ArgN3 == 5, ""); + static_assert(a_int_r5::dimension::ArgN4 == 6, ""); + static_assert(a_int_r5::dimension::ArgN5 == 1, ""); + + static_assert( + std::is_same<typename a_int_r1::dimension, ViewDimension<0> >::value, + ""); + static_assert( + std::is_same<typename a_int_r1::non_const_value_type, int>::value, ""); + + static_assert(a_const_int_r1::dimension::rank == 1, ""); + static_assert(a_const_int_r1::dimension::rank_dynamic == 1, ""); + static_assert(std::is_same<typename a_const_int_r1::dimension, + ViewDimension<0> >::value, + ""); + static_assert( + std::is_same<typename a_const_int_r1::non_const_value_type, int>::value, + ""); + + static_assert(a_const_int_r5::dimension::rank == 5, ""); + static_assert(a_const_int_r5::dimension::rank_dynamic == 2, ""); + + static_assert(a_const_int_r5::dimension::ArgN0 == 0, ""); + static_assert(a_const_int_r5::dimension::ArgN1 == 0, ""); + static_assert(a_const_int_r5::dimension::ArgN2 == 4, ""); + static_assert(a_const_int_r5::dimension::ArgN3 == 5, ""); + static_assert(a_const_int_r5::dimension::ArgN4 == 6, ""); + 
static_assert(a_const_int_r5::dimension::ArgN5 == 1, ""); + + static_assert(std::is_same<typename a_const_int_r5::dimension, + ViewDimension<0, 0, 4, 5, 6> >::value, + ""); + static_assert( + std::is_same<typename a_const_int_r5::non_const_value_type, int>::value, + ""); + + static_assert(a_int_r5::dimension::rank == 5, ""); + static_assert(a_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(std::is_same<typename a_int_r5::dimension, + ViewDimension<0, 0, 4, 5, 6> >::value, + ""); + static_assert( + std::is_same<typename a_int_r5::non_const_value_type, int>::value, ""); + } + + { + using namespace Kokkos::Impl; + + using t_i4 = int[4]; + + // Dimensions of t_i4 are appended to the multdimensional array. + using a_int_r5 = ViewArrayAnalysis<t_i4** * [3]>; + + static_assert(a_int_r5::dimension::rank == 5, ""); + static_assert(a_int_r5::dimension::rank_dynamic == 3, ""); + static_assert(a_int_r5::dimension::ArgN0 == 0, ""); + static_assert(a_int_r5::dimension::ArgN1 == 0, ""); + static_assert(a_int_r5::dimension::ArgN2 == 0, ""); + static_assert(a_int_r5::dimension::ArgN3 == 3, ""); + static_assert(a_int_r5::dimension::ArgN4 == 4, ""); + static_assert( + std::is_same<typename a_int_r5::non_const_value_type, int>::value, ""); + } + + { + using namespace Kokkos::Impl; + + using a_const_int_r1 = ViewDataAnalysis<const int[], void>; + + static_assert( + std::is_same<typename a_const_int_r1::specialize, void>::value, ""); + static_assert(std::is_same<typename a_const_int_r1::dimension, + Kokkos::Impl::ViewDimension<0> >::value, + ""); + + static_assert( + std::is_same<typename a_const_int_r1::type, const int*>::value, ""); + static_assert( + std::is_same<typename a_const_int_r1::value_type, const int>::value, + ""); + + static_assert(std::is_same<typename a_const_int_r1::scalar_array_type, + const int*>::value, + ""); + static_assert( + std::is_same<typename a_const_int_r1::const_type, const int*>::value, + ""); + static_assert(std::is_same<typename 
a_const_int_r1::const_value_type, + const int>::value, + ""); + static_assert(std::is_same<typename a_const_int_r1::const_scalar_array_type, + const int*>::value, + ""); + static_assert( + std::is_same<typename a_const_int_r1::non_const_type, int*>::value, ""); + static_assert( + std::is_same<typename a_const_int_r1::non_const_value_type, int>::value, + ""); + + using a_const_int_r3 = ViewDataAnalysis<const int* * [4], void>; + + static_assert( + std::is_same<typename a_const_int_r3::specialize, void>::value, ""); + + static_assert(std::is_same<typename a_const_int_r3::dimension, + Kokkos::Impl::ViewDimension<0, 0, 4> >::value, + ""); + + static_assert( + std::is_same<typename a_const_int_r3::type, const int* * [4]>::value, + ""); + static_assert( + std::is_same<typename a_const_int_r3::value_type, const int>::value, + ""); + static_assert(std::is_same<typename a_const_int_r3::scalar_array_type, + const int* * [4]>::value, + ""); + static_assert(std::is_same<typename a_const_int_r3::const_type, + const int* * [4]>::value, + ""); + static_assert(std::is_same<typename a_const_int_r3::const_value_type, + const int>::value, + ""); + static_assert(std::is_same<typename a_const_int_r3::const_scalar_array_type, + const int* * [4]>::value, + ""); + static_assert(std::is_same<typename a_const_int_r3::non_const_type, + int* * [4]>::value, + ""); + static_assert( + std::is_same<typename a_const_int_r3::non_const_value_type, int>::value, + ""); + static_assert( + std::is_same<typename a_const_int_r3::non_const_scalar_array_type, + int* * [4]>::value, + ""); + + // std::cout << "typeid( const int**[4] ).name() = " << typeid( const + // int**[4] ).name() << std::endl; + } + + //---------------------------------------- + + { + constexpr int N = 10; + + using T = Kokkos::View<int*, Space>; + using C = Kokkos::View<const int*, Space>; + + int data[N]; + + T vr1(data, N); // View of non-const. + C cr1(vr1); // View of const from view of non-const. 
+ C cr2((const int*)data, N); + + // Generate static_assert error: + // T tmp( cr1 ); + + ASSERT_EQ(vr1.span(), N); + ASSERT_EQ(cr1.span(), N); + ASSERT_EQ(vr1.data(), &data[0]); + ASSERT_EQ(cr1.data(), &data[0]); + + ASSERT_TRUE((std::is_same<typename T::data_type, int*>::value)); + ASSERT_TRUE((std::is_same<typename T::const_data_type, const int*>::value)); + ASSERT_TRUE((std::is_same<typename T::non_const_data_type, int*>::value)); + + ASSERT_TRUE((std::is_same<typename T::scalar_array_type, int*>::value)); + ASSERT_TRUE( + (std::is_same<typename T::const_scalar_array_type, const int*>::value)); + ASSERT_TRUE( + (std::is_same<typename T::non_const_scalar_array_type, int*>::value)); + + ASSERT_TRUE((std::is_same<typename T::value_type, int>::value)); + ASSERT_TRUE((std::is_same<typename T::const_value_type, const int>::value)); + ASSERT_TRUE((std::is_same<typename T::non_const_value_type, int>::value)); + + ASSERT_TRUE((std::is_same<typename T::memory_space, + typename Space::memory_space>::value)); + ASSERT_TRUE((std::is_same<typename T::reference_type, int&>::value)); + + ASSERT_EQ(T::Rank, 1); + + ASSERT_TRUE((std::is_same<typename C::data_type, const int*>::value)); + ASSERT_TRUE((std::is_same<typename C::const_data_type, const int*>::value)); + ASSERT_TRUE((std::is_same<typename C::non_const_data_type, int*>::value)); + + ASSERT_TRUE( + (std::is_same<typename C::scalar_array_type, const int*>::value)); + ASSERT_TRUE( + (std::is_same<typename C::const_scalar_array_type, const int*>::value)); + ASSERT_TRUE( + (std::is_same<typename C::non_const_scalar_array_type, int*>::value)); + + ASSERT_TRUE((std::is_same<typename C::value_type, const int>::value)); + ASSERT_TRUE((std::is_same<typename C::const_value_type, const int>::value)); + ASSERT_TRUE((std::is_same<typename C::non_const_value_type, int>::value)); + + ASSERT_TRUE((std::is_same<typename C::memory_space, + typename Space::memory_space>::value)); + ASSERT_TRUE((std::is_same<typename C::reference_type, 
const int&>::value)); + + ASSERT_EQ(C::Rank, 1); + + ASSERT_EQ(vr1.extent(0), N); + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + for (int i = 0; i < N; ++i) data[i] = i + 1; + for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 1); + for (int i = 0; i < N; ++i) ASSERT_EQ(cr1[i], i + 1); + + { + T tmp(vr1); + + for (int i = 0; i < N; ++i) ASSERT_EQ(tmp[i], i + 1); + for (int i = 0; i < N; ++i) vr1(i) = i + 2; + for (int i = 0; i < N; ++i) ASSERT_EQ(tmp[i], i + 2); + } + + for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 2); + } + } + + { + constexpr int N = 10; + using T = Kokkos::View<int*, Space>; + using C = Kokkos::View<const int*, Space>; + + T vr1("vr1", N); + C cr1(vr1); + + ASSERT_TRUE((std::is_same<typename T::data_type, int*>::value)); + ASSERT_TRUE((std::is_same<typename T::const_data_type, const int*>::value)); + ASSERT_TRUE((std::is_same<typename T::non_const_data_type, int*>::value)); + + ASSERT_TRUE((std::is_same<typename T::scalar_array_type, int*>::value)); + ASSERT_TRUE( + (std::is_same<typename T::const_scalar_array_type, const int*>::value)); + ASSERT_TRUE( + (std::is_same<typename T::non_const_scalar_array_type, int*>::value)); + + ASSERT_TRUE((std::is_same<typename T::value_type, int>::value)); + ASSERT_TRUE((std::is_same<typename T::const_value_type, const int>::value)); + ASSERT_TRUE((std::is_same<typename T::non_const_value_type, int>::value)); + + ASSERT_TRUE((std::is_same<typename T::memory_space, + typename Space::memory_space>::value)); + ASSERT_TRUE((std::is_same<typename T::reference_type, int&>::value)); + ASSERT_EQ(T::Rank, 1); + + ASSERT_EQ(vr1.extent(0), N); + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + for (int i = 0; i < N; ++i) vr1(i) = i + 1; + for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 1); + for (int i = 0; i < N; ++i) ASSERT_EQ(cr1[i], i + 1); + + { + T tmp(vr1); + for (int i = 0; i < N; ++i) 
ASSERT_EQ(tmp[i], i + 1); + for (int i = 0; i < N; ++i) vr1(i) = i + 2; + for (int i = 0; i < N; ++i) ASSERT_EQ(tmp[i], i + 2); + } + + for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 2); + } + } + + // Testing proper handling of zero-length allocations. + { + constexpr int N = 0; + using T = Kokkos::View<int*, Space>; + using C = Kokkos::View<const int*, Space>; + + T vr1("vr1", N); + C cr1(vr1); + + ASSERT_EQ(vr1.extent(0), 0); + ASSERT_EQ(cr1.extent(0), 0); + } + + // Testing using space instance for allocation. + // The execution space of the memory space must be available for view data + // initialization. + if (std::is_same<ExecSpace, + typename ExecSpace::memory_space::execution_space>::value) { + using namespace Kokkos; + + using memory_space = typename ExecSpace::memory_space; + using V = View<int*, memory_space>; + + constexpr int N = 10; + + memory_space mem_space; + + V v("v", N); + V va(view_alloc(), N); + V vb(view_alloc("vb"), N); + V vc(view_alloc("vc", AllowPadding), N); + V vd(view_alloc("vd", WithoutInitializing), N); + V ve(view_alloc("ve", WithoutInitializing, AllowPadding), N); + V vf(view_alloc("vf", mem_space, WithoutInitializing, AllowPadding), N); + V vg(view_alloc(mem_space, "vg", WithoutInitializing, AllowPadding), N); + V vh(view_alloc(WithoutInitializing, AllowPadding), N); + V vi(view_alloc(WithoutInitializing), N); + V vj(view_alloc(std::string("vj"), AllowPadding), N); + V vk(view_alloc(mem_space, std::string("vk"), AllowPadding), N); + } + + { + using traits_t = + Kokkos::ViewTraits<int***, Kokkos::LayoutStride, ExecSpace>; + using dims_t = Kokkos::Impl::ViewDimension<0, 0, 0>; + using offset_t = Kokkos::Impl::ViewOffset<dims_t, Kokkos::LayoutStride>; + + Kokkos::LayoutStride stride; + + stride.dimension[0] = 3; + stride.dimension[1] = 4; + stride.dimension[2] = 5; + stride.stride[0] = 4; + stride.stride[1] = 1; + stride.stride[2] = 12; + + const offset_t offset(std::integral_constant<unsigned, 0>(), stride); + + 
ASSERT_EQ(offset.dimension_0(), 3); + ASSERT_EQ(offset.dimension_1(), 4); + ASSERT_EQ(offset.dimension_2(), 5); + + ASSERT_EQ(offset.stride_0(), 4); + ASSERT_EQ(offset.stride_1(), 1); + ASSERT_EQ(offset.stride_2(), 12); + + ASSERT_EQ(offset.span(), 60); + ASSERT_TRUE(offset.span_is_contiguous()); + + Kokkos::Impl::ViewMapping<traits_t, void> v( + Kokkos::Impl::ViewCtorProp<int*>(nullptr), stride); + } + + { + using V = Kokkos::View<int**, Space>; + using M = typename V::HostMirror; + using layout_type = typename Kokkos::View<int**, Space>::array_layout; + + constexpr int N0 = 10; + constexpr int N1 = 11; + + V a("a", N0, N1); + M b = Kokkos::create_mirror(a); + M c = Kokkos::create_mirror_view(a); + M d; + + for (int i0 = 0; i0 < N0; ++i0) + for (int i1 = 0; i1 < N1; ++i1) { + b(i0, i1) = 1 + i0 + i1 * N0; + } + + Kokkos::deep_copy(a, b); + Kokkos::deep_copy(c, a); + + for (int i0 = 0; i0 < N0; ++i0) + for (int i1 = 0; i1 < N1; ++i1) { + ASSERT_EQ(b(i0, i1), c(i0, i1)); + } + + Kokkos::resize(b, 5, 6); + + for (int i0 = 0; i0 < 5; ++i0) + for (int i1 = 0; i1 < 6; ++i1) { + int val = 1 + i0 + i1 * N0; + ASSERT_EQ(b(i0, i1), c(i0, i1)); + ASSERT_EQ(b(i0, i1), val); + } + + Kokkos::realloc(c, 5, 6); + Kokkos::realloc(d, 5, 6); + + ASSERT_EQ(b.extent(0), 5); + ASSERT_EQ(b.extent(1), 6); + ASSERT_EQ(c.extent(0), 5); + ASSERT_EQ(c.extent(1), 6); + ASSERT_EQ(d.extent(0), 5); + ASSERT_EQ(d.extent(1), 6); + + layout_type layout(7, 8); + Kokkos::resize(b, layout); + for (int i0 = 0; i0 < 7; ++i0) + for (int i1 = 6; i1 < 8; ++i1) { + b(i0, i1) = 1 + i0 + i1 * N0; + } + + for (int i0 = 5; i0 < 7; ++i0) + for (int i1 = 0; i1 < 8; ++i1) { + b(i0, i1) = 1 + i0 + i1 * N0; + } + + for (int i0 = 0; i0 < 7; ++i0) + for (int i1 = 0; i1 < 8; ++i1) { + int val = 1 + i0 + i1 * N0; + ASSERT_EQ(b(i0, i1), val); + } + + Kokkos::realloc(c, layout); + Kokkos::realloc(d, layout); + + ASSERT_EQ(b.extent(0), 7); + ASSERT_EQ(b.extent(1), 8); + ASSERT_EQ(c.extent(0), 7); + ASSERT_EQ(c.extent(1), 
8); + ASSERT_EQ(d.extent(0), 7); + ASSERT_EQ(d.extent(1), 8); + } + + { + using V = Kokkos::View<int**, Kokkos::LayoutStride, Space>; + using M = typename V::HostMirror; + using layout_type = + typename Kokkos::View<int**, Kokkos::LayoutStride, Space>::array_layout; + + constexpr int N0 = 10; + constexpr int N1 = 11; + + const int dimensions[] = {N0, N1}; + const int order[] = {1, 0}; + + V a("a", Kokkos::LayoutStride::order_dimensions(2, order, dimensions)); + M b = Kokkos::create_mirror(a); + M c = Kokkos::create_mirror_view(a); + M d; + + for (int i0 = 0; i0 < N0; ++i0) + for (int i1 = 0; i1 < N1; ++i1) { + b(i0, i1) = 1 + i0 + i1 * N0; + } + + Kokkos::deep_copy(a, b); + Kokkos::deep_copy(c, a); + + for (int i0 = 0; i0 < N0; ++i0) + for (int i1 = 0; i1 < N1; ++i1) { + ASSERT_EQ(b(i0, i1), c(i0, i1)); + } + + const int dimensions2[] = {7, 8}; + const int order2[] = {1, 0}; + layout_type layout = layout_type::order_dimensions(2, order2, dimensions2); + Kokkos::resize(b, layout); + + for (int i0 = 0; i0 < 7; ++i0) + for (int i1 = 0; i1 < 8; ++i1) { + int val = 1 + i0 + i1 * N0; + ASSERT_EQ(b(i0, i1), c(i0, i1)); + ASSERT_EQ(b(i0, i1), val); + } + + Kokkos::realloc(c, layout); + Kokkos::realloc(d, layout); + + ASSERT_EQ(b.extent(0), 7); + ASSERT_EQ(b.extent(1), 8); + ASSERT_EQ(c.extent(0), 7); + ASSERT_EQ(c.extent(1), 8); + ASSERT_EQ(d.extent(0), 7); + ASSERT_EQ(d.extent(1), 8); + } + + { + using V = Kokkos::View<int*, Space>; + using U = Kokkos::View<int*, Space, Kokkos::MemoryUnmanaged>; + + V a("a", 10); + + ASSERT_EQ(a.use_count(), 1); + + V b = a; + + ASSERT_EQ(a.use_count(), 2); + ASSERT_EQ(b.use_count(), 2); + + { + U c = b; // 'c' is compile-time unmanaged. + + ASSERT_EQ(a.use_count(), 2); + ASSERT_EQ(b.use_count(), 2); + ASSERT_EQ(c.use_count(), 2); + + V d = c; // 'd' is run-time unmanaged. 
+ + ASSERT_EQ(a.use_count(), 2); + ASSERT_EQ(b.use_count(), 2); + ASSERT_EQ(c.use_count(), 2); + ASSERT_EQ(d.use_count(), 2); + } + + ASSERT_EQ(a.use_count(), 2); + ASSERT_EQ(b.use_count(), 2); + + b = V(); + + ASSERT_EQ(a.use_count(), 1); + ASSERT_EQ(b.use_count(), 0); + +// TODO: a.use_count() and x.use_count() are 0 with the asynchronous HPX +// backend. Why? +#if !defined(KOKKOS_ENABLE_CUDA_LAMBDA) && \ + !(defined(KOKKOS_ENABLE_HPX) && defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)) + // Cannot launch host lambda when CUDA lambda is enabled. + + using host_exec_space = + typename Kokkos::Impl::HostMirror<Space>::Space::execution_space; + + int errors = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<host_exec_space>(0, 10), + KOKKOS_LAMBDA(int, int& e) { + // an unmanaged copy. When the parallel dispatch accepts a move for + // the lambda, this count should become 1. + + if (a.use_count() != 2) ++e; + V x = a; + if (a.use_count() != 2) ++e; + if (x.use_count() != 2) ++e; + }, + errors); + ASSERT_EQ(errors, 0); +#endif // #if !defined( KOKKOS_ENABLE_CUDA_LAMBDA ) + } +} + +TEST(TEST_CATEGORY, view_mapping) { test_view_mapping<TEST_EXECSPACE>(); } +/*--------------------------------------------------------------------------*/ + +template <class ViewType> +struct TestViewMapOperator { + static_assert(ViewType::reference_type_is_lvalue_reference, + "Test only valid for lvalue reference type"); + + ViewType v; + + KOKKOS_INLINE_FUNCTION + void test_left(size_t i0, int64_t& error_count) const { + typename ViewType::value_type* const base_ptr = + &v.access(0, 0, 0, 0, 0, 0, 0, 0); + const size_t n1 = v.extent(1); + const size_t n2 = v.extent(2); + const size_t n3 = v.extent(3); + const size_t n4 = v.extent(4); + const size_t n5 = v.extent(5); + const size_t n6 = v.extent(6); + const size_t n7 = v.extent(7); + + int64_t offset = 0; + + for (size_t i7 = 0; i7 < n7; ++i7) + for (size_t i6 = 0; i6 < n6; ++i6) + for (size_t i5 = 0; i5 < n5; ++i5) + for (size_t i4 = 0; i4 < 
n4; ++i4) + for (size_t i3 = 0; i3 < n3; ++i3) + for (size_t i2 = 0; i2 < n2; ++i2) + for (size_t i1 = 0; i1 < n1; ++i1) { + const int64_t d = + &v.access(i0, i1, i2, i3, i4, i5, i6, i7) - base_ptr; + if (d < offset) ++error_count; + offset = d; + } + + if (v.span() <= size_t(offset)) ++error_count; + } + + KOKKOS_INLINE_FUNCTION + void test_right(size_t i0, int64_t& error_count) const { + typename ViewType::value_type* const base_ptr = + &v.access(0, 0, 0, 0, 0, 0, 0, 0); + const size_t n1 = v.extent(1); + const size_t n2 = v.extent(2); + const size_t n3 = v.extent(3); + const size_t n4 = v.extent(4); + const size_t n5 = v.extent(5); + const size_t n6 = v.extent(6); + const size_t n7 = v.extent(7); + + int64_t offset = 0; + + for (size_t i1 = 0; i1 < n1; ++i1) + for (size_t i2 = 0; i2 < n2; ++i2) + for (size_t i3 = 0; i3 < n3; ++i3) + for (size_t i4 = 0; i4 < n4; ++i4) + for (size_t i5 = 0; i5 < n5; ++i5) + for (size_t i6 = 0; i6 < n6; ++i6) + for (size_t i7 = 0; i7 < n7; ++i7) { + const int64_t d = + &v.access(i0, i1, i2, i3, i4, i5, i6, i7) - base_ptr; + if (d < offset) ++error_count; + offset = d; + } + + if (v.span() <= size_t(offset)) ++error_count; + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_t i, int64_t& error_count) const { + if (std::is_same<typename ViewType::array_layout, + Kokkos::LayoutLeft>::value) { + test_left(i, error_count); + } else if (std::is_same<typename ViewType::array_layout, + Kokkos::LayoutRight>::value) { + test_right(i, error_count); + } + } + + enum { N0 = 10 }; + enum { N1 = 9 }; + enum { N2 = 8 }; + enum { N3 = 7 }; + enum { N4 = 6 }; + enum { N5 = 5 }; + enum { N6 = 4 }; + enum { N7 = 3 }; + + TestViewMapOperator() { + const size_t dyn_rank = v.rank_dynamic; + const std::string label("Test"); + switch (dyn_rank) { + case 0: v = ViewType(label); break; + case 1: v = ViewType(label, N0); break; + case 2: v = ViewType(label, N0, N1); break; + case 3: v = ViewType(label, N0, N1, N2); break; + case 4: v = ViewType(label, N0, 
N1, N2, N3); break; + case 5: v = ViewType(label, N0, N1, N2, N3, N4); break; + case 6: v = ViewType(label, N0, N1, N2, N3, N4, N5); break; + case 7: v = ViewType(label, N0, N1, N2, N3, N4, N5, N6); break; + case 8: + default: v = ViewType(label, N0, N1, N2, N3, N4, N5, N6, N7); + } + } + + void run() { + ASSERT_EQ(v.extent(0), + (0 < ViewType::rank ? TestViewMapOperator<ViewType>::N0 : 1)); + ASSERT_EQ(v.extent(1), + (1 < ViewType::rank ? TestViewMapOperator<ViewType>::N1 : 1)); + ASSERT_EQ(v.extent(2), + (2 < ViewType::rank ? TestViewMapOperator<ViewType>::N2 : 1)); + ASSERT_EQ(v.extent(3), + (3 < ViewType::rank ? TestViewMapOperator<ViewType>::N3 : 1)); + ASSERT_EQ(v.extent(4), + (4 < ViewType::rank ? TestViewMapOperator<ViewType>::N4 : 1)); + ASSERT_EQ(v.extent(5), + (5 < ViewType::rank ? TestViewMapOperator<ViewType>::N5 : 1)); + ASSERT_EQ(v.extent(6), + (6 < ViewType::rank ? TestViewMapOperator<ViewType>::N6 : 1)); + ASSERT_EQ(v.extent(7), + (7 < ViewType::rank ? TestViewMapOperator<ViewType>::N7 : 1)); + + ASSERT_LE(v.extent(0) * v.extent(1) * v.extent(2) * v.extent(3) * + v.extent(4) * v.extent(5) * v.extent(6) * v.extent(7), + v.span()); + + int64_t error_count; + Kokkos::RangePolicy<typename ViewType::execution_space> range(0, + v.extent(0)); + Kokkos::parallel_reduce(range, *this, error_count); + ASSERT_EQ(0, error_count); + } +}; + +template <class Space> +void test_view_mapping_operator() { + using ExecSpace = typename Space::execution_space; + + { + TestViewMapOperator<Kokkos::View<int, Kokkos::LayoutLeft, ExecSpace> > f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int*, Kokkos::LayoutLeft, ExecSpace> > f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int**, Kokkos::LayoutLeft, ExecSpace> > f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int***, Kokkos::LayoutLeft, ExecSpace> > f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int****, Kokkos::LayoutLeft, ExecSpace> > + f; + f.run(); + } + { + 
TestViewMapOperator<Kokkos::View<int*****, Kokkos::LayoutLeft, ExecSpace> > + f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int******, Kokkos::LayoutLeft, ExecSpace> > + f; + f.run(); + } + { + TestViewMapOperator< + Kokkos::View<int*******, Kokkos::LayoutLeft, ExecSpace> > + f; + f.run(); + } + + { + TestViewMapOperator<Kokkos::View<int, Kokkos::LayoutRight, ExecSpace> > f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int*, Kokkos::LayoutRight, ExecSpace> > f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int**, Kokkos::LayoutRight, ExecSpace> > f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int***, Kokkos::LayoutRight, ExecSpace> > + f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int****, Kokkos::LayoutRight, ExecSpace> > + f; + f.run(); + } + { + TestViewMapOperator<Kokkos::View<int*****, Kokkos::LayoutRight, ExecSpace> > + f; + f.run(); + } + { + TestViewMapOperator< + Kokkos::View<int******, Kokkos::LayoutRight, ExecSpace> > + f; + f.run(); + } + { + TestViewMapOperator< + Kokkos::View<int*******, Kokkos::LayoutRight, ExecSpace> > + f; + f.run(); + } +} + +TEST(TEST_CATEGORY, view_mapping_operator) { + test_view_mapping_operator<TEST_EXECSPACE>(); +} + +TEST(TEST_CATEGORY, static_extent) { + using T = Kokkos::View<double * [2][3]>; + ASSERT_EQ(T::static_extent(1), 2); + ASSERT_EQ(T::static_extent(2), 3); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewMapping_b.hpp b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp new file mode 100644 index 0000000000000000000000000000000000000000..23035a303ad450de6f1d6f34abb8c3a33ca94d7b --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp @@ -0,0 +1,269 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +/*--------------------------------------------------------------------------*/ + +template <class Space> +struct TestViewMappingAtomic { + using ExecSpace = typename Space::execution_space; + using MemSpace = typename Space::memory_space; + + using mem_trait = Kokkos::MemoryTraits<Kokkos::Atomic>; + + using T = Kokkos::View<int *, ExecSpace>; + using T_atom = Kokkos::View<int *, ExecSpace, mem_trait>; + + T x; + T_atom x_atom; + + enum { N = 100000 }; + + struct TagInit {}; + struct TagUpdate {}; + struct TagVerify {}; + + KOKKOS_INLINE_FUNCTION + void operator()(const TagInit &, const int i) const { x(i) = i; } + + KOKKOS_INLINE_FUNCTION + void operator()(const TagUpdate &, const int i) const { x_atom(i % 2) += 1; } + + KOKKOS_INLINE_FUNCTION + void operator()(const TagVerify &, const int i, long &error_count) const { + if (i < 2) { + if (x(i) != int(i + N / 2)) ++error_count; + } else { + if (x(i) != int(i)) ++error_count; + } + } + + TestViewMappingAtomic() : x("x", N), x_atom(x) {} + + void run() { + ASSERT_TRUE(T::reference_type_is_lvalue_reference); + ASSERT_FALSE(T_atom::reference_type_is_lvalue_reference); + + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TagInit>(0, N), *this); + Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TagUpdate>(0, N), + *this); + + long error_count = -1; + + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, TagVerify>(0, N), + *this, error_count); + + ASSERT_EQ(0, error_count); + + typename T_atom::HostMirror x_host = Kokkos::create_mirror_view(x); + Kokkos::deep_copy(x_host, x); + + error_count = -1; + + Kokkos::parallel_reduce( + Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace, TagVerify>(0, N), + [=](const TagVerify &, const int i, long 
&tmp_error_count) { + if (i < 2) { + if (x_host(i) != int(i + N / 2)) ++tmp_error_count; + } else { + if (x_host(i) != int(i)) ++tmp_error_count; + } + }, + error_count); + + ASSERT_EQ(0, error_count); + Kokkos::deep_copy(x, x_host); + } +}; + +TEST(TEST_CATEGORY, view_mapping_atomic) { + TestViewMappingAtomic<TEST_EXECSPACE> f; + f.run(); +} + +} // namespace Test + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +struct MappingClassValueType { + KOKKOS_INLINE_FUNCTION + MappingClassValueType() { +#if 0 +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) + printf( "TestViewMappingClassValue construct on Cuda\n" ); +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + printf( "TestViewMappingClassValue construct on Host\n" ); +#else + printf( "TestViewMappingClassValue construct unknown\n" ); +#endif +#endif + } + KOKKOS_INLINE_FUNCTION + ~MappingClassValueType() { +#if 0 +#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) + printf( "TestViewMappingClassValue destruct on Cuda\n" ); +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + printf( "TestViewMappingClassValue destruct on Host\n" ); +#else + printf( "TestViewMappingClassValue destruct unknown\n" ); +#endif +#endif + } +}; + +template <class Space> +void test_view_mapping_class_value() { + using ExecSpace = typename Space::execution_space; + + ExecSpace().fence(); + { + Kokkos::View<MappingClassValueType, ExecSpace> a("a"); + ExecSpace().fence(); + } + ExecSpace().fence(); +} + +TEST(TEST_CATEGORY, view_mapping_class_value) { + test_view_mapping_class_value<TEST_EXECSPACE>(); +} + +} // namespace Test + +/*--------------------------------------------------------------------------*/ + +namespace Test { + +TEST(TEST_CATEGORY, view_mapping_assignable) { + using exec_space = TEST_EXECSPACE; + + { // Assignment of rank-0 Left = Right + using dst_traits = Kokkos::ViewTraits<int, Kokkos::LayoutLeft, exec_space>; + using src_traits = 
Kokkos::ViewTraits<int, Kokkos::LayoutRight, exec_space>; + using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; + static_assert(mapping::is_assignable, ""); + + Kokkos::View<int, Kokkos::LayoutRight, exec_space> src; + Kokkos::View<int, Kokkos::LayoutLeft, exec_space> dst(src); + dst = src; + } + + { // Assignment of rank-0 Right = Left + using dst_traits = Kokkos::ViewTraits<int, Kokkos::LayoutRight, exec_space>; + using src_traits = Kokkos::ViewTraits<int, Kokkos::LayoutLeft, exec_space>; + using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; + static_assert(mapping::is_assignable, ""); + + Kokkos::View<int, Kokkos::LayoutLeft, exec_space> src; + Kokkos::View<int, Kokkos::LayoutRight, exec_space> dst(src); + dst = src; + } + + { // Assignment of rank-1 Left = Right + using dst_traits = + Kokkos::ViewTraits<int *, Kokkos::LayoutLeft, exec_space>; + using src_traits = + Kokkos::ViewTraits<int *, Kokkos::LayoutRight, exec_space>; + using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; + static_assert(mapping::is_assignable, ""); + + Kokkos::View<int *, Kokkos::LayoutRight, exec_space> src; + Kokkos::View<int *, Kokkos::LayoutLeft, exec_space> dst(src); + dst = src; + } + + { // Assignment of rank-1 Right = Left + using dst_traits = + Kokkos::ViewTraits<int *, Kokkos::LayoutRight, exec_space>; + using src_traits = + Kokkos::ViewTraits<int *, Kokkos::LayoutLeft, exec_space>; + using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; + static_assert(mapping::is_assignable, ""); + + Kokkos::View<int *, Kokkos::LayoutLeft, exec_space> src; + Kokkos::View<int *, Kokkos::LayoutRight, exec_space> dst(src); + dst = src; + } + + { // Assignment of rank-2 Left = Right + using dst_traits = + Kokkos::ViewTraits<int **, Kokkos::LayoutLeft, exec_space>; + using src_traits = + Kokkos::ViewTraits<int **, Kokkos::LayoutRight, exec_space>; + using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, 
void>; + static_assert(!mapping::is_assignable, ""); + } + + { // Assignment of rank-2 Right = Left + using dst_traits = + Kokkos::ViewTraits<int **, Kokkos::LayoutRight, exec_space>; + using src_traits = + Kokkos::ViewTraits<int **, Kokkos::LayoutLeft, exec_space>; + using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; + static_assert(!mapping::is_assignable, ""); + } +} + +TEST(TEST_CATEGORY, view_mapping_trivially_copyable) { + using exec_space = TEST_EXECSPACE; + + using dst_traits = Kokkos::ViewTraits<int *, exec_space>; + using src_traits = dst_traits; + using mapping = Kokkos::Impl::ViewMapping<dst_traits, src_traits, void>; + + static_assert(std::is_trivially_copyable<mapping>{}, ""); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp new file mode 100644 index 0000000000000000000000000000000000000000..18db67400d6ea03ecb98e891150cb4a154311982 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp @@ -0,0 +1,212 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <stdexcept> +#include <sstream> +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class Space> +struct TestViewMappingSubview { + using ExecSpace = typename Space::execution_space; + using MemSpace = typename Space::memory_space; + + using range = Kokkos::pair<int, int>; + + enum { AN = 10 }; + using AT = Kokkos::View<int*, ExecSpace>; + using ACT = Kokkos::View<const int*, ExecSpace>; + using AS = Kokkos::Subview<AT, range>; + + enum { BN0 = 10, BN1 = 11, BN2 = 12 }; + using BT = Kokkos::View<int***, ExecSpace>; + using BS = Kokkos::Subview<BT, range, range, range>; + + enum { CN0 = 10, CN1 = 11, CN2 = 12 }; + using CT = Kokkos::View<int** * [13][14], ExecSpace>; + // changing CS to CTS here because when compiling with nvshmem, there is a + // define for CS that makes this fail... 
+ using CTS = Kokkos::Subview<CT, range, range, range, int, int>; + + enum { DN0 = 10, DN1 = 11, DN2 = 12, DN3 = 13, DN4 = 14 }; + using DT = Kokkos::View<int** * [DN3][DN4], ExecSpace>; + using DS = Kokkos::Subview<DT, int, range, range, range, int>; + + using DLT = Kokkos::View<int** * [13][14], Kokkos::LayoutLeft, ExecSpace>; + using DLS1 = Kokkos::Subview<DLT, range, int, int, int, int>; + +#if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND) + static_assert( + DLS1::rank == 1 && + std::is_same<typename DLS1::array_layout, Kokkos::LayoutLeft>::value, + "Subview layout error for rank 1 subview of left-most range of " + "LayoutLeft"); +#endif + + using DRT = Kokkos::View<int** * [13][14], Kokkos::LayoutRight, ExecSpace>; + using DRS1 = Kokkos::Subview<DRT, int, int, int, int, range>; + +#if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND) + static_assert( + DRS1::rank == 1 && + std::is_same<typename DRS1::array_layout, Kokkos::LayoutRight>::value, + "Subview layout error for rank 1 subview of right-most range of " + "LayoutRight"); +#endif + + AT Aa; + AS Ab; + ACT Ac; + BT Ba; + BS Bb; + CT Ca; + CTS Cb; + DT Da; + DS Db; + + TestViewMappingSubview() + : Aa("Aa", AN), + Ab(Kokkos::subview(Aa, std::pair<int, int>(1, AN - 1))), + Ac(Aa, std::pair<int, int>(1, AN - 1)), + Ba("Ba", BN0, BN1, BN2), + Bb(Kokkos::subview(Ba, std::pair<int, int>(1, BN0 - 1), + std::pair<int, int>(1, BN1 - 1), + std::pair<int, int>(1, BN2 - 1))), + Ca("Ca", CN0, CN1, CN2), + Cb(Kokkos::subview(Ca, std::pair<int, int>(1, CN0 - 1), + std::pair<int, int>(1, CN1 - 1), + std::pair<int, int>(1, CN2 - 1), 1, 2)), + Da("Da", DN0, DN1, DN2), + Db(Kokkos::subview(Da, 1, std::pair<int, int>(1, DN1 - 1), + std::pair<int, int>(1, DN2 - 1), + std::pair<int, int>(1, DN3 - 1), 2)) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int, long& error_count) const { + auto Ad = Kokkos::subview<Kokkos::MemoryUnmanaged>( + Aa, Kokkos::pair<int, int>(1, AN - 1)); + + for (int i = 1; i < AN - 1; ++i) + if 
(&Aa[i] != &Ab[i - 1]) ++error_count; + for (int i = 1; i < AN - 1; ++i) + if (&Aa[i] != &Ac[i - 1]) ++error_count; + for (int i = 1; i < AN - 1; ++i) + if (&Aa[i] != &Ad[i - 1]) ++error_count; + + for (int i2 = 1; i2 < BN2 - 1; ++i2) + for (int i1 = 1; i1 < BN1 - 1; ++i1) + for (int i0 = 1; i0 < BN0 - 1; ++i0) { + if (&Ba(i0, i1, i2) != &Bb(i0 - 1, i1 - 1, i2 - 1)) ++error_count; + } + + for (int i2 = 1; i2 < CN2 - 1; ++i2) + for (int i1 = 1; i1 < CN1 - 1; ++i1) + for (int i0 = 1; i0 < CN0 - 1; ++i0) { + if (&Ca(i0, i1, i2, 1, 2) != &Cb(i0 - 1, i1 - 1, i2 - 1)) + ++error_count; + } + + for (int i2 = 1; i2 < DN3 - 1; ++i2) + for (int i1 = 1; i1 < DN2 - 1; ++i1) + for (int i0 = 1; i0 < DN1 - 1; ++i0) { + if (&Da(1, i0, i1, i2, 2) != &Db(i0 - 1, i1 - 1, i2 - 1)) + ++error_count; + } + } + + void run() { + TestViewMappingSubview<ExecSpace> self; + + ASSERT_EQ(Aa.extent(0), AN); + ASSERT_EQ(Ab.extent(0), AN - 2); + ASSERT_EQ(Ac.extent(0), AN - 2); + ASSERT_EQ(Ba.extent(0), BN0); + ASSERT_EQ(Ba.extent(1), BN1); + ASSERT_EQ(Ba.extent(2), BN2); + ASSERT_EQ(Bb.extent(0), BN0 - 2); + ASSERT_EQ(Bb.extent(1), BN1 - 2); + ASSERT_EQ(Bb.extent(2), BN2 - 2); + + ASSERT_EQ(Ca.extent(0), CN0); + ASSERT_EQ(Ca.extent(1), CN1); + ASSERT_EQ(Ca.extent(2), CN2); + ASSERT_EQ(Ca.extent(3), 13); + ASSERT_EQ(Ca.extent(4), 14); + ASSERT_EQ(Cb.extent(0), CN0 - 2); + ASSERT_EQ(Cb.extent(1), CN1 - 2); + ASSERT_EQ(Cb.extent(2), CN2 - 2); + + ASSERT_EQ(Da.extent(0), DN0); + ASSERT_EQ(Da.extent(1), DN1); + ASSERT_EQ(Da.extent(2), DN2); + ASSERT_EQ(Da.extent(3), DN3); + ASSERT_EQ(Da.extent(4), DN4); + + ASSERT_EQ(Db.extent(0), DN1 - 2); + ASSERT_EQ(Db.extent(1), DN2 - 2); + ASSERT_EQ(Db.extent(2), DN3 - 2); + + ASSERT_EQ(Da.stride_1(), Db.stride_0()); + ASSERT_EQ(Da.stride_2(), Db.stride_1()); + ASSERT_EQ(Da.stride_3(), Db.stride_2()); + + long error_count = -1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0, 1), *this, + error_count); + + ASSERT_EQ(error_count, 0); + } +}; + 
+TEST(TEST_CATEGORY, view_mapping_subview) { + TestViewMappingSubview<TEST_EXECSPACE> f; + f.run(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewOfClass.hpp b/packages/kokkos/core/unit_test/TestViewOfClass.hpp new file mode 100644 index 0000000000000000000000000000000000000000..634f1da73008e60fc9c761de25655a928879383a --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewOfClass.hpp @@ -0,0 +1,119 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> + +namespace Test { + +template <class Space> +struct NestedView { + Kokkos::View<int *, Space> member; + + public: + KOKKOS_INLINE_FUNCTION + NestedView() : member() {} + + KOKKOS_INLINE_FUNCTION + NestedView &operator=(const Kokkos::View<int *, Space> &lhs) { + member = lhs; + if (member.extent(0)) Kokkos::atomic_add(&member(0), 1); + return *this; + } + + KOKKOS_INLINE_FUNCTION + ~NestedView() { + if (member.extent(0)) { + Kokkos::atomic_add(&member(0), -1); + } + } +}; + +template <class Space> +struct NestedViewFunctor { + Kokkos::View<NestedView<Space> *, Space> nested; + Kokkos::View<int *, Space> array; + + NestedViewFunctor(const Kokkos::View<NestedView<Space> *, Space> &arg_nested, + const Kokkos::View<int *, Space> &arg_array) + : nested(arg_nested), array(arg_array) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { nested[i] = array; } +}; + +template <class Space> +void view_nested_view() { + Kokkos::View<int *, Space> tracking("tracking", 1); + + typename Kokkos::View<int *, Space>::HostMirror host_tracking = + Kokkos::create_mirror(tracking); + + { + Kokkos::View<NestedView<Space> *, Space> a("a_nested_view", 2); + + 
Kokkos::parallel_for(Kokkos::RangePolicy<Space>(0, 2), + NestedViewFunctor<Space>(a, tracking)); + Kokkos::deep_copy(host_tracking, tracking); + ASSERT_EQ(2, host_tracking(0)); + + Kokkos::View<NestedView<Space> *, Space> b("b_nested_view", 2); + Kokkos::parallel_for(Kokkos::RangePolicy<Space>(0, 2), + NestedViewFunctor<Space>(b, tracking)); + Kokkos::deep_copy(host_tracking, tracking); + ASSERT_EQ(4, host_tracking(0)); + } + + Kokkos::deep_copy(host_tracking, tracking); + + ASSERT_EQ(0, host_tracking(0)); +} + +TEST(TEST_CATEGORY, view_nested_view) { view_nested_view<TEST_EXECSPACE>(); } + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewResize.hpp b/packages/kokkos/core/unit_test/TestViewResize.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9a378e521163dcfe49d498dd51d157ddb389f33b --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewResize.hpp @@ -0,0 +1,58 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef TESTVIEWRESIZE_HPP_ +#define TESTVIEWRESIZE_HPP_ + +#include <gtest/gtest.h> +#include "TestResize.hpp" + +namespace Test { + +TEST(TEST_CATEGORY, view_resize) { + using ExecSpace = TEST_EXECSPACE; + TestViewResize::testResize<ExecSpace>(); +} + +} // namespace Test +#endif // TESTVIEWRESIZE_HPP_ diff --git a/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp b/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d1dfb7c512f03a6183fbc617486a29af0ffee59a --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp @@ -0,0 +1,76 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> + +namespace Test { + +template <typename SpaceDst, typename SpaceSrc> +void view_space_assign() { + Kokkos::View<double*, SpaceDst> a = Kokkos::View<double*, SpaceSrc>("a", 1); + + Kokkos::View<double*, Kokkos::LayoutLeft, SpaceDst> b = + Kokkos::View<double*, Kokkos::LayoutLeft, SpaceSrc>("b", 1); + + Kokkos::View<double*, Kokkos::LayoutRight, SpaceDst> c = + Kokkos::View<double*, Kokkos::LayoutRight, SpaceSrc>("c", 1); + + Kokkos::View<double*, SpaceDst, Kokkos::MemoryRandomAccess> d = + Kokkos::View<double*, SpaceSrc>("d", 1); + + Kokkos::View<double*, Kokkos::LayoutLeft, SpaceDst, + Kokkos::MemoryRandomAccess> + e = Kokkos::View<double*, Kokkos::LayoutLeft, SpaceSrc>("e", 1); + + // Rank-one layout can assign: + Kokkos::View<double*, Kokkos::LayoutRight, SpaceDst> f = + Kokkos::View<double*, Kokkos::LayoutLeft, SpaceSrc>("f", 1); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewSubview.hpp b/packages/kokkos/core/unit_test/TestViewSubview.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0125017d93786101e2a23a866effe9d8a5e5242d --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewSubview.hpp @@ -0,0 +1,2315 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef TESTVIEWSUBVIEW_HPP_ +#define TESTVIEWSUBVIEW_HPP_ +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <sstream> +#include <iostream> +#include <type_traits> + +// TODO @refactoring move this to somewhere common + +//------------------------------------------------------------------------------ + +template <class...> +struct _kokkos____________________static_test_failure_____; + +template <class...> +struct static_predicate_message {}; + +//------------------------------------------------------------------------------ + +template <class, template <class...> class, class...> +struct static_assert_predicate_true_impl; + +template <template <class...> class predicate, class... message, class... args> +struct static_assert_predicate_true_impl< + typename std::enable_if<predicate<args...>::type::value>::type, predicate, + static_predicate_message<message...>, args...> { + using type = int; +}; + +template <template <class...> class predicate, class... message, class... args> +struct static_assert_predicate_true_impl< + typename std::enable_if<!predicate<args...>::type::value>::type, predicate, + static_predicate_message<message...>, args...> { + using type = typename _kokkos____________________static_test_failure_____< + message...>::type; +}; + +template <template <class...> class predicate, class... args> +struct static_assert_predicate_true + : static_assert_predicate_true_impl<void, predicate, + static_predicate_message<>, args...> {}; + +template <template <class...> class predicate, class... message, class... 
args> +struct static_assert_predicate_true< + predicate, static_predicate_message<message...>, args...> + : static_assert_predicate_true_impl< + void, predicate, static_predicate_message<message...>, args...> {}; + +//------------------------------------------------------------------------------ + +// error "messages" +struct _kokkos__________types_should_be_the_same_____expected_type__ {}; +struct _kokkos__________actual_type_was__ {}; +template <class Expected, class Actual> +struct static_expect_same { + using type = typename static_assert_predicate_true< + std::is_same, + static_predicate_message< + _kokkos__________types_should_be_the_same_____expected_type__, + Expected, _kokkos__________actual_type_was__, Actual>, + Expected, Actual>::type; +}; + +//------------------------------------------------------------------------------ + +namespace TestViewSubview { + +template <class Layout, class Space> +struct getView { + static Kokkos::View<double**, Layout, Space> get(int n, int m) { + return Kokkos::View<double**, Layout, Space>("G", n, m); + } +}; + +template <class Space> +struct getView<Kokkos::LayoutStride, Space> { + static Kokkos::View<double**, Kokkos::LayoutStride, Space> get(int n, int m) { + const int rank = 2; + const int order[] = {0, 1}; + const unsigned dim[] = {unsigned(n), unsigned(m)}; + Kokkos::LayoutStride stride = + Kokkos::LayoutStride::order_dimensions(rank, order, dim); + + return Kokkos::View<double**, Kokkos::LayoutStride, Space>("G", stride); + } +}; + +template <class ViewType, class Space> +struct fill_1D { + using execution_space = typename Space::execution_space; + using size_type = typename ViewType::size_type; + + ViewType a; + double val; + + fill_1D(ViewType a_, double val_) : a(a_), val(val_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { a(i) = val; } +}; + +template <class ViewType, class Space> +struct fill_2D { + using execution_space = typename Space::execution_space; + using size_type = typename 
ViewType::size_type; + + ViewType a; + double val; + + fill_2D(ViewType a_, double val_) : a(a_), val(val_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (int j = 0; j < static_cast<int>(a.extent(1)); j++) { + a(i, j) = val; + } + } +}; + +template <class Layout, class Space> +void test_auto_1d() { + using mv_type = Kokkos::View<double**, Layout, Space>; + using size_type = typename mv_type::size_type; + + const double ZERO = 0.0; + const double ONE = 1.0; + const double TWO = 2.0; + + const size_type numRows = 10; + const size_type numCols = 3; + + mv_type X = getView<Layout, Space>::get(numRows, numCols); + typename mv_type::HostMirror X_h = Kokkos::create_mirror_view(X); + + fill_2D<mv_type, Space> f1(X, ONE); + Kokkos::parallel_for(X.extent(0), f1); + Kokkos::fence(); + Kokkos::deep_copy(X_h, X); + for (size_type j = 0; j < numCols; ++j) { + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i, j) == ONE); + } + } + + fill_2D<mv_type, Space> f2(X, 0.0); + Kokkos::parallel_for(X.extent(0), f2); + Kokkos::fence(); + Kokkos::deep_copy(X_h, X); + for (size_type j = 0; j < numCols; ++j) { + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i, j) == ZERO); + } + } + + fill_2D<mv_type, Space> f3(X, TWO); + Kokkos::parallel_for(X.extent(0), f3); + Kokkos::fence(); + Kokkos::deep_copy(X_h, X); + for (size_type j = 0; j < numCols; ++j) { + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i, j) == TWO); + } + } + + for (size_type j = 0; j < numCols; ++j) { + auto X_j = Kokkos::subview(X, Kokkos::ALL, j); + + fill_1D<decltype(X_j), Space> f4(X_j, ZERO); + Kokkos::parallel_for(X_j.extent(0), f4); + Kokkos::fence(); + Kokkos::deep_copy(X_h, X); + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i, j) == ZERO); + } + + for (size_type jj = 0; jj < numCols; ++jj) { + auto X_jj = Kokkos::subview(X, Kokkos::ALL, jj); + fill_1D<decltype(X_jj), Space> f5(X_jj, ONE); + Kokkos::parallel_for(X_jj.extent(0), f5); + 
Kokkos::fence(); + Kokkos::deep_copy(X_h, X); + for (size_type i = 0; i < numRows; ++i) { + ASSERT_TRUE(X_h(i, jj) == ONE); + } + } + } +} + +template <class LD, class LS, class Space> +void test_1d_strided_assignment_impl(bool a, bool b, bool c, bool d, int n, + int m) { + Kokkos::View<double**, LS, Space> l2d("l2d", n, m); + + int col = n > 2 ? 2 : 0; + int row = m > 2 ? 2 : 0; + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + if (a) { + Kokkos::View<double*, LD, Space> l1da = + Kokkos::subview(l2d, Kokkos::ALL, row); + ASSERT_TRUE(&l1da(0) == &l2d(0, row)); + if (n > 1) { + ASSERT_TRUE(&l1da(1) == &l2d(1, row)); + } + } + + if (b && n > 13) { + Kokkos::View<double*, LD, Space> l1db = + Kokkos::subview(l2d, std::pair<unsigned, unsigned>(2, 13), row); + ASSERT_TRUE(&l1db(0) == &l2d(2, row)); + ASSERT_TRUE(&l1db(1) == &l2d(3, row)); + } + + if (c) { + Kokkos::View<double*, LD, Space> l1dc = + Kokkos::subview(l2d, col, Kokkos::ALL); + ASSERT_TRUE(&l1dc(0) == &l2d(col, 0)); + if (m > 1) { + ASSERT_TRUE(&l1dc(1) == &l2d(col, 1)); + } + } + + if (d && m > 13) { + Kokkos::View<double*, LD, Space> l1dd = + Kokkos::subview(l2d, col, std::pair<unsigned, unsigned>(2, 13)); + ASSERT_TRUE(&l1dd(0) == &l2d(col, 2)); + ASSERT_TRUE(&l1dd(1) == &l2d(col, 3)); + } + } +} + +template <class Space> +void test_1d_strided_assignment() { + test_1d_strided_assignment_impl<Kokkos::LayoutStride, Kokkos::LayoutLeft, + Space>(true, true, true, true, 17, 3); + test_1d_strided_assignment_impl<Kokkos::LayoutStride, Kokkos::LayoutRight, + Space>(true, true, true, true, 17, 3); + + test_1d_strided_assignment_impl<Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Space>(true, true, false, false, 17, 3); + test_1d_strided_assignment_impl<Kokkos::LayoutRight, Kokkos::LayoutLeft, + Space>(true, true, false, false, 17, 3); + test_1d_strided_assignment_impl<Kokkos::LayoutLeft, Kokkos::LayoutRight, + Space>(false, false, true, true, 17, 3); + 
test_1d_strided_assignment_impl<Kokkos::LayoutRight, Kokkos::LayoutRight, + Space>(false, false, true, true, 17, 3); + + test_1d_strided_assignment_impl<Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Space>(true, true, false, false, 17, 1); + test_1d_strided_assignment_impl<Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Space>(true, true, true, true, 1, 17); + test_1d_strided_assignment_impl<Kokkos::LayoutRight, Kokkos::LayoutLeft, + Space>(true, true, true, true, 1, 17); + test_1d_strided_assignment_impl<Kokkos::LayoutRight, Kokkos::LayoutLeft, + Space>(true, true, false, false, 17, 1); + + test_1d_strided_assignment_impl<Kokkos::LayoutLeft, Kokkos::LayoutRight, + Space>(true, true, true, true, 17, 1); + test_1d_strided_assignment_impl<Kokkos::LayoutLeft, Kokkos::LayoutRight, + Space>(false, false, true, true, 1, 17); + test_1d_strided_assignment_impl<Kokkos::LayoutRight, Kokkos::LayoutRight, + Space>(false, false, true, true, 1, 17); + test_1d_strided_assignment_impl<Kokkos::LayoutRight, Kokkos::LayoutRight, + Space>(true, true, true, true, 17, 1); +} + +template <class NewView, class OrigView, class... Args> +void make_subview(bool use_constructor, NewView& v, OrigView org, + Args... 
args) { + if (use_constructor) { + v = NewView(org, args...); + } else { + v = Kokkos::subview(org, args...); + } +} + +template <class Space> +void test_left_0(bool constr) { + using view_static_8_type = + Kokkos::View<int[2][3][4][5][2][3][4][5], Kokkos::LayoutLeft, Space>; + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + view_static_8_type x_static_8("x_static_left_8"); + + ASSERT_TRUE(x_static_8.span_is_contiguous()); + + Kokkos::View<int, Kokkos::LayoutLeft, Space> x0; + make_subview(constr, x0, x_static_8, 0, 0, 0, 0, 0, 0, 0, 0); + + ASSERT_TRUE(x0.span_is_contiguous()); + ASSERT_EQ(x0.span(), 1); + ASSERT_TRUE(&x0() == &x_static_8(0, 0, 0, 0, 0, 0, 0, 0)); + + Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1; + make_subview(constr, x1, x_static_8, Kokkos::pair<int, int>(0, 2), 1, 2, 3, + 0, 1, 2, 3); + + ASSERT_TRUE(x1.span_is_contiguous()); + ASSERT_EQ(x1.span(), 2); + ASSERT_TRUE(&x1(0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&x1(1) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3)); + + Kokkos::View<int*, Kokkos::LayoutLeft, Space> x_deg1; + make_subview(constr, x_deg1, x_static_8, Kokkos::pair<int, int>(0, 0), 1, 2, + 3, 0, 1, 2, 3); + + ASSERT_TRUE(x_deg1.span_is_contiguous()); + ASSERT_EQ(x_deg1.span(), 0); + ASSERT_EQ(x_deg1.data(), &x_static_8(0, 1, 2, 3, 0, 1, 2, 3)); + + Kokkos::View<int*, Kokkos::LayoutLeft, Space> x_deg2; + make_subview(constr, x_deg2, x_static_8, Kokkos::pair<int, int>(2, 2), 2, 3, + 4, 1, 2, 3, 4); + + ASSERT_TRUE(x_deg2.span_is_contiguous()); + ASSERT_EQ(x_deg2.span(), 0); + ASSERT_EQ(x_deg2.data(), x_static_8.data() + x_static_8.span()); + + Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2; + make_subview(constr, x2, x_static_8, Kokkos::pair<int, int>(0, 2), 1, 2, 3, + Kokkos::pair<int, int>(0, 2), 1, 2, 3); + + ASSERT_TRUE(!x2.span_is_contiguous()); + ASSERT_TRUE(&x2(0, 0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&x2(1, 0) == &x_static_8(1, 1, 
2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&x2(0, 1) == &x_static_8(0, 1, 2, 3, 1, 1, 2, 3)); + ASSERT_TRUE(&x2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3)); + + // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 = + Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2; + make_subview(constr, sx2, x_static_8, 1, Kokkos::pair<int, int>(0, 2), 2, 3, + Kokkos::pair<int, int>(0, 2), 1, 2, 3); + + ASSERT_TRUE(!sx2.span_is_contiguous()); + ASSERT_TRUE(&sx2(0, 0) == &x_static_8(1, 0, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&sx2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&sx2(0, 1) == &x_static_8(1, 0, 2, 3, 1, 1, 2, 3)); + ASSERT_TRUE(&sx2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3)); + + Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4; + make_subview(constr, sx4, x_static_8, 0, + Kokkos::pair<int, int>(0, 2) /* of [3] */ + , + 1, Kokkos::pair<int, int>(1, 3) /* of [5] */ + , + 1, Kokkos::pair<int, int>(0, 2) /* of [3] */ + , + 2, Kokkos::pair<int, int>(2, 4) /* of [5] */ + ); + + ASSERT_TRUE(!sx4.span_is_contiguous()); + + for (int i0 = 0; i0 < (int)sx4.extent(0); ++i0) + for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1) + for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2) + for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) { + ASSERT_TRUE(&sx4(i0, i1, i2, i3) == &x_static_8(0, 0 + i0, 1, + 1 + i1, 1, 0 + i2, + 2, 2 + i3)); + } + } +} + +template <class Space> +void test_left_0() { + test_left_0<Space>(true); + test_left_0<Space>(false); +} + +template <class Space> +void test_left_1(bool use_constr) { + using view_type = + Kokkos::View<int*** * [2][3][4][5], Kokkos::LayoutLeft, Space>; + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + view_type x8("x_left_8", 2, 3, 4, 5); + + ASSERT_TRUE(x8.span_is_contiguous()); + + Kokkos::View<int, Kokkos::LayoutLeft, Space> x0; + make_subview(use_constr, x0, x8, 0, 0, 0, 0, 0, 0, 0, 0); + + ASSERT_TRUE(x0.span_is_contiguous()); + ASSERT_TRUE(&x0() == &x8(0, 0, 
0, 0, 0, 0, 0, 0)); + + Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1; + make_subview(use_constr, x1, x8, Kokkos::pair<int, int>(0, 2), 1, 2, 3, 0, + 1, 2, 3); + + ASSERT_TRUE(x1.span_is_contiguous()); + ASSERT_TRUE(&x1(0) == &x8(0, 1, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&x1(1) == &x8(1, 1, 2, 3, 0, 1, 2, 3)); + + Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1_deg1; + make_subview(use_constr, x1_deg1, x8, Kokkos::pair<int, int>(0, 0), 1, 2, 3, + 0, 1, 2, 3); + + ASSERT_TRUE(x1_deg1.span_is_contiguous()); + ASSERT_EQ(0, x1_deg1.span()); + ASSERT_EQ(x1_deg1.data(), &x8(0, 1, 2, 3, 0, 1, 2, 3)); + + Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1_deg2; + make_subview(use_constr, x1_deg2, x8, Kokkos::pair<int, int>(2, 2), 2, 3, 4, + 1, 2, 3, 4); + + ASSERT_EQ(0, x1_deg2.span()); + ASSERT_TRUE(x1_deg2.span_is_contiguous()); + ASSERT_EQ(x1_deg2.data(), x8.data() + x8.span()); + + Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2; + make_subview(use_constr, x2, x8, Kokkos::pair<int, int>(0, 2), 1, 2, 3, + Kokkos::pair<int, int>(0, 2), 1, 2, 3); + + ASSERT_TRUE(!x2.span_is_contiguous()); + ASSERT_TRUE(&x2(0, 0) == &x8(0, 1, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&x2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&x2(0, 1) == &x8(0, 1, 2, 3, 1, 1, 2, 3)); + ASSERT_TRUE(&x2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3)); + + Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_deg2; + make_subview(use_constr, x2_deg2, x8, Kokkos::pair<int, int>(2, 2), 2, 3, 4, + 1, 2, Kokkos::pair<int, int>(2, 3), 4); + ASSERT_EQ(0, x2_deg2.span()); + + // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 = + Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2; + make_subview(use_constr, sx2, x8, 1, Kokkos::pair<int, int>(0, 2), 2, 3, + Kokkos::pair<int, int>(0, 2), 1, 2, 3); + + ASSERT_TRUE(!sx2.span_is_contiguous()); + ASSERT_TRUE(&sx2(0, 0) == &x8(1, 0, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&sx2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&sx2(0, 1) == &x8(1, 0, 2, 3, 1, 1, 2, 3)); 
+ ASSERT_TRUE(&sx2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3)); + + Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2_deg; + make_subview(use_constr, sx2_deg, x8, 1, Kokkos::pair<int, int>(0, 0), 2, 3, + Kokkos::pair<int, int>(0, 2), 1, 2, 3); + ASSERT_EQ(0, sx2_deg.span()); + + Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4; + make_subview(use_constr, sx4, x8, 0, + Kokkos::pair<int, int>(0, 2) /* of [3] */ + , + 1, Kokkos::pair<int, int>(1, 3) /* of [5] */ + , + 1, Kokkos::pair<int, int>(0, 2) /* of [3] */ + , + 2, Kokkos::pair<int, int>(2, 4) /* of [5] */ + ); + + ASSERT_TRUE(!sx4.span_is_contiguous()); + + for (int i0 = 0; i0 < (int)sx4.extent(0); ++i0) + for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1) + for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2) + for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) { + ASSERT_TRUE(&sx4(i0, i1, i2, i3) == + &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3)); + } + } +} + +template <class Space> +void test_left_1() { + test_left_1<Space>(true); + test_left_1<Space>(false); +} + +template <class Space> +void test_left_2() { + using view_type = Kokkos::View<int****, Kokkos::LayoutLeft, Space>; + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + view_type x4("x4", 2, 3, 4, 5); + + ASSERT_TRUE(x4.span_is_contiguous()); + + Kokkos::View<int, Kokkos::LayoutLeft, Space> x0 = + Kokkos::subview(x4, 0, 0, 0, 0); + + ASSERT_TRUE(x0.span_is_contiguous()); + ASSERT_TRUE(&x0() == &x4(0, 0, 0, 0)); + + Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1 = + Kokkos::subview(x4, Kokkos::pair<int, int>(0, 2), 1, 2, 3); + + ASSERT_TRUE(x1.span_is_contiguous()); + ASSERT_TRUE(&x1(0) == &x4(0, 1, 2, 3)); + ASSERT_TRUE(&x1(1) == &x4(1, 1, 2, 3)); + + Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2 = Kokkos::subview( + x4, Kokkos::pair<int, int>(0, 2), 1, Kokkos::pair<int, int>(1, 3), 2); + + ASSERT_TRUE(!x2.span_is_contiguous()); + ASSERT_TRUE(&x2(0, 0) == &x4(0, 1, 1, 2)); + ASSERT_TRUE(&x2(1, 
0) == &x4(1, 1, 1, 2)); + ASSERT_TRUE(&x2(0, 1) == &x4(0, 1, 2, 2)); + ASSERT_TRUE(&x2(1, 1) == &x4(1, 1, 2, 2)); + + // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 = + Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2 = Kokkos::subview( + x4, 1, Kokkos::pair<int, int>(0, 2), 2, Kokkos::pair<int, int>(1, 4)); + + ASSERT_TRUE(!sx2.span_is_contiguous()); + ASSERT_TRUE(&sx2(0, 0) == &x4(1, 0, 2, 1)); + ASSERT_TRUE(&sx2(1, 0) == &x4(1, 1, 2, 1)); + ASSERT_TRUE(&sx2(0, 1) == &x4(1, 0, 2, 2)); + ASSERT_TRUE(&sx2(1, 1) == &x4(1, 1, 2, 2)); + ASSERT_TRUE(&sx2(0, 2) == &x4(1, 0, 2, 3)); + ASSERT_TRUE(&sx2(1, 2) == &x4(1, 1, 2, 3)); + + Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4 = + Kokkos::subview(x4, Kokkos::pair<int, int>(1, 2) /* of [2] */ + , + Kokkos::pair<int, int>(1, 3) /* of [3] */ + , + Kokkos::pair<int, int>(0, 4) /* of [4] */ + , + Kokkos::pair<int, int>(2, 4) /* of [5] */ + ); + + ASSERT_TRUE(!sx4.span_is_contiguous()); + + for (int i0 = 0; i0 < (int)sx4.extent(0); ++i0) + for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1) + for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2) + for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) { + ASSERT_TRUE(&sx4(i0, i1, i2, i3) == + &x4(1 + i0, 1 + i1, 0 + i2, 2 + i3)); + } + } +} + +template <class Space> +void test_left_3() { + using view_type = Kokkos::View<int**, Kokkos::LayoutLeft, Space>; + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + view_type xm("x4", 10, 5); + + ASSERT_TRUE(xm.span_is_contiguous()); + + Kokkos::View<int, Kokkos::LayoutLeft, Space> x0 = Kokkos::subview(xm, 5, 3); + + ASSERT_TRUE(x0.span_is_contiguous()); + ASSERT_TRUE(&x0() == &xm(5, 3)); + + Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1 = + Kokkos::subview(xm, Kokkos::ALL, 3); + + ASSERT_TRUE(x1.span_is_contiguous()); + for (int i = 0; i < int(xm.extent(0)); ++i) { + ASSERT_TRUE(&x1(i) == &xm(i, 3)); + } + + Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2 = + 
Kokkos::subview(xm, Kokkos::pair<int, int>(1, 9), Kokkos::ALL); + + ASSERT_TRUE(!x2.span_is_contiguous()); + for (int j = 0; j < int(x2.extent(1)); ++j) + for (int i = 0; i < int(x2.extent(0)); ++i) { + ASSERT_TRUE(&x2(i, j) == &xm(1 + i, j)); + } + + Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2c = + Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(2, 4)); + + ASSERT_TRUE(x2c.span_is_contiguous()); + for (int j = 0; j < int(x2c.extent(1)); ++j) + for (int i = 0; i < int(x2c.extent(0)); ++i) { + ASSERT_TRUE(&x2c(i, j) == &xm(i, 2 + j)); + } + + Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_n1 = + Kokkos::subview(xm, std::pair<int, int>(1, 1), Kokkos::ALL); + + ASSERT_TRUE(x2_n1.extent(0) == 0); + ASSERT_TRUE(x2_n1.extent(1) == xm.extent(1)); + + Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_n2 = + Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(1, 1)); + + ASSERT_TRUE(x2_n2.extent(0) == xm.extent(0)); + ASSERT_TRUE(x2_n2.extent(1) == 0); + } +} + +//---------------------------------------------------------------------------- + +template <class Space> +void test_right_0(bool use_constr) { + using view_static_8_type = + Kokkos::View<int[2][3][4][5][2][3][4][5], Kokkos::LayoutRight, Space>; + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + view_static_8_type x_static_8("x_static_right_8"); + + Kokkos::View<int, Kokkos::LayoutRight, Space> x0; + make_subview(use_constr, x0, x_static_8, 0, 0, 0, 0, 0, 0, 0, 0); + + ASSERT_TRUE(&x0() == &x_static_8(0, 0, 0, 0, 0, 0, 0, 0)); + + Kokkos::View<int*, Kokkos::LayoutRight, Space> x1; + make_subview(use_constr, x1, x_static_8, 0, 1, 2, 3, 0, 1, 2, + Kokkos::pair<int, int>(1, 3)); + + ASSERT_TRUE(x1.extent(0) == 2); + ASSERT_TRUE(&x1(0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 1)); + ASSERT_TRUE(&x1(1) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 2)); + + Kokkos::View<int**, Kokkos::LayoutRight, Space> x2; + make_subview(use_constr, x2, x_static_8, 0, 1, 2, + 
Kokkos::pair<int, int>(1, 3), 0, 1, 2, + Kokkos::pair<int, int>(1, 3)); + + ASSERT_TRUE(x2.extent(0) == 2); + ASSERT_TRUE(x2.extent(1) == 2); + ASSERT_TRUE(&x2(0, 0) == &x_static_8(0, 1, 2, 1, 0, 1, 2, 1)); + ASSERT_TRUE(&x2(1, 0) == &x_static_8(0, 1, 2, 2, 0, 1, 2, 1)); + ASSERT_TRUE(&x2(0, 1) == &x_static_8(0, 1, 2, 1, 0, 1, 2, 2)); + ASSERT_TRUE(&x2(1, 1) == &x_static_8(0, 1, 2, 2, 0, 1, 2, 2)); + + // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 = + Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2; + make_subview(use_constr, sx2, x_static_8, 1, Kokkos::pair<int, int>(0, 2), + 2, 3, Kokkos::pair<int, int>(0, 2), 1, 2, 3); + + ASSERT_TRUE(sx2.extent(0) == 2); + ASSERT_TRUE(sx2.extent(1) == 2); + ASSERT_TRUE(&sx2(0, 0) == &x_static_8(1, 0, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&sx2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&sx2(0, 1) == &x_static_8(1, 0, 2, 3, 1, 1, 2, 3)); + ASSERT_TRUE(&sx2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3)); + + Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4; + make_subview(use_constr, sx4, x_static_8, 0, + Kokkos::pair<int, int>(0, 2) /* of [3] */ + , + 1, Kokkos::pair<int, int>(1, 3) /* of [5] */ + , + 1, Kokkos::pair<int, int>(0, 2) /* of [3] */ + , + 2, Kokkos::pair<int, int>(2, 4) /* of [5] */ + ); + + ASSERT_TRUE(sx4.extent(0) == 2); + ASSERT_TRUE(sx4.extent(1) == 2); + ASSERT_TRUE(sx4.extent(2) == 2); + ASSERT_TRUE(sx4.extent(3) == 2); + for (int i0 = 0; i0 < (int)sx4.extent(0); ++i0) + for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1) + for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2) + for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) { + ASSERT_TRUE(&sx4(i0, i1, i2, i3) == &x_static_8(0, 0 + i0, 1, + 1 + i1, 1, 0 + i2, + 2, 2 + i3)); + } + } +} + +template <class Space> +void test_right_0() { + test_right_0<Space>(true); + test_right_0<Space>(false); +} + +template <class Space> +void test_right_1(bool use_constr) { + using view_type = + Kokkos::View<int*** * [2][3][4][5], 
Kokkos::LayoutRight, Space>; + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + view_type x8("x_right_8", 2, 3, 4, 5); + + Kokkos::View<int, Kokkos::LayoutRight, Space> x0; + make_subview(use_constr, x0, x8, 0, 0, 0, 0, 0, 0, 0, 0); + + ASSERT_TRUE(&x0() == &x8(0, 0, 0, 0, 0, 0, 0, 0)); + + Kokkos::View<int*, Kokkos::LayoutRight, Space> x1; + make_subview(use_constr, x1, x8, 0, 1, 2, 3, 0, 1, 2, + Kokkos::pair<int, int>(1, 3)); + + ASSERT_TRUE(&x1(0) == &x8(0, 1, 2, 3, 0, 1, 2, 1)); + ASSERT_TRUE(&x1(1) == &x8(0, 1, 2, 3, 0, 1, 2, 2)); + + Kokkos::View<int*, Kokkos::LayoutRight, Space> x1_deg1; + make_subview(use_constr, x1_deg1, x8, 0, 1, 2, 3, 0, 1, 2, + Kokkos::pair<int, int>(3, 3)); + ASSERT_EQ(0, x1_deg1.span()); + + Kokkos::View<int**, Kokkos::LayoutRight, Space> x2; + make_subview(use_constr, x2, x8, 0, 1, 2, Kokkos::pair<int, int>(1, 3), 0, + 1, 2, Kokkos::pair<int, int>(1, 3)); + + ASSERT_TRUE(&x2(0, 0) == &x8(0, 1, 2, 1, 0, 1, 2, 1)); + ASSERT_TRUE(&x2(1, 0) == &x8(0, 1, 2, 2, 0, 1, 2, 1)); + ASSERT_TRUE(&x2(0, 1) == &x8(0, 1, 2, 1, 0, 1, 2, 2)); + ASSERT_TRUE(&x2(1, 1) == &x8(0, 1, 2, 2, 0, 1, 2, 2)); + + Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_deg2; + make_subview(use_constr, x2_deg2, x8, 0, 1, 2, Kokkos::pair<int, int>(1, 3), + 0, 1, 2, Kokkos::pair<int, int>(3, 3)); + ASSERT_EQ(0, x2_deg2.span()); + + // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 = + Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2; + make_subview(use_constr, sx2, x8, 1, Kokkos::pair<int, int>(0, 2), 2, 3, + Kokkos::pair<int, int>(0, 2), 1, 2, 3); + + ASSERT_TRUE(&sx2(0, 0) == &x8(1, 0, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&sx2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3)); + ASSERT_TRUE(&sx2(0, 1) == &x8(1, 0, 2, 3, 1, 1, 2, 3)); + ASSERT_TRUE(&sx2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3)); + + Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2_deg; + make_subview(use_constr, sx2_deg, x8, 1, Kokkos::pair<int, 
int>(0, 2), 2, 3, + 1, 1, 2, Kokkos::pair<int, int>(3, 3)); + ASSERT_EQ(0, sx2_deg.span()); + + Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4; + make_subview(use_constr, sx4, x8, 0, + Kokkos::pair<int, int>(0, 2) /* of [3] */ + , + 1, Kokkos::pair<int, int>(1, 3) /* of [5] */ + , + 1, Kokkos::pair<int, int>(0, 2) /* of [3] */ + , + 2, Kokkos::pair<int, int>(2, 4) /* of [5] */ + ); + + for (int i0 = 0; i0 < (int)sx4.extent(0); ++i0) + for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1) + for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2) + for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) { + ASSERT_TRUE(&sx4(i0, i1, i2, i3) == + &x8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3)); + } + } +} + +template <class Space> +void test_right_1() { + test_right_1<Space>(true); + test_right_1<Space>(false); +} + +template <class Space> +void test_right_3() { + using view_type = Kokkos::View<int**, Kokkos::LayoutRight, Space>; + + if (Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, typename Space::memory_space>::accessible) { + view_type xm("x4", 10, 5); + + ASSERT_TRUE(xm.span_is_contiguous()); + + Kokkos::View<int, Kokkos::LayoutRight, Space> x0 = + Kokkos::subview(xm, 5, 3); + + ASSERT_TRUE(x0.span_is_contiguous()); + ASSERT_TRUE(&x0() == &xm(5, 3)); + + Kokkos::View<int*, Kokkos::LayoutRight, Space> x1 = + Kokkos::subview(xm, 3, Kokkos::ALL); + + ASSERT_TRUE(x1.span_is_contiguous()); + for (int i = 0; i < int(xm.extent(1)); ++i) { + ASSERT_TRUE(&x1(i) == &xm(3, i)); + } + + Kokkos::View<int**, Kokkos::LayoutRight, Space> x2c = + Kokkos::subview(xm, Kokkos::pair<int, int>(1, 9), Kokkos::ALL); + + ASSERT_TRUE(x2c.span_is_contiguous()); + for (int j = 0; j < int(x2c.extent(1)); ++j) + for (int i = 0; i < int(x2c.extent(0)); ++i) { + ASSERT_TRUE(&x2c(i, j) == &xm(1 + i, j)); + } + + Kokkos::View<int**, Kokkos::LayoutRight, Space> x2 = + Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(2, 4)); + + ASSERT_TRUE(!x2.span_is_contiguous()); + for (int j = 0; j < 
int(x2.extent(1)); ++j) + for (int i = 0; i < int(x2.extent(0)); ++i) { + ASSERT_TRUE(&x2(i, j) == &xm(i, 2 + j)); + } + + Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_n1 = + Kokkos::subview(xm, std::pair<int, int>(1, 1), Kokkos::ALL); + + ASSERT_TRUE(x2_n1.extent(0) == 0); + ASSERT_TRUE(x2_n1.extent(1) == xm.extent(1)); + + Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_n2 = + Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(1, 1)); + + ASSERT_TRUE(x2_n2.extent(0) == xm.extent(0)); + ASSERT_TRUE(x2_n2.extent(1) == 0); + } +} + +namespace Impl { + +constexpr int N0 = 113; +constexpr int N1 = 11; +constexpr int N2 = 17; +constexpr int N3 = 5; +constexpr int N4 = 7; + +template <class Layout, class Space> +struct FillView_1D { + using view_t = Kokkos::View<int*, Layout, Space>; + view_t a; + using policy_t = Kokkos::RangePolicy<typename Space::execution_space>; + + FillView_1D(view_t a_) : a(a_) {} + + void run() { + Kokkos::parallel_for("FillView_1D", policy_t(0, a.extent(0)), *this); + } + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { a(i) = i; } +}; + +template <class Layout, class Space> +struct FillView_3D { + using exec_t = typename Space::execution_space; + using view_t = Kokkos::View<int***, Layout, Space>; + using rank_t = Kokkos::Rank< + view_t::Rank, + std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left + : Kokkos::Iterate::Right, + std::is_same<Layout, Kokkos::LayoutLeft>::value ? 
Kokkos::Iterate::Left + : Kokkos::Iterate::Right>; + using policy_t = Kokkos::MDRangePolicy<exec_t, rank_t>; + + view_t a; + + FillView_3D(view_t a_) : a(a_) {} + + void run() { + Kokkos::parallel_for( + "FillView_3D", + policy_t({0, 0, 0}, {a.extent(0), a.extent(1), a.extent(2)}), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i0, int i1, int i2) const { + a(i0, i1, i2) = 1000000 * i0 + 1000 * i1 + i2; + } +}; + +template <class Layout, class Space> +struct FillView_4D { + using exec_t = typename Space::execution_space; + using view_t = Kokkos::View<int****, Layout, Space>; + using rank_t = Kokkos::Rank< + view_t::Rank, + std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left + : Kokkos::Iterate::Right, + std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left + : Kokkos::Iterate::Right>; + using policy_t = Kokkos::MDRangePolicy<exec_t, rank_t>; + + view_t a; + + FillView_4D(view_t a_) : a(a_) {} + + void run() { + Kokkos::parallel_for("FillView_4D", + policy_t({0, 0, 0, 0}, {a.extent(0), a.extent(1), + a.extent(2), a.extent(3)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i0, int i1, int i2, int i3) const { + a(i0, i1, i2, i3) = 1000000 * i0 + 10000 * i1 + 100 * i2 + i3; + } +}; + +template <class Layout, class Space> +struct FillView_5D { + using exec_t = typename Space::execution_space; + using view_t = Kokkos::View<int*****, Layout, Space>; + using rank_t = Kokkos::Rank< + view_t::Rank, + std::is_same<Layout, Kokkos::LayoutLeft>::value ? Kokkos::Iterate::Left + : Kokkos::Iterate::Right, + std::is_same<Layout, Kokkos::LayoutLeft>::value ? 
Kokkos::Iterate::Left + : Kokkos::Iterate::Right>; + using policy_t = Kokkos::MDRangePolicy<exec_t, rank_t>; + + view_t a; + + FillView_5D(view_t a_) : a(a_) {} + + void run() { + Kokkos::parallel_for( + "FillView_5D", + policy_t({0, 0, 0, 0, 0}, {a.extent(0), a.extent(1), a.extent(2), + a.extent(3), a.extent(4)}), + *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i0, int i1, int i2, int i3, int i4) const { + a(i0, i1, i2, i3, i4) = 1000000 * i0 + 10000 * i1 + 100 * i2 + 10 * i3 + i4; + } +}; + +template <class View, class SubView> +struct CheckSubviewCorrectness_1D_1D { + using policy_t = Kokkos::RangePolicy<typename View::execution_space>; + View a; + SubView b; + int offset; + + CheckSubviewCorrectness_1D_1D(View a_, SubView b_, int o) + : a(a_), b(b_), offset(o) {} + + void run() { + int errors = 0; + Kokkos::parallel_reduce("CheckSubView_1D_1D", policy_t(0, b.size()), *this, + errors); + ASSERT_TRUE(errors == 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i, int& e) const { + if (a(i + offset) != b(i)) { + e++; + } + } +}; + +template <class View, class SubView> +struct CheckSubviewCorrectness_1D_2D { + using policy_t = Kokkos::RangePolicy<typename View::execution_space>; + View a; + SubView b; + int i0; + int offset; + + CheckSubviewCorrectness_1D_2D(View a_, SubView b_, int i0_, int o) + : a(a_), b(b_), i0(i0_), offset(o) {} + + void run() { + int errors = 0; + Kokkos::parallel_reduce("CheckSubView_1D_2D", policy_t(0, b.size()), *this, + errors); + ASSERT_TRUE(errors == 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i1, int& e) const { + if (a(i0, i1 + offset) != b(i1)) { + e++; + } + } +}; + +template <class View, class SubView> +struct CheckSubviewCorrectness_2D_3D { + using policy_t = Kokkos::RangePolicy<typename View::execution_space>; + using layout = typename View::array_layout; + View a; + SubView b; + int i0; + int offset_1; + int offset_2; + + CheckSubviewCorrectness_2D_3D(View a_, SubView b_, int i0_, 
int o1, int o2) + : a(a_), b(b_), i0(i0_), offset_1(o1), offset_2(o2) {} + + void run() { + int errors = 0; + Kokkos::parallel_reduce("CheckSubView_2D_3D", policy_t(0, b.size()), *this, + errors); + ASSERT_TRUE(errors == 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int& ii, int& e) const { + const int i1 = std::is_same<layout, Kokkos::LayoutLeft>::value + ? ii % b.extent(0) + : ii / b.extent(1); + + const int i2 = std::is_same<layout, Kokkos::LayoutLeft>::value + ? ii / b.extent(0) + : ii % b.extent(1); + + if (a(i0, i1 + offset_1, i2 + offset_2) != b(i1, i2)) { + e++; + } + } +}; + +template <class View, class SubView> +struct CheckSubviewCorrectness_3D_3D { + using policy_t = Kokkos::RangePolicy<typename View::execution_space>; + using layout = typename View::array_layout; + View a; + SubView b; + int offset_0; + int offset_2; + + CheckSubviewCorrectness_3D_3D(View a_, SubView b_, int o0, int o2) + : a(a_), b(b_), offset_0(o0), offset_2(o2) {} + + void run() { + int errors = 0; + Kokkos::parallel_reduce("CheckSubView_3D_3D", policy_t(0, b.size()), *this, + errors); + ASSERT_TRUE(errors == 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int& ii, int& e) const { + const int i0 = std::is_same<layout, Kokkos::LayoutLeft>::value + ? ii % b.extent(0) + : ii / (b.extent(1) * b.extent(2)); + + const int i1 = std::is_same<layout, Kokkos::LayoutLeft>::value + ? (ii / b.extent(0)) % b.extent(1) + : (ii / b.extent(2)) % b.extent(1); + + const int i2 = std::is_same<layout, Kokkos::LayoutLeft>::value + ? 
ii / (b.extent(0) * b.extent(1)) + : ii % b.extent(2); + + if (a(i0 + offset_0, i1, i2 + offset_2) != b(i0, i1, i2)) { + e++; + } + } +}; + +template <class View, class SubView> +struct CheckSubviewCorrectness_3D_4D { + using policy_t = Kokkos::RangePolicy<typename View::execution_space>; + using layout = typename View::array_layout; + View a; + SubView b; + int index; + int offset_0, offset_2; + + CheckSubviewCorrectness_3D_4D(View a_, SubView b_, int index_, int o0, int o2) + : a(a_), b(b_), index(index_), offset_0(o0), offset_2(o2) {} + + void run() { + int errors = 0; + Kokkos::parallel_reduce("CheckSubView_3D_4D", policy_t(0, b.size()), *this, + errors); + ASSERT_TRUE(errors == 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int& ii, int& e) const { + const int i = std::is_same<layout, Kokkos::LayoutLeft>::value + ? ii % b.extent(0) + : ii / (b.extent(1) * b.extent(2)); + + const int j = std::is_same<layout, Kokkos::LayoutLeft>::value + ? (ii / b.extent(0)) % b.extent(1) + : (ii / b.extent(2)) % b.extent(1); + + const int k = std::is_same<layout, Kokkos::LayoutLeft>::value + ? 
ii / (b.extent(0) * b.extent(1)) + : ii % b.extent(2); + + int i0, i1, i2, i3; + + if (std::is_same<layout, Kokkos::LayoutLeft>::value) { + i0 = i + offset_0; + i1 = j; + i2 = k + offset_2; + i3 = index; + } else { + i0 = index; + i1 = i + offset_0; + i2 = j; + i3 = k + offset_2; + } + + if (a(i0, i1, i2, i3) != b(i, j, k)) e++; + } +}; + +template <class View, class SubView> +struct CheckSubviewCorrectness_3D_5D { + using policy_t = Kokkos::RangePolicy<typename View::execution_space>; + using layout = typename View::array_layout; + View a; + SubView b; + int i0, i1; + int offset_2, offset_3, offset_4; + + CheckSubviewCorrectness_3D_5D(View a_, SubView b_, int i0_, int i1_, int o2, + int o3, int o4) + : a(a_), + b(b_), + i0(i0_), + i1(i1_), + offset_2(o2), + offset_3(o3), + offset_4(o4) {} + + void run() { + int errors = 0; + Kokkos::parallel_reduce("CheckSubView_3D_5D", policy_t(0, b.size()), *this, + errors); + ASSERT_TRUE(errors == 0); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int& ii, int& e) const { + const int i2 = std::is_same<layout, Kokkos::LayoutLeft>::value + ? ii % b.extent(0) + : ii / (b.extent(1) * b.extent(2)); + + const int i3 = std::is_same<layout, Kokkos::LayoutLeft>::value + ? (ii / b.extent(0)) % b.extent(1) + : (ii / b.extent(2)) % b.extent(1); + + const int i4 = std::is_same<layout, Kokkos::LayoutLeft>::value + ? 
ii / (b.extent(0) * b.extent(1)) + : ii % b.extent(2); + + if (a(i0, i1, i2 + offset_2, i3 + offset_3, i4 + offset_4) != + b(i2, i3, i4)) { + e++; + } + } +}; + +template <class SubView, class View> +void test_Check1D(SubView a, View b, Kokkos::pair<int, int> range) { + CheckSubviewCorrectness_1D_1D<View, SubView> check(b, a, range.first); + check.run(); +} + +template <class SubView, class View> +void test_Check1D2D(SubView a, View b, int i0, std::pair<int, int> range) { + CheckSubviewCorrectness_1D_2D<View, SubView> check(b, a, i0, range.first); + check.run(); +} + +template <class SubView, class View> +void test_Check2D3D(SubView a, View b, int i0, std::pair<int, int> range1, + std::pair<int, int> range2) { + CheckSubviewCorrectness_2D_3D<View, SubView> check(b, a, i0, range1.first, + range2.first); + check.run(); +} + +template <class SubView, class View> +void test_Check3D5D(SubView a, View b, int i0, int i1, + Kokkos::pair<int, int> range2, + Kokkos::pair<int, int> range3, + Kokkos::pair<int, int> range4) { + CheckSubviewCorrectness_3D_5D<View, SubView> check( + b, a, i0, i1, range2.first, range3.first, range4.first); + check.run(); +} + +template <class Space, class LayoutSub, class Layout, class LayoutOrg, + class MemTraits> +void test_1d_assign_impl() { + { // Breaks. + Kokkos::View<int*, LayoutOrg, Space> a_org("A", N0); + Kokkos::View<int*, LayoutOrg, Space, MemTraits> a(a_org); + Kokkos::fence(); + + Impl::FillView_1D<LayoutOrg, Space> fill(a_org); + fill.run(); + + Kokkos::View<int[N0], Layout, Space, MemTraits> a1(a); + Kokkos::fence(); + test_Check1D(a1, a, std::pair<int, int>(0, N0)); + + Kokkos::View<int[N0], LayoutSub, Space, MemTraits> a2(a1); + Kokkos::fence(); + test_Check1D(a2, a, std::pair<int, int>(0, N0)); + a1 = a; + test_Check1D(a1, a, std::pair<int, int>(0, N0)); + + // Runtime Fail expected. + // Kokkos::View< int[N1] > afail1( a ); + + // Compile Time Fail expected. + // Kokkos::View< int[N1] > afail2( a1 ); + } + + { // Works. 
+ Kokkos::View<int[N0], LayoutOrg, Space, MemTraits> a("A"); + Kokkos::View<int*, Layout, Space, MemTraits> a1(a); + Kokkos::fence(); + test_Check1D(a1, a, std::pair<int, int>(0, N0)); + a1 = a; + Kokkos::fence(); + test_Check1D(a1, a, std::pair<int, int>(0, N0)); + } +} + +template <class Space, class Type, class TypeSub, class LayoutSub, class Layout, + class LayoutOrg, class MemTraits> +void test_2d_subview_3d_impl_type() { + Kokkos::View<int***, LayoutOrg, Space> a_org("A", N0, N1, N2); + Kokkos::View<Type, Layout, Space, MemTraits> a(a_org); + + Impl::FillView_3D<LayoutOrg, Space> fill(a_org); + fill.run(); + + Kokkos::View<TypeSub, LayoutSub, Space, MemTraits> a1; + a1 = Kokkos::subview(a, 3, Kokkos::ALL, Kokkos::ALL); + Kokkos::fence(); + test_Check2D3D(a1, a, 3, std::pair<int, int>(0, N1), + std::pair<int, int>(0, N2)); + + Kokkos::View<TypeSub, LayoutSub, Space, MemTraits> a2(a, 3, Kokkos::ALL, + Kokkos::ALL); + Kokkos::fence(); + test_Check2D3D(a2, a, 3, std::pair<int, int>(0, N1), + std::pair<int, int>(0, N2)); +} + +template <class Space, class LayoutSub, class Layout, class LayoutOrg, + class MemTraits> +void test_2d_subview_3d_impl_layout() { + test_2d_subview_3d_impl_type<Space, int[N0][N1][N2], int[N1][N2], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, int[N0][N1][N2], int * [N2], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, int[N0][N1][N2], int**, LayoutSub, Layout, + LayoutOrg, MemTraits>(); + + test_2d_subview_3d_impl_type<Space, int * [N1][N2], int[N1][N2], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, int * [N1][N2], int * [N2], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, int * [N1][N2], int**, LayoutSub, Layout, + LayoutOrg, MemTraits>(); + + test_2d_subview_3d_impl_type<Space, int* * [N2], int[N1][N2], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, int* 
* [N2], int * [N2], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, int* * [N2], int**, LayoutSub, Layout, + LayoutOrg, MemTraits>(); + + test_2d_subview_3d_impl_type<Space, int***, int[N1][N2], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, int***, int * [N2], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, int***, int**, LayoutSub, Layout, + LayoutOrg, MemTraits>(); + + test_2d_subview_3d_impl_type<Space, const int[N0][N1][N2], const int[N1][N2], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, const int[N0][N1][N2], const int * [N2], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, const int[N0][N1][N2], const int**, + LayoutSub, Layout, LayoutOrg, MemTraits>(); + + test_2d_subview_3d_impl_type<Space, const int * [N1][N2], const int[N1][N2], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, const int * [N1][N2], const int * [N2], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, const int * [N1][N2], const int**, + LayoutSub, Layout, LayoutOrg, MemTraits>(); + + test_2d_subview_3d_impl_type<Space, const int* * [N2], const int[N1][N2], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, const int* * [N2], const int * [N2], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, const int* * [N2], const int**, LayoutSub, + Layout, LayoutOrg, MemTraits>(); + + test_2d_subview_3d_impl_type<Space, const int***, const int[N1][N2], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, const int***, const int * [N2], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_2d_subview_3d_impl_type<Space, const int***, const int**, LayoutSub, + Layout, LayoutOrg, MemTraits>(); +} + +template <class Space, class Type, class TypeSub, class 
LayoutSub, class Layout, + class LayoutOrg, class MemTraits> +void test_3d_subview_5d_impl_type() { + Kokkos::View<int*****, LayoutOrg, Space> a_org("A", N0, N1, N2, N3, N4); + Kokkos::View<Type, Layout, Space, MemTraits> a(a_org); + + Impl::FillView_5D<LayoutOrg, Space> fill(a_org); + fill.run(); + + Kokkos::View<TypeSub, LayoutSub, Space, MemTraits> a1; + a1 = Kokkos::subview(a, 3, 5, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL); + Kokkos::fence(); + test_Check3D5D(a1, a, 3, 5, std::pair<int, int>(0, N2), + std::pair<int, int>(0, N3), std::pair<int, int>(0, N4)); + + Kokkos::View<TypeSub, LayoutSub, Space, MemTraits> a2( + a, 3, 5, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL); + Kokkos::fence(); + test_Check3D5D(a2, a, 3, 5, std::pair<int, int>(0, N2), + std::pair<int, int>(0, N3), std::pair<int, int>(0, N4)); +} + +template <class Space, class LayoutSub, class Layout, class LayoutOrg, + class MemTraits> +void test_3d_subview_5d_impl_layout() { + test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int[N2][N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int * [N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int* * [N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4], int***, + LayoutSub, Layout, LayoutOrg, MemTraits>(); + + test_3d_subview_5d_impl_type<Space, int * [N1][N2][N3][N4], int[N2][N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int * [N1][N2][N3][N4], int * [N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int * [N1][N2][N3][N4], int* * [N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int * [N1][N2][N3][N4], int***, LayoutSub, + Layout, LayoutOrg, MemTraits>(); + + test_3d_subview_5d_impl_type<Space, int* * [N2][N3][N4], 
int[N2][N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int* * [N2][N3][N4], int * [N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int* * [N2][N3][N4], int* * [N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int* * [N2][N3][N4], int***, LayoutSub, + Layout, LayoutOrg, MemTraits>(); + + test_3d_subview_5d_impl_type<Space, int** * [N3][N4], int[N2][N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int** * [N3][N4], int * [N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int** * [N3][N4], int* * [N4], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int** * [N3][N4], int***, LayoutSub, + Layout, LayoutOrg, MemTraits>(); + + test_3d_subview_5d_impl_type<Space, int*** * [N4], int[N2][N3][N4], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int*** * [N4], int * [N3][N4], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int*** * [N4], int* * [N4], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int*** * [N4], int***, LayoutSub, Layout, + LayoutOrg, MemTraits>(); + + test_3d_subview_5d_impl_type<Space, int*****, int[N2][N3][N4], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int*****, int * [N3][N4], LayoutSub, + Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int*****, int* * [N4], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, int*****, int***, LayoutSub, Layout, + LayoutOrg, MemTraits>(); + + test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4], + const int[N2][N3][N4], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4], + const int * [N3][N4], 
LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4], + const int* * [N4], LayoutSub, Layout, LayoutOrg, + MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4], + const int***, LayoutSub, Layout, LayoutOrg, + MemTraits>(); + + test_3d_subview_5d_impl_type<Space, const int * [N1][N2][N3][N4], + const int[N2][N3][N4], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int * [N1][N2][N3][N4], + const int * [N3][N4], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int * [N1][N2][N3][N4], + const int* * [N4], LayoutSub, Layout, LayoutOrg, + MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int * [N1][N2][N3][N4], + const int***, LayoutSub, Layout, LayoutOrg, + MemTraits>(); + + test_3d_subview_5d_impl_type<Space, const int* * [N2][N3][N4], + const int[N2][N3][N4], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int* * [N2][N3][N4], + const int * [N3][N4], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int* * [N2][N3][N4], + const int* * [N4], LayoutSub, Layout, LayoutOrg, + MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int* * [N2][N3][N4], const int***, + LayoutSub, Layout, LayoutOrg, MemTraits>(); + + test_3d_subview_5d_impl_type<Space, const int** * [N3][N4], + const int[N2][N3][N4], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int** * [N3][N4], + const int * [N3][N4], LayoutSub, Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int** * [N3][N4], const int* * [N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int** * [N3][N4], const int***, + LayoutSub, Layout, LayoutOrg, MemTraits>(); + + test_3d_subview_5d_impl_type<Space, const int*** * [N4], + const int[N2][N3][N4], LayoutSub, 
Layout, + LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int*** * [N4], const int * [N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int*** * [N4], const int* * [N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int*** * [N4], const int***, + LayoutSub, Layout, LayoutOrg, MemTraits>(); + + test_3d_subview_5d_impl_type<Space, const int*****, const int[N2][N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int*****, const int * [N3][N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int*****, const int* * [N4], + LayoutSub, Layout, LayoutOrg, MemTraits>(); + test_3d_subview_5d_impl_type<Space, const int*****, const int***, LayoutSub, + Layout, LayoutOrg, MemTraits>(); +} + +inline void test_subview_legal_args_right() { + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::pair<int, int>, int, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t, int, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int, int>::value)); + ASSERT_EQ(0, 
(Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::pair<int, int>, int, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t, int, int>::value)); + + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, + 
Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int>::value)); + + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::pair<int, int>, int>::value)); + + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + 
Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t, int>::value)); + + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, 
Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, + Kokkos::Impl::ALL_t>::value)); + + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, 
(Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t>::value)); + + ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, 
(Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::pair<int, int>>::value)); +} + +inline void test_subview_legal_args_left() { + ASSERT_EQ( + 1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int>::value)); + ASSERT_EQ( + 1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int>::value)); + ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int, int>::value)); + ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::pair<int, int>, int, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, 
Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t, int, int>::value)); + + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int>::value)); + + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::pair<int, int>, int>::value)); + + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + 
Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::pair<int, int>, int>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t, int>::value)); + + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, 
(Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, + Kokkos::Impl::ALL_t>::value)); + + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::pair<int, 
int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t>::value)); + + ASSERT_EQ( + 1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, Kokkos::pair<int, int>>::value)); + ASSERT_EQ( + 1, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>>::value)); + ASSERT_EQ(1, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ( + 0, + (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, + Kokkos::pair<int, int>, Kokkos::pair<int, int>>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::Impl::ALL_t>::value)); + ASSERT_EQ(0, (Kokkos::Impl::SubviewLegalArgsCompileTime< + Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, + Kokkos::pair<int, int>, Kokkos::pair<int, int>, + Kokkos::pair<int, int>>::value)); +} + +} // namespace Impl + +template <class Space, class MemTraits = void> +void test_1d_assign() { + Impl::test_1d_assign_impl<Space, Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, MemTraits>(); + // 
Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutLeft, + // Kokkos::LayoutLeft >(); + Impl::test_1d_assign_impl<Space, Kokkos::LayoutStride, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, MemTraits>(); + // Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutRight, + // Kokkos::LayoutLeft >(); + Impl::test_1d_assign_impl<Space, Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, MemTraits>(); + Impl::test_1d_assign_impl<Space, Kokkos::LayoutStride, Kokkos::LayoutRight, + Kokkos::LayoutRight, MemTraits>(); + // Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutStride, + // Kokkos::LayoutLeft >(); Impl::test_1d_assign_impl< Space, + // Kokkos::LayoutRight, Kokkos::LayoutStride, Kokkos::LayoutLeft >(); + Impl::test_1d_assign_impl<Space, Kokkos::LayoutStride, Kokkos::LayoutStride, + Kokkos::LayoutLeft, MemTraits>(); +} + +template <class Space, class MemTraits = void> +void test_2d_subview_3d() { + Impl::test_2d_subview_3d_impl_layout<Space, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::LayoutRight, + MemTraits>(); + Impl::test_2d_subview_3d_impl_layout<Space, Kokkos::LayoutStride, + Kokkos::LayoutRight, Kokkos::LayoutRight, + MemTraits>(); + Impl::test_2d_subview_3d_impl_layout<Space, Kokkos::LayoutStride, + Kokkos::LayoutStride, + Kokkos::LayoutRight, MemTraits>(); + Impl::test_2d_subview_3d_impl_layout<Space, Kokkos::LayoutStride, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + MemTraits>(); + Impl::test_2d_subview_3d_impl_layout<Space, Kokkos::LayoutStride, + Kokkos::LayoutStride, Kokkos::LayoutLeft, + MemTraits>(); +} + +template <class Space, class MemTraits = void> +void test_3d_subview_5d_right() { + Impl::test_3d_subview_5d_impl_layout<Space, Kokkos::LayoutStride, + Kokkos::LayoutRight, Kokkos::LayoutRight, + MemTraits>(); + Impl::test_3d_subview_5d_impl_layout<Space, Kokkos::LayoutStride, + Kokkos::LayoutStride, + Kokkos::LayoutRight, MemTraits>(); +} + +template <class Space, class MemTraits = 
void> +void test_3d_subview_5d_left() { + Impl::test_3d_subview_5d_impl_layout<Space, Kokkos::LayoutStride, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + MemTraits>(); + Impl::test_3d_subview_5d_impl_layout<Space, Kokkos::LayoutStride, + Kokkos::LayoutStride, Kokkos::LayoutLeft, + MemTraits>(); +} + +template <class Space, class MemTraits = void> +void test_layoutleft_to_layoutleft() { + Impl::test_subview_legal_args_left(); + + using view3D_t = Kokkos::View<int***, Kokkos::LayoutLeft, Space>; + using view4D_t = Kokkos::View<int****, Kokkos::LayoutLeft, Space>; + { + view3D_t a("A", 100, 4, 3); + view3D_t b(a, Kokkos::pair<int, int>(16, 32), Kokkos::ALL, Kokkos::ALL); + + Impl::FillView_3D<Kokkos::LayoutLeft, Space> fill(a); + fill.run(); + + Impl::CheckSubviewCorrectness_3D_3D<view3D_t, view3D_t> check(a, b, 16, 0); + check.run(); + } + + { + view3D_t a("A", 100, 4, 5); + view3D_t b(a, Kokkos::pair<int, int>(16, 32), Kokkos::ALL, + Kokkos::pair<int, int>(1, 3)); + + Impl::FillView_3D<Kokkos::LayoutLeft, Space> fill(a); + fill.run(); + + Impl::CheckSubviewCorrectness_3D_3D<view3D_t, view3D_t> check(a, b, 16, 1); + check.run(); + } + + { + view4D_t a("A", 100, 4, 5, 3); + view3D_t b(a, Kokkos::pair<int, int>(16, 32), Kokkos::ALL, + Kokkos::pair<int, int>(1, 3), 1); + + Impl::FillView_4D<Kokkos::LayoutLeft, Space> fill(a); + fill.run(); + + Impl::CheckSubviewCorrectness_3D_4D<view4D_t, view3D_t> check(a, b, 1, 16, + 1); + check.run(); + } +} + +template <class Space, class MemTraits = void> +void test_layoutright_to_layoutright() { + Impl::test_subview_legal_args_right(); + + using view3D_t = Kokkos::View<int***, Kokkos::LayoutRight, Space>; + using view4D_t = Kokkos::View<int****, Kokkos::LayoutRight, Space>; + { + view3D_t a("A", 100, 4, 3); + view3D_t b(a, Kokkos::pair<int, int>(16, 32), Kokkos::ALL, Kokkos::ALL); + + Impl::FillView_3D<Kokkos::LayoutRight, Space> fill(a); + fill.run(); + + Impl::CheckSubviewCorrectness_3D_3D<view3D_t, view3D_t> check(a, b, 16, 0); + 
check.run(); + } + { + view4D_t a("A", 3, 4, 5, 100); + view3D_t b(a, 1, Kokkos::pair<int, int>(1, 3), Kokkos::ALL, Kokkos::ALL); + + Impl::FillView_4D<Kokkos::LayoutRight, Space> fill(a); + fill.run(); + + Impl::CheckSubviewCorrectness_3D_4D<view4D_t, view3D_t> check(a, b, 1, 1, + 0); + check.run(); + } +} +//---------------------------------------------------------------------------- + +template <class Space> +struct TestUnmanagedSubviewReset { + Kokkos::View<int****, Space> a; + + KOKKOS_INLINE_FUNCTION + void operator()(int) const noexcept { + auto sub_a = Kokkos::subview(a, 0, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL); + + for (int i = 0; i < int(a.extent(0)); ++i) { + sub_a.assign_data(&a(i, 0, 0, 0)); + if (&sub_a(1, 1, 1) != &a(i, 1, 1, 1)) { + Kokkos::abort("TestUnmanagedSubviewReset"); + } + } + } + + TestUnmanagedSubviewReset() : a(Kokkos::view_alloc(), 20, 10, 5, 2) {} +}; + +template <class Space> +void test_unmanaged_subview_reset() { + Kokkos::parallel_for( + Kokkos::RangePolicy<typename Space::execution_space>(0, 1), + TestUnmanagedSubviewReset<Space>()); +} + +//---------------------------------------------------------------------------- + +template <std::underlying_type_t<Kokkos::MemoryTraitsFlags> MTF> +struct TestSubviewMemoryTraitsConstruction { + void operator()() const noexcept { + using view_type = Kokkos::View<double*, Kokkos::HostSpace>; + using size_type = view_type::size_type; + using memory_traits_type = Kokkos::MemoryTraits<MTF>; + + view_type v("v", 7); + for (size_type i = 0; i != v.size(); ++i) v[i] = static_cast<double>(i); + + std::pair<int, int> range(3, 5); + auto sv = Kokkos::subview<memory_traits_type>(v, range); + + ASSERT_EQ(2u, sv.size()); + EXPECT_EQ(3., sv[0]); + EXPECT_EQ(4., sv[1]); + } +}; + +inline void test_subview_memory_traits_construction() { + // Test all combinations of MemoryTraits: + // Unmanaged (1) + // RandomAccess (2) + // Atomic (4) + // Restricted (8) + TestSubviewMemoryTraitsConstruction<0>()(); + 
TestSubviewMemoryTraitsConstruction<1>()(); + TestSubviewMemoryTraitsConstruction<2>()(); + TestSubviewMemoryTraitsConstruction<3>()(); + TestSubviewMemoryTraitsConstruction<4>()(); + TestSubviewMemoryTraitsConstruction<5>()(); + TestSubviewMemoryTraitsConstruction<6>()(); + TestSubviewMemoryTraitsConstruction<7>()(); + TestSubviewMemoryTraitsConstruction<8>()(); + TestSubviewMemoryTraitsConstruction<9>()(); + TestSubviewMemoryTraitsConstruction<10>()(); + TestSubviewMemoryTraitsConstruction<11>()(); + TestSubviewMemoryTraitsConstruction<12>()(); + TestSubviewMemoryTraitsConstruction<13>()(); + TestSubviewMemoryTraitsConstruction<14>()(); + TestSubviewMemoryTraitsConstruction<15>()(); +} + +//---------------------------------------------------------------------------- + +template <class T> +struct get_view_type; + +template <class T, class... Args> +struct get_view_type<Kokkos::View<T, Args...>> { + using type = T; +}; + +template <class T> +struct + ___________________________________TYPE_DISPLAY________________________________________; +#define TYPE_DISPLAY(...) 
\ + typename ___________________________________TYPE_DISPLAY________________________________________< \ + __VA_ARGS__>::type notdefined; + +template <class Space, class Layout> +struct TestSubviewStaticSizes { + Kokkos::View<int * [10][5][2], Layout, Space> a; + Kokkos::View<int[6][7][8], Layout, Space> b; + + KOKKOS_INLINE_FUNCTION + int operator()() const noexcept { + /* Doesn't actually do anything; just static assertions */ + + auto sub_a = Kokkos::subview(a, 0, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL); + typename static_expect_same< + /* expected */ int[10][5][2], + /* actual */ typename get_view_type<decltype(sub_a)>::type>::type + test_1 = 0; + + auto sub_a_2 = Kokkos::subview(a, 0, 0, Kokkos::ALL, Kokkos::ALL); + typename static_expect_same< + /* expected */ int[5][2], + /* actual */ typename get_view_type<decltype(sub_a_2)>::type>::type + test_2 = 0; + + auto sub_a_3 = Kokkos::subview(a, 0, 0, Kokkos::ALL, 0); + typename static_expect_same< + /* expected */ int[5], + /* actual */ typename get_view_type<decltype(sub_a_3)>::type>::type + test_3 = 0; + + auto sub_a_4 = Kokkos::subview(a, Kokkos::ALL, 0, Kokkos::ALL, Kokkos::ALL); + typename static_expect_same< + /* expected */ int * [5][2], + /* actual */ typename get_view_type<decltype(sub_a_4)>::type>::type + test_4 = 0; + + // TODO we'll need to update this test once we allow interleaving of static + // and dynamic + auto sub_a_5 = Kokkos::subview(a, Kokkos::ALL, 0, Kokkos::ALL, + Kokkos::make_pair(0, 1)); + typename static_expect_same< + /* expected */ int***, + /* actual */ typename get_view_type<decltype(sub_a_5)>::type>::type + test_5 = 0; + + auto sub_a_sub = Kokkos::subview(sub_a_5, 0, Kokkos::ALL, 0); + typename static_expect_same< + /* expected */ int*, + /* actual */ typename get_view_type<decltype(sub_a_sub)>::type>::type + test_sub = 0; + + auto sub_a_7 = Kokkos::subview(a, Kokkos::ALL, 0, Kokkos::make_pair(0, 1), + Kokkos::ALL); + typename static_expect_same< + /* expected */ int* * [2], + /* 
actual */ typename get_view_type<decltype(sub_a_7)>::type>::type + test_7 = 0; + + auto sub_a_8 = + Kokkos::subview(a, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL); + typename static_expect_same< + /* expected */ int * [10][5][2], + /* actual */ typename get_view_type<decltype(sub_a_8)>::type>::type + test_8 = 0; + + auto sub_b = Kokkos::subview(b, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL); + typename static_expect_same< + /* expected */ int[6][7][8], + /* actual */ typename get_view_type<decltype(sub_b)>::type>::type + test_9 = 0; + + auto sub_b_2 = Kokkos::subview(b, 0, Kokkos::ALL, Kokkos::ALL); + typename static_expect_same< + /* expected */ int[7][8], + /* actual */ typename get_view_type<decltype(sub_b_2)>::type>::type + test_10 = 0; + + auto sub_b_3 = + Kokkos::subview(b, Kokkos::make_pair(2, 3), Kokkos::ALL, Kokkos::ALL); + typename static_expect_same< + /* expected */ int * [7][8], + /* actual */ typename get_view_type<decltype(sub_b_3)>::type>::type + test_11 = 0; + + return test_1 + test_2 + test_3 + test_4 + test_5 + test_sub + test_7 + + test_8 + test_9 + test_10 + test_11; + } + + TestSubviewStaticSizes() : a(Kokkos::view_alloc("a"), 20), b("b") {} +}; + +template <class Space> +struct TestExtentsStaticTests { + using test1 = typename static_expect_same< + /* expected */ + Kokkos::Experimental::Extents<Kokkos::Experimental::dynamic_extent, + Kokkos::Experimental::dynamic_extent, 1, 2, + 3>, + /* actual */ + typename Kokkos::Impl::ParseViewExtents<double* * [1][2][3]>::type>::type; + + using test2 = typename static_expect_same< + /* expected */ + Kokkos::Experimental::Extents<1, 2, 3>, + /* actual */ + typename Kokkos::Impl::ParseViewExtents<double[1][2][3]>::type>::type; + + using test3 = typename static_expect_same< + /* expected */ + Kokkos::Experimental::Extents<3>, + /* actual */ + typename Kokkos::Impl::ParseViewExtents<double[3]>::type>::type; + + using test4 = typename static_expect_same< + /* expected */ + Kokkos::Experimental::Extents<>, 
+ /* actual */ + typename Kokkos::Impl::ParseViewExtents<double>::type>::type; +}; + +} // namespace TestViewSubview + +#endif diff --git a/packages/kokkos/core/unit_test/TestView_64bit.hpp b/packages/kokkos/core/unit_test/TestView_64bit.hpp new file mode 100644 index 0000000000000000000000000000000000000000..50626718b5774ddefa03a453402564986e831ed1 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestView_64bit.hpp @@ -0,0 +1,139 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class Device> +void test_64bit() { +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + // FIXME_SYCL The SYCL CUDA backend throws an error +#ifdef KOKKOS_ENABLE_SYCL + int64_t N = 1000000000; +#else + int64_t N = 5000000000; +#endif + int64_t sum = 0; + { + Kokkos::parallel_reduce( + Kokkos::RangePolicy<typename Device::execution_space, + Kokkos::IndexType<int64_t>>(0, N), + KOKKOS_LAMBDA(const int64_t& /*i*/, int64_t& lsum) { lsum += 1; }, sum); + ASSERT_EQ(N, sum); + } + { + Kokkos::View<char*, Device> a("A", N); + Kokkos::deep_copy(a, char(1)); + Kokkos::parallel_reduce( + Kokkos::RangePolicy<typename Device::execution_space, + Kokkos::IndexType<int64_t>>(0, N), + KOKKOS_LAMBDA(const int64_t& i, int64_t& lsum) { + lsum += int64_t(a(i)); + }, + sum); + ASSERT_EQ(N, sum); + Kokkos::parallel_for( + Kokkos::RangePolicy<typename Device::execution_space, + Kokkos::IndexType<int64_t>>(0, N), + KOKKOS_LAMBDA(const int64_t& i) { a(i) = 3; }); + Kokkos::parallel_reduce( + Kokkos::RangePolicy<typename Device::execution_space, + Kokkos::IndexType<int64_t>>(0, N), + KOKKOS_LAMBDA(const int64_t& i, int64_t& lsum) { + lsum += int64_t(a(i)); + }, + sum); + ASSERT_EQ(N * 3, sum); + } + { + int64_t N0 = 56925; + int64_t N1 = 56927; + + 
Kokkos::View<char**, Device> m("Matrix", N0, N1); + Kokkos::deep_copy(m, char(1)); + Kokkos::parallel_reduce( + Kokkos::RangePolicy<typename Device::execution_space, + Kokkos::IndexType<int64_t>>(0, N0 * N1), + KOKKOS_LAMBDA(const int64_t& i, int64_t& lsum) { + lsum += int64_t(m(i % N0, i / N0)); + }, + sum); + ASSERT_EQ(N0 * N1, sum); + Kokkos::parallel_reduce( + Kokkos::MDRangePolicy<typename Device::execution_space, Kokkos::Rank<2>, + Kokkos::IndexType<int64_t>>({0, 0}, {N0, N1}), + KOKKOS_LAMBDA(const int64_t& i0, const int64_t& i1, int64_t& lsum) { + lsum += int64_t(m(i0, i1)); + }, + sum); + ASSERT_EQ(N0 * N1, sum); + } + { + int N0 = 1024 * 1024 * 1500; + int64_t P = 1713091; + Kokkos::View<int*, Device> a("A", N0); + Kokkos::parallel_for( + "FillA", + Kokkos::RangePolicy<typename Device::execution_space, + Kokkos::IndexType<int>>(0, N0), + KOKKOS_LAMBDA(const int& i) { a(i) = i % P; }); + int64_t sum0 = 0; + Kokkos::parallel_reduce( + "FillA", + Kokkos::RangePolicy<typename Device::execution_space, + Kokkos::IndexType<int>>(0, N0), + KOKKOS_LAMBDA(const int& i, int64_t& lsum) { lsum += a(i); }, sum0); + int64_t expected = + (P * (P - 1) / 2) * int64_t(N0 / P) + (N0 % P) * (N0 % P - 1) / 2; + ASSERT_EQ(expected, sum0); + } +#endif +} + +#ifdef KOKKOS_ENABLE_LARGE_MEM_TESTS +TEST(TEST_CATEGORY, view_64bit) { test_64bit<TEST_EXECSPACE>(); } +#endif + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestWorkGraph.hpp b/packages/kokkos/core/unit_test/TestWorkGraph.hpp new file mode 100644 index 0000000000000000000000000000000000000000..472af3a0444a3d78a9361640399967535d15d704 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestWorkGraph.hpp @@ -0,0 +1,176 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <vector> +#include <iostream> + +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace { + +/* This test is meant to be the WorkGraph equivalent of the Task DAG Scheduler + test, please see TestTaskScheduler.hpp for that test. 
The algorithm computes + the N-th fibonacci number as follows: + - Each "task" or "work item" computes the i-th fibonacci number + - If a task has (i < 2), it will record the known answer ahead of time. + - If a task has (i >= 2), it will "spawn" two more tasks to compute + the (i - 1) and (i - 2) fibonacci numbers. + We do NOT do any de-duplication of these tasks. + De-duplication would result in only (N - 2) tasks which must be run in + serial. We allow duplicates both to increase the number of tasks and to + increase the amount of available parallelism. + */ + +template <class ExecSpace> +struct TestWorkGraph { + using MemorySpace = typename ExecSpace::memory_space; + using Policy = Kokkos::WorkGraphPolicy<std::int32_t, ExecSpace>; + using Graph = typename Policy::graph_type; + using RowMap = typename Graph::row_map_type; + using Entries = typename Graph::entries_type; + using Values = Kokkos::View<long*, MemorySpace>; + + long m_input; + Graph m_graph; + Graph m_transpose; + Values m_values; + + TestWorkGraph(long arg_input) : m_input(arg_input) { + form_graph(); + transpose_crs(m_transpose, m_graph); + } + + inline long full_fibonacci(long n) { + constexpr long mask = 0x03; + long fib[4] = {0, 1, 1, 2}; + for (long i = 2; i <= n; ++i) { + fib[i & mask] = fib[(i - 1) & mask] + fib[(i - 2) & mask]; + } + return fib[n & mask]; + } + + struct HostEntry { + long input; + std::int32_t parent; + }; + std::vector<HostEntry> form_host_graph() { + std::vector<HostEntry> g; + g.push_back({m_input, -1}); + for (std::int32_t i = 0; i < std::int32_t(g.size()); ++i) { + auto e = g.at(std::size_t(i)); + if (e.input < 2) continue; + /* This part of the host graph formation is the equivalent of task + spawning in the Task DAG system. 
Notice how each task which is not a + base case spawns two more tasks, without any de-duplication */ + g.push_back({e.input - 1, i}); + g.push_back({e.input - 2, i}); + } + return g; + } + + void form_graph() { + auto hg = form_host_graph(); + m_graph.row_map = + RowMap("row_map", hg.size() + 1); // row map always has one more + m_graph.entries = + Entries("entries", hg.size() - 1); // all but the first have a parent + m_values = Values("values", hg.size()); + // printf("%zu work items\n", hg.size()); + auto h_row_map = Kokkos::create_mirror_view(m_graph.row_map); + auto h_entries = Kokkos::create_mirror_view(m_graph.entries); + auto h_values = Kokkos::create_mirror_view(m_values); + h_row_map(0) = 0; + for (std::int32_t i = 0; i < std::int32_t(hg.size()); ++i) { + auto& e = hg.at(std::size_t(i)); + h_row_map(i + 1) = i; + if (e.input < 2) { + h_values(i) = e.input; + } + if (e.parent == -1) continue; + h_entries(i - 1) = e.parent; + } + Kokkos::deep_copy(m_graph.row_map, h_row_map); + Kokkos::deep_copy(m_graph.entries, h_entries); + Kokkos::deep_copy(m_values, h_values); + } + + KOKKOS_INLINE_FUNCTION + void operator()(std::int32_t i) const { + auto begin = m_transpose.row_map(i); + auto end = m_transpose.row_map(i + 1); + for (auto j = begin; j < end; ++j) { + auto k = m_transpose.entries(j); + m_values(i) += m_values(k); + } + } + + void test_for() { + Kokkos::parallel_for(Policy(m_graph), *this); + Kokkos::fence(); + auto h_values = Kokkos::create_mirror_view(m_values); + Kokkos::deep_copy(h_values, m_values); + ASSERT_EQ(h_values(0), full_fibonacci(m_input)); + } +}; + +} // anonymous namespace + +TEST(TEST_CATEGORY, workgraph_fib) { + // FIXME_HIP The test is very slow with HIP and it causes the CI to timeout +#ifdef KOKKOS_ENABLE_HIP + int limit = 7; +#else + int limit = 27; +#endif + for (int i = 0; i < limit; ++i) { + TestWorkGraph<TEST_EXECSPACE> f(i); + f.test_for(); + } + // TestWorkGraph< TEST_EXECSPACE > f(2); + // f.test_for(); +} + +} // namespace 
Test diff --git a/packages/kokkos/core/unit_test/Test_InterOp_Streams.hpp b/packages/kokkos/core/unit_test/Test_InterOp_Streams.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6af731b9fa3e037598123add65071c1efa341187 --- /dev/null +++ b/packages/kokkos/core/unit_test/Test_InterOp_Streams.hpp @@ -0,0 +1,146 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +namespace Test { + +#ifndef KOKKOS_ENABLE_SYCL +__global__ void offset_streams(int* p) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < 100) { + p[idx] += idx; + } +} +#endif + +template <typename MemorySpace> +struct FunctorRange { + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> a; + FunctorRange( + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + a_) + : a(a_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { a(i) += 1; } +}; + +template <typename MemorySpace> +struct FunctorRangeReduce { + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> a; + FunctorRangeReduce( + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + a_) + : a(a_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, int& lsum) const { lsum += a(i); } +}; + +template <typename MemorySpace> +struct FunctorMDRange { + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> a; + FunctorMDRange( + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + a_) + : a(a_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j) const { a(i * 10 + j) += 1; } +}; + +template <typename MemorySpace> 
+struct FunctorMDRangeReduce { + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> a; + FunctorMDRangeReduce( + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + a_) + : a(a_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, int& lsum) const { + lsum += a(i * 10 + j); + } +}; + +template <typename MemorySpace, typename ExecutionSpace> +struct FunctorTeam { + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> a; + FunctorTeam( + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + a_) + : a(a_) {} + + KOKKOS_INLINE_FUNCTION + void operator()( + typename Kokkos::TeamPolicy<ExecutionSpace>::member_type const& team) + const { + int i = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 10), + [&](const int j) { a(i * 10 + j) += 1; }); + } +}; + +template <typename MemorySpace, typename ExecutionSpace> +struct FunctorTeamReduce { + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> a; + FunctorTeamReduce( + Kokkos::View<int*, MemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> + a_) + : a(a_) {} + + KOKKOS_INLINE_FUNCTION + void operator()( + typename Kokkos::TeamPolicy<ExecutionSpace>::member_type const& team, + int& lsum) const { + int i = team.league_rank(); + int team_sum; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 10), + [&](const int j, int& tsum) { tsum += a(i * 10 + j); }, team_sum); + Kokkos::single(Kokkos::PerTeam(team), [&]() { lsum += team_sum; }); + } +}; +} // namespace Test diff --git a/packages/kokkos/core/unit_test/UnitTestMain.cpp b/packages/kokkos/core/unit_test/UnitTestMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a70409105f0b43a9fe07c102b626dce7cf803410 --- /dev/null +++ b/packages/kokkos/core/unit_test/UnitTestMain.cpp @@ -0,0 +1,51 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos 
v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include <cstdlib> + +int main(int argc, char *argv[]) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/packages/kokkos/core/unit_test/UnitTestMainInit.cpp b/packages/kokkos/core/unit_test/UnitTestMainInit.cpp new file mode 100644 index 0000000000000000000000000000000000000000..140ba418fdac4eda95ed362a9b5bf64e50676cc2 --- /dev/null +++ b/packages/kokkos/core/unit_test/UnitTestMainInit.cpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include <cstdlib> + +#include <Kokkos_Core.hpp> + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + ::testing::InitGoogleTest(&argc, argv); + + int result = RUN_ALL_TESTS(); + Kokkos::finalize(); + return result; +} diff --git a/packages/kokkos/core/unit_test/UnitTest_CMakePassCmdLineArgs.cpp b/packages/kokkos/core/unit_test/UnitTest_CMakePassCmdLineArgs.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7525f8d2b4b939702b9b361b107b12e57826cb65 --- /dev/null +++ b/packages/kokkos/core/unit_test/UnitTest_CMakePassCmdLineArgs.cpp @@ -0,0 +1,11 @@ +#include <string> + +struct Up {}; + +int main(int argc, char* argv[]) { + if (argc != 4 || std::string(argv[1]) != "one" || + std::string(argv[2]) != "2" || std::string(argv[3]) != "THREE") { + throw Up{}; + } + return 0; +} diff --git a/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook.cpp b/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1b99691e6d13dc23c0cc94f653cd6b91c069c82d --- /dev/null +++ b/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook.cpp @@ -0,0 +1,130 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos 
v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdlib> +#include <exception> +#include <iostream> +#include <sstream> +#include <Kokkos_Core.hpp> + +namespace { // (anonymous) + +// Output for the finalize hooks. Use this to make sure that all the +// hooks ran, and that they ran in the correct order. +std::ostringstream hookOutput; + +const char hook1str[] = "Behold, I am Hook 1; first pushed, last to be called."; +const char hook2str[] = "Yea verily, I am Hook 2."; +const char hook3str[] = "Indeed, I am Hook 3."; +const char hook4str[] = "Last but not least, I am Hook 4."; + +} // namespace + +// Don't just have all the hooks print the same thing except for a +// number. Have them print different things, so we can detect +// interleaving. The hooks need to run sequentially, in LIFO order. +// Also, make sure that the function accepts at least the following +// kinds of hooks: +// +// 1. A plain old function that takes no arguments and returns nothing. +// 2. Lambda, that can be assigned to std::function<void()> +// 3. An actual std::function<void()> +// 4. A named object with operator(). This is what C++ programmers +// unfortunately like to call "functor," even though this word +// means something different in other languages. 
+ +void hook1() { hookOutput << hook1str << std::endl; } + +struct Hook4 { + void operator()() const { hookOutput << hook4str << std::endl; } +}; + +int main(int argc, char* argv[]) { + using std::cout; + using std::endl; + + const std::string expectedOutput([] { + std::ostringstream os; + os << hook4str << endl + << hook3str << endl + << hook2str << endl + << hook1str << endl; + return os.str(); + }()); + + Kokkos::initialize(argc, argv); + + Kokkos::push_finalize_hook(hook1); // plain old function + Kokkos::push_finalize_hook([] { hookOutput << hook2str << endl; }); // lambda + std::function<void()> hook3 = [] { hookOutput << hook3str << endl; }; + Kokkos::push_finalize_hook(hook3); // actual std::function + Hook4 hook4; + Kokkos::push_finalize_hook(hook4); // function object instance + + // This should invoke the finalize hooks in reverse order. + // Furthermore, it should not throw an exception. + try { + Kokkos::finalize(); + } catch (std::exception& e) { + cout << "FAILED: Kokkos::finalize threw an exception: " << e.what() << endl; + return EXIT_FAILURE; + } catch (...) { + cout << "FAILED: Kokkos::finalize threw an exception whose base class " + "is not std::exception." + << endl; + return EXIT_FAILURE; + } + + const bool success = (hookOutput.str() == expectedOutput); + if (success) { + cout << "SUCCESS" << endl; + } else { + cout << "FAILED:" << endl + << " Expected output:" << endl + << expectedOutput << endl + << " Actual output:" << endl + << hookOutput.str() << endl; + } + return success ? 
EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook_terminate.cpp b/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook_terminate.cpp new file mode 100644 index 0000000000000000000000000000000000000000..442310542be36bf40c495336d287d37812c9b83c --- /dev/null +++ b/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook_terminate.cpp @@ -0,0 +1,85 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdlib> +#include <iostream> +#include <exception> +#include <Kokkos_Core.hpp> + +// If any of the finalize hooks given to Kokkos::push_finalize_hook +// throws but does not catch an exception, make sure that +// Kokkos::finalize calls std::terminate. + +namespace { // (anonymous) + +// If you change this, change CMakeLists.txt in this directory too! +// I verified that changing this string makes the test fail. +const char my_terminate_str[] = + "PASSED: I am the custom std::terminate handler."; + +// Tell compilers not to complain that this function doesn't return. +[[noreturn]] void my_terminate_handler() { + std::cerr << my_terminate_str << std::endl; + std::abort(); // terminate handlers normally would end by calling this +} + +} // namespace + +int main(int argc, char *argv[]) { + // If std::terminate is called, it will call my_terminate_handler. + std::set_terminate(my_terminate_handler); + + Kokkos::initialize(argc, argv); + Kokkos::push_finalize_hook( + [] { throw std::runtime_error("I am an uncaught exception!"); }); + + // This should call std::terminate, which in turn will call + // my_terminate_handler above. That will print the message that + // makes this test count as passed. + Kokkos::finalize(); + + // The test actually failed if we got to this point. 
+ std::cerr << "FAILED to call std::terminate!" << std::endl; + return EXIT_FAILURE; +} diff --git a/packages/kokkos/core/unit_test/category_files/TestCudaHostPinned_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestCudaHostPinned_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3f8ec6bacf209f9fe3fc0739170b359dfb986f36 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestCudaHostPinned_Category.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_CUDAHOSTPINNED_HPP +#define KOKKOS_TEST_CUDAHOSTPINNED_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY cuda_hostpinned +#define TEST_CATEGORY_DEATH cuda_hostpinned_DeathTest +//#define TEST_EXECSPACE +// Kokkos::Device<Kokkos::Cuda,Kokkos::CudaHostPinnedSpace> +#define TEST_EXECSPACE Kokkos::CudaHostPinnedSpace + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestCudaUVM_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestCudaUVM_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ff53e5a719a7274e8d38b93259286e14bc44d27a --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestCudaUVM_Category.hpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_CUDAUVM_HPP +#define KOKKOS_TEST_CUDAUVM_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY cuda_uvm +#define TEST_CATEGORY_DEATH cuda_uvm_DeathTest +#define TEST_EXECSPACE Kokkos::CudaUVMSpace + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestCuda_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestCuda_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..22666dc82fab611ee08aa7555e9b56ae0b2f148a --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestCuda_Category.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_CUDA_HPP +#define KOKKOS_TEST_CUDA_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY cuda +#define TEST_CATEGORY_NUMBER 5 +#define TEST_CATEGORY_DEATH cuda_DeathTest +#define TEST_EXECSPACE Kokkos::Cuda +#define TEST_CATEGORY_FIXTURE(name) cuda_##name + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestDefaultDeviceType_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestDefaultDeviceType_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..57a0e0ee006460a59c71d5fbccfbd87c2c754653 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestDefaultDeviceType_Category.hpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_DEFAULTDEVICETYPE_HPP +#define KOKKOS_TEST_DEFAULTDEVICETYPE_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY defaultdevicetype +#define TEST_CATEGORY_DEATH defaultdevicetype_DeathTest +#define TEST_EXECSPACE Kokkos::DefaultExecutionSpace + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestHIPHostPinned_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHIPHostPinned_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..12c69926c7bfc10ec7fef02d9e96c39691c557d6 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestHIPHostPinned_Category.hpp @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_HIPHOSTPINNED_HPP +#define KOKKOS_TEST_HIPHOSTPINNED_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY hip_hostpinned +#define TEST_EXECSPACE Kokkos::Experimental::HIPHostPinnedSpace + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0a9fe5a08f1167a6c407e9866e8e6ad130053986 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestHIP_Category.hpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_HIP_HPP +#define KOKKOS_TEST_HIP_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY hip +#define TEST_CATEGORY_NUMBER 6 +#define TEST_EXECSPACE Kokkos::Experimental::HIP + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestHPX_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestHPX_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..401794c43177a4304817318bb6ed88ce1ff1dd88 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestHPX_Category.hpp @@ -0,0 +1,55 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_HPX_HPP +#define KOKKOS_TEST_HPX_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY hpx +#define TEST_CATEGORY_NUMBER 3 +#define TEST_CATEGORY_DEATH hpx_DeathTest +#define TEST_EXECSPACE Kokkos::Experimental::HPX + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4d3d14e245099e11986d7fdcb83e5cb4289c5d23 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp @@ -0,0 +1,55 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_OMPTARGET_HPP +#define KOKKOS_TEST_OMPTARGET_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY openmptarget +#define TEST_CATEGORY_NUMBER 4 +#define TEST_CATEGORY_DEATH openmptarget_DeathTest +#define TEST_EXECSPACE Kokkos::Experimental::OpenMPTarget + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestOpenMP_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestOpenMP_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..98b8b9f515d128ca3fb9fd239f7d3b5ab5131e0b --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestOpenMP_Category.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_OMP_HPP +#define KOKKOS_TEST_OMP_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY openmp +#define TEST_CATEGORY_NUMBER 2 +#define TEST_CATEGORY_DEATH openmp_DeathTest +#define TEST_EXECSPACE Kokkos::OpenMP +#define TEST_CATEGORY_FIXTURE(name) openmp_##name + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1ec89fc61a594989f58b5076af6477be051183e8 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SYCL_SHARED_USM_SPACE_HPP +#define KOKKOS_TEST_SYCL_SHARED_USM_SPACE_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY sycl_shared_usm +#define TEST_EXECSPACE Kokkos::Experimental::SYCLSharedUSMSpace + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..345f40d1c39f403dd62369c8cfa668ed1c75a951 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestSYCL_Category.hpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SYCL_HPP +#define KOKKOS_TEST_SYCL_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY sycl +#define TEST_CATEGORY_NUMBER 7 +#define TEST_EXECSPACE Kokkos::Experimental::SYCL + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestSerial_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSerial_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b2e0f96f446bd9c67c57f4dd0c7429f133f67bf8 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestSerial_Category.hpp @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SERIAL_HPP +#define KOKKOS_TEST_SERIAL_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY serial +#define TEST_CATEGORY_NUMBER 0 +#define TEST_CATEGORY_DEATH serial_DeathTest +#define TEST_EXECSPACE Kokkos::Serial +#define TEST_CATEGORY_FIXTURE(name) serial_##name + +#endif diff --git a/packages/kokkos/core/unit_test/category_files/TestThreads_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestThreads_Category.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b7ca7c826b090649e9e49cc26b3a1bf8b9b45894 --- /dev/null +++ b/packages/kokkos/core/unit_test/category_files/TestThreads_Category.hpp @@ -0,0 +1,55 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_THREADS_HPP +#define KOKKOS_TEST_THREADS_HPP + +#include <gtest/gtest.h> + +#define TEST_CATEGORY threads +#define TEST_CATEGORY_NUMBER 1 +#define TEST_CATEGORY_DEATH threads_DeathTest +#define TEST_EXECSPACE Kokkos::Threads + +#endif diff --git a/packages/kokkos/core/unit_test/configuration/test-code/CMakeLists.txt b/packages/kokkos/core/unit_test/configuration/test-code/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..759d59a07dd9e9268fb7d818b4dc50f564845805 --- /dev/null +++ b/packages/kokkos/core/unit_test/configuration/test-code/CMakeLists.txt @@ -0,0 +1,44 @@ +# Kokkos requires CMake version 3.1 or higher and that you have the following +# line with a version of 3.1 or higher as the first line of your project: +# cmake_minimum_required(VERSION 3.1) +# +# The other CMake commands required to build Kokkos as part of your application +# are: +# add_subdirectory(path/to/kokkos) +# target_link_libraries(executable or library) +# +# If Kokkos is not a subdirectory of your project, you will also need to 
pass a +# binary directory to add_subdirectory(). We had to pass the binary directory +# for this example for that reason. Note that target_link_libraries() can be +# called on a target added by add_executable(), add_library(), or another +# similar command. +# +# All the flags, etc. required to build using the Kokkos library are +# transitively added to targets which depend on the library. +# +# The CMake variables CMAKE_CXX_STANDARD and CMAKE_CXX_EXTENSIONS are +# respected. We recommend that you set CMAKE_CXX_EXTENSIONS to OFF. +# Otherwise, CMake defaults to using extensions for the C++ standard, and the +# GNU extensions (-std=gnu++14) will be used for compilers that support it +# instead of standard C++14 (-std=c++14). +# +# A bunch of build options are added as variables (all starting with KOKKOS_) +# to the build. Check them out using ccmake or the CMake GUI. +# +# Building this example: +# 1. Create a build directory. +# 2. cd /path/to/build/directory +# 3. cmake /path/to/example +# 4. make + +cmake_minimum_required(VERSION 3.10) +project(Example CXX C Fortran) + +list(APPEND CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -O3) + +add_subdirectory(${Example_SOURCE_DIR}/../../../.. ${Example_BINARY_DIR}/kokkos) + +include_directories(${Kokkos_INCLUDE_DIRS_RET}) + +add_executable(test_cmake.exe main.cpp) +target_link_libraries(test_cmake.exe kokkos) diff --git a/packages/kokkos/core/unit_test/configuration/test-code/Makefile b/packages/kokkos/core/unit_test/configuration/test-code/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8ca5f34990b54700e7a29d4197d7f150f1ff0a81 --- /dev/null +++ b/packages/kokkos/core/unit_test/configuration/test-code/Makefile @@ -0,0 +1,46 @@ +KOKKOS_DEVICES=Serial +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Kepler35" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../../../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +CXX = g++ +EXE = test_config.exe + +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/core/unit_test/configuration/test-code/main.cpp b/packages/kokkos/core/unit_test/configuration/test-code/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d2f6de3a9f6cc70bf9d6ebb83d46780405a2b89 --- /dev/null +++ b/packages/kokkos/core/unit_test/configuration/test-code/main.cpp @@ -0,0 +1,6 @@ +#include <Kokkos_Core.hpp> + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + Kokkos::finalize(); +} diff --git a/packages/kokkos/core/unit_test/configuration/test-code/test_config.bash b/packages/kokkos/core/unit_test/configuration/test-code/test_config.bash new file mode 100755 index 0000000000000000000000000000000000000000..0edb1139333372796fb45d93f5a365563830ad52 --- /dev/null +++ b/packages/kokkos/core/unit_test/configuration/test-code/test_config.bash @@ -0,0 +1,7 @@ + +mkdir -p gnu-make +mkdir -p cmake +export KOKKOS_PATH=$1 +KOKKOS_PATH=$1 +${KOKKOS_PATH}/core/unit_test/configuration/test-code/test_config_device_list.bash + diff --git a/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash new file mode 100755 index 
0000000000000000000000000000000000000000..5ff781b96fc0949361329a61baa4f966f6b8a93a --- /dev/null +++ b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -0,0 +1,45 @@ + +# List of parallel device types +HostArch=(SNB HSW SKX KNL) +DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) +if [ ! -z "$KOKKOS_HOST_ARCH_TEST" ]; then + export KOKKOS_ARCH_TEST=1 + HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 ARMv8_ThunderX ARMv8_ThunderX2) + DeviceArch=() +fi + +if [ ! -z "$KOKKOS_DEVICE_ARCH_TEST" ]; then + export KOKKOS_ARCH_TEST=1 + HostArch=(SNB) + DeviceArch=(Kepler30 Kepler32 Kepler35 Kepler37 Maxwell50 Maxwell52 Maxwell53 Pascal60 Pascal61 Volta70 Volta72) +fi + +MakeDevices=$1 +CMakeDevices=$2 + +SRC_DIR=${KOKKOS_PATH}/core/unit_test/configuration/test-code + +for harch in "${HostArch[@]}" +do + harch_up=`echo $harch | tr a-z A-Z` + CMAKE_HARCH="-DKokkos_ARCH_${harch_up}=ON" + + if [ "$harch" == "ARMv8_ThunderX2" ]; then + harch="ARMv8-TX2" + elif [ "$harch" == "ARMv8_ThunderX" ]; then + harch="ARMv8-ThunderX" + fi + + if [ ! 
-z "$DeviceArch" ] + then + for darch in "${DeviceArch[@]}" + do + darch_up=`echo $darch | tr a-z A-Z` + CMAKE_DARCH="-DKokkos_ARCH_${darch_up}=ON" + ${SRC_DIR}/test_config_options_list.bash "$MakeDevices" "$CMakeDevices" "$harch,$darch" "${CMAKE_HARCH} ${CMAKE_DARCH}" + done + else + ${SRC_DIR}/test_config_options_list.bash "$MakeDevices" "$CMakeDevices" "$harch" "${CMAKE_HARCH}" + fi +done + diff --git a/packages/kokkos/core/unit_test/configuration/test-code/test_config_device_list.bash b/packages/kokkos/core/unit_test/configuration/test-code/test_config_device_list.bash new file mode 100755 index 0000000000000000000000000000000000000000..e96f567fe291d6a87a609759c76ebce63ddd726a --- /dev/null +++ b/packages/kokkos/core/unit_test/configuration/test-code/test_config_device_list.bash @@ -0,0 +1,45 @@ + +SRC_DIR=${KOKKOS_PATH}/core/unit_test/configuration/test-code +# List of parallel device types +HostPDevices=(OpenMP Threads) +if [ ! -z "$KOKKOS_ARCH_TEST" ]; then + HostPDevices=(OpenMP) +fi + +if [ ! -z "$HPX_ROOT" ] +then + HostPDevices=(${HostPDevices[@]} HPX) +fi + +if [ ! -z "$CUDA_ROOT" ] +then + AccDevices=(${AccDevices[@]} Cuda) + export CXX=${KOKKOS_PATH}/bin/nvcc_wrapper +fi +if [ ! -z "$HIP_ROOT" ] +then + AccDevices=(${AccDevices[@]} HIP) +fi + +for hpdevice in "${HostPDevices[@]}" +do + hpdevice_up=`echo $hpdevice | tr a-z A-Z` + CMAKE_HPDEVICE="-DKokkos_ENABLE_${hpdevice_up}=ON" + + if [ ! 
-z "$AccDevices" ] + then + for accdevice in "${AccDevices[@]}" + do + accdevice_up=`echo $accdevice | tr a-z A-Z` + CMAKE_ACCDEVICE="-DKokkos_ENABLE_${accdevice_up}=ON" + ${SRC_DIR}/test_config_arch_list.bash "$hpdevice,$accdevice" "${CMAKE_HPDEVICE} ${CMAKE_ACCDEVICE}" + ${SRC_DIR}/test_config_arch_list.bash "$hpdevice,$accdevice,Serial" "${CMAKE_HPDEVICE} ${CMAKE_ACCDEVICE} -DKokkos_ENABLE_SERIAL=ON" + done + else + #no, I need to be able to specify this + #export CXX=g++ + ${SRC_DIR}/test_config_arch_list.bash "$hpdevice" "${CMAKE_HPDEVICE}" + ${SRC_DIR}/test_config_arch_list.bash "$hpdevice,Serial" "${CMAKE_HPDEVICE} -DKokkos_ENABLE_SERIAL=ON" + fi +done + diff --git a/packages/kokkos/core/unit_test/configuration/test-code/test_config_options_list.bash b/packages/kokkos/core/unit_test/configuration/test-code/test_config_options_list.bash new file mode 100755 index 0000000000000000000000000000000000000000..59072b1a1d0aba6decb9f21e0e50533b714659d1 --- /dev/null +++ b/packages/kokkos/core/unit_test/configuration/test-code/test_config_options_list.bash @@ -0,0 +1,48 @@ +SRC_DIR=${KOKKOS_PATH}/core/unit_test/configuration/test-code + +# List of parallel device types +Options=(aggressive_vectorization disable_profiling large_mem_tests) +CudaOptions=(lambda relocatable_device_code uvm constexpr) + +if [ ! -z "$KOKKOS_ARCH_TEST" ]; then + Options=(disable_profiling) + CudaOptions=(uvm) +fi + +MakeDevices=$1 +CMakeDevices=$2 +MakeArch=$3 +CMakeArch=$4 + +for option in "${Options[@]}" +do + option_up=`echo $option | tr a-z A-Z` + if [[ $option_up == *"DISABLE"* ]]; then + new_option_up=${option_up/DISABLE_/} + CMAKE_OPTION="-DKokkos_ENABLE_${new_option_up}=OFF" + else + CMAKE_OPTION="-DKokkos_ENABLE_${option_up}=ON" + fi + + #Renaming options as GNU Make expects them + option=${option/large_mem_tests/enable_large_mem_tests} + + if [ ! 
-z $CudaOptions ]; then + for cuda_option in "${CudaOptions[@]}" + do + cuda_option_up=`echo $cuda_option | tr a-z A-Z` + CMAKE_CUDA_OPTION="-DKokkos_ENABLE_CUDA_${cuda_option_up}=ON" + + #Renaming options as GNU Make expects them + cuda_option=${cuda_option/lambda/enable_lambda} + cuda_option=${cuda_option/constexpr/enable_constexpr} + cuda_option=${cuda_option/relocatable_device_code/rdc} + cuda_option=${cuda_option/uvm/force_uvm} + + ${SRC_DIR}/test_config_run.bash "$MakeDevices" "$CMakeDevices" "$MakeArch" "$CMakeArch" "KOKKOS_OPTIONS=$option KOKKOS_CUDA_OPTIONS=$cuda_option" "$CMAKE_OPTION $CMAKE_CUDA_OPTION" + done + else + ${SRC_DIR}/test_config_run.bash "$MakeDevices" "$CMakeDevices" "$MakeArch" "$CMakeArch" "KOKKOS_OPTIONS=$option" "$CMAKE_OPTION" + fi +done + diff --git a/packages/kokkos/core/unit_test/configuration/test-code/test_config_run.bash b/packages/kokkos/core/unit_test/configuration/test-code/test_config_run.bash new file mode 100755 index 0000000000000000000000000000000000000000..4750c843c290ca37b9e533126843c4f14474e7c6 --- /dev/null +++ b/packages/kokkos/core/unit_test/configuration/test-code/test_config_run.bash @@ -0,0 +1,111 @@ + +SRC_DIR=${KOKKOS_PATH}/core/unit_test/configuration/test-code + +# List of parallel device types +MakeDevices=$1 +CMakeDevices=$2 +MakeArch=$3 +CMakeArch=$4 +MakeOptions=$5 +CMakeOptions=$6 + +cd gnu-make +rm -rf * +make -f ${SRC_DIR}/Makefile KOKKOS_DEVICES=$MakeDevices KOKKOS_ARCH=$MakeArch $MakeOptions CXX=$CXX KokkosCore_config.h &>out +make -f ${SRC_DIR}/Makefile KOKKOS_DEVICES=$MakeDevices KOKKOS_ARCH=$MakeArch $MakeOptions CXX=$CXX print-cxx-flags &> cxxflags + +cd ../cmake +rm -rf * +cmake -DKokkos_SKIP_VALIDATION=ON \ + -DCMAKE_CXX_COMPILER=$CXX \ + $CMakeDevices \ + $CMakeArch \ + $CMakeOptions \ + $SRC_DIR &> config_out +cd .. 
+grep define gnu-make/KokkosCore_config.h | sort -u &> make_config_defines +grep define cmake/kokkos/KokkosCore_config.h | sort -u &> cmake_config_defines + +diff make_config_defines cmake_config_defines &> config_defines_diff +diff_exists=`cat config_defines_diff | wc -l` +if [ $diff_exists -gt 0 ] +then + echo "" + echo "" + echo "Failed #define test" + echo Make: "make -f ${SRC_DIR}/Makefile KOKKOS_DEVICES=$MakeDevices KOKKOS_ARCH=$MakeArch $MakeOptions CXX=$CXX KokkosCore_config.h" + echo CMake: "cmake -DCMAKE_CXX_COMPILER=$CXX $CMakeDevices $CMakeArch $CMakeOptions $SRC_DIR" + cat config_defines_diff + echo "Sleeping for 3 seconds if you want to stop and explore..." + echo "" + sleep 3 +else + echo "" + echo "" + echo "Passed #define test" + echo Make: "make -f ${SRC_DIR}/Makefile KOKKOS_DEVICES=$MakeDevices KOKKOS_ARCH=$MakeArch $MakeOptions CXX=$CXX KokkosCore_config.h" + echo CMake: "cmake -DCMAKE_CXX_COMPILER=$CXX $CMakeDevices $CMakeArch $CMakeOptions $SRC_DIR" +fi + +#find because it goes in different locations +#grep out compiler warnings +#head multiple matches +#sed a bunch of stuff to clean up cmake garbage +#awk trim whitespace +#awk print each on new line +#grep remove empty lines +#grep don't consider -std flags in the comparison +#sort and print unique flags +find cmake/kokkos -name KokkosTargets.cmake -exec grep -h INTERFACE_COMPILE_OPTIONS {} \; \ + | grep -v skew \ + | head -n 1 \ + | sed 's/INTERFACE_COMPILE_OPTIONS//g' \ + | sed 's/;/ /g' \ + | sed 's/"//g' \ + | sed 's/\\$<\\$<//g' \ + | sed 's/COMPILE_LANGUAGE:CXX>://g' \ + | sed 's/> / /g' \ + | sed 's/>$//g' \ + | awk '{$1=$1;print}' \ + | awk -v RS=" " '{print}' \ + | grep -v -e '^$' \ + | grep -v '\-std' \ + | sort | uniq > cmake_cxx_flags + +#-I flags and -std= flags are not part of CMake's compile options +#that's fine, let's ignore thse below +#redundant lines - tail the last one +#awk print each on new line +#grep out blank lines +#grep out include flags +#grep out -std flags 
+#sort and print unique flags +tail -n 1 gnu-make/cxxflags \ + | awk -v RS=" " '{print}' \ + | grep -v -e '^$' \ + | grep -v '\-I' \ + | grep -v '\-std=' \ + | grep -v 'gcc-toolchain' \ + | sort | uniq > gnu_make_cxx_flags +diff gnu_make_cxx_flags cmake_cxx_flags &> config_cxxflags_diff +diff_exists=`cat config_cxxflags_diff | wc -l` + +if [ $diff_exists -gt 0 ] +then + echo "" + echo "" + echo "Failed CXXFLAGS test" + echo Make: "make -f ${SRC_DIR}/Makefile KOKKOS_DEVICES=$MakeDevices KOKKOS_ARCH=$MakeArch $MakeOptions CXX=$CXX KokkosCore_config.h" + echo CMake: "cmake -DCMAKE_CXX_COMPILER=$CXX $CMakeDevices $CMakeArch $CMakeOptions $SRC_DIR" + cat config_cxxflags_diff + echo "Sleeping for 3 seconds if you want to stop and explore..." + echo "" + sleep 3 +else + echo "" + echo "" + echo "Passed CXXFLAGS test" + echo Make: "make -f ${SRC_DIR}/Makefile KOKKOS_DEVICES=$MakeDevices KOKKOS_ARCH=$MakeArch $MakeOptions CXX=$CXX KokkosCore_config.h" + echo CMake: "cmake -DCMAKE_CXX_COMPILER=$CXX $CMakeDevices $CMakeArch $CMakeOptions $SRC_DIR" +fi + diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4228b5181a0ccd68dfde87f71f92fd0a471a8e96 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestSharedAlloc.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp new file mode 100644 index 0000000000000000000000000000000000000000..316a2b5d0fe0dba2c9b74f3f6f7a6d61342d2c4c --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewAPI_a.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5eed2ca0d77b828b2431bfce0fe69c4da457bb95 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewAPI_b.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26dc9b0e000096ab1809412c4a29fc563844cbd1 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewAPI_c.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bab29610a3d4ad2e812405ba96ed06c7e2dfb3b8 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewAPI_d.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fd227186d5668239b9d9fe3f6a1ae2b3d5510b32 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewAPI_e.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp new file mode 100644 index 0000000000000000000000000000000000000000..669761df979cfd1458f1d5ea78acfb5738af0d38 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewCopy_a.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d367fd7e051f49495ce747f6f490bad795f94d86 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewCopy_b.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp new file mode 100644 index 0000000000000000000000000000000000000000..01b284b2f562299b4f23cc197693c2baad40f38e --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewMapping_a.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e15228b1d772a5dba97ee434e17fdb18188a709a --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewMapping_b.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp new file mode 100644 index 0000000000000000000000000000000000000000..52bbd42f292f4b865def36856913dfc6bbe0028f --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaHostPinned_Category.hpp> +#include <TestViewMapping_subview.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6602d7396a7c2fdec7e16e83079764962dbeab75 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestSharedAlloc.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4aeac8f13f4d28672c671a51c1eacfedbf0e92fd --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewAPI_a.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e5cb0103424fd022290998307f086aedaea0cb29 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewAPI_b.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a52fcb833ed2a0e959a25e36195460c1ed914a78 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewAPI_c.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e345cd9667526671ef898a0d1247343b47f6296c --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewAPI_d.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp new file mode 100644 index 0000000000000000000000000000000000000000..61547df4f523969f8c93da8315fddb4467e5ade9 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewAPI_e.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp new file mode 100644 index 0000000000000000000000000000000000000000..75a769bb947485e6e7459c1cb95b7b3b1c26f9b1 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewCopy_a.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d09f5c9f397b3723599aec64c3c50a6aa77a769 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewCopy_b.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ea03f43bd69a318095e6277f4db226241fc9a482 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewMapping_a.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1f754e8f4996cbc3c0fbefd7000bff65451b19f0 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewMapping_b.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4af7057d2aa47db99a8325159e0ee737feff7767 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCudaUVM_Category.hpp> +#include <TestViewMapping_subview.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_DebugPinUVMSpace.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_DebugPinUVMSpace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5b6fccdbd0a500cbb0d45574879a797c866d1b55 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_DebugPinUVMSpace.cpp @@ -0,0 +1,131 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestCuda_Category.hpp> + +namespace Test { + +template <class View> +struct CopyFunctor { + View a; + View b; + + CopyFunctor(int N) : a(View("A", N)), b(View("B", N)) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { a(i) = b(i); } + + double time_copy(int R) { + Kokkos::parallel_for("CopyFunctor::time_copy", a.extent(0), *this); + Kokkos::fence(); + + Kokkos::Timer timer; + for (int r = 0; r < R; r++) + Kokkos::parallel_for("CopyFunctor::time_copy", a.extent(0), *this); + Kokkos::fence(); + return timer.seconds(); + } +}; + +TEST(cuda, debug_pin_um_to_host) { + double time_cuda_space; + double time_cuda_host_pinned_space; + double time_cuda_uvm_space_not_pinned_1; + double time_cuda_uvm_space_pinned; + double time_cuda_uvm_space_not_pinned_2; + + int N = 10000000; + int R = 100; + { + CopyFunctor<Kokkos::View<int*, Kokkos::CudaSpace>> f(N); + time_cuda_space = f.time_copy(R); + } + { + CopyFunctor<Kokkos::View<int*, Kokkos::CudaHostPinnedSpace>> f(N); + time_cuda_host_pinned_space = f.time_copy(R); + } + { + CopyFunctor<Kokkos::View<int*, Kokkos::CudaUVMSpace>> f(N); + time_cuda_uvm_space_not_pinned_1 = f.time_copy(R); + } + { +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + kokkos_impl_cuda_set_pin_uvm_to_host(true); +#endif + 
CopyFunctor<Kokkos::View<int*, Kokkos::CudaUVMSpace>> f(N); + time_cuda_uvm_space_pinned = f.time_copy(R); +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + kokkos_impl_cuda_set_pin_uvm_to_host(false); +#endif + } + { + CopyFunctor<Kokkos::View<int*, Kokkos::CudaUVMSpace>> f(N); + time_cuda_uvm_space_not_pinned_2 = f.time_copy(R); + } + bool uvm_approx_cuda_1 = + time_cuda_uvm_space_not_pinned_1 < time_cuda_space * 2.0; + bool uvm_approx_cuda_2 = + time_cuda_uvm_space_not_pinned_2 < time_cuda_space * 2.0; + bool pinned_slower_cuda = time_cuda_host_pinned_space > time_cuda_space * 2.0; + bool uvm_pinned_slower_cuda = + time_cuda_uvm_space_pinned > time_cuda_space * 2.0; + + bool passed = uvm_approx_cuda_1 && uvm_approx_cuda_2 && pinned_slower_cuda && +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + uvm_pinned_slower_cuda; +#else + !uvm_pinned_slower_cuda; +#endif + if (!passed) + printf( + "Time CudaSpace: %lf CudaUVMSpace_1: %lf CudaUVMSpace_2: %lf " + "CudaPinnedHostSpace: %lf CudaUVMSpace_Pinned: %lf\n", + time_cuda_space, time_cuda_uvm_space_not_pinned_1, + time_cuda_uvm_space_not_pinned_2, time_cuda_host_pinned_space, + time_cuda_uvm_space_pinned); + ASSERT_TRUE(passed); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_DebugSerialExecution.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_DebugSerialExecution.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f1d3dfc5245d971b6b90ca3ef11731e34b538f67 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_DebugSerialExecution.cpp @@ -0,0 +1,197 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestCuda_Category.hpp> + +namespace Test { + +using ViewType = Kokkos::View<double*>; + +struct TestForFunctor { + ViewType a; + ViewType b; + + TestForFunctor(int N) : a(ViewType("A", N)), b(ViewType("B", N)) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { a(i) = b(i); } + + double time_par_for() { + Kokkos::Timer timer; + Kokkos::parallel_for("CudaDebugSerialExecution::par_for", a.extent(0), + *this); + Kokkos::fence(); + return timer.seconds(); + } +}; + +struct TestRedFunctor { + ViewType a; + ViewType b; + + TestRedFunctor(int N) : a(ViewType("A", N)), b(ViewType("B", N)) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i, double& val) const { val += a(i) * b(i); } + + double time_par_red() { + Kokkos::Timer timer; + double dot; + Kokkos::parallel_reduce("CudaDebugSerialExecution::par_red", a.extent(0), + *this, dot); + Kokkos::fence(); + return timer.seconds(); + } +}; + +struct TestScanFunctor { + ViewType a; + ViewType b; + + TestScanFunctor(int N) : a(ViewType("A", N)), b(ViewType("B", N)) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i, double& val, bool final) const { + val += b(i); + if (final) a(i) = val; + } + + double time_par_scan() { + Kokkos::Timer timer; + double dot; + Kokkos::parallel_scan("CudaDebugSerialExecution::par_scan", a.extent(0), + *this, dot); + Kokkos::fence(); + return timer.seconds(); + } +}; + +TEST(cuda, debug_serial_execution) { + double time_par_for_1, time_par_for_2, time_par_for_serial; + double time_par_red_1, time_par_red_2, time_par_red_serial; + double time_par_scan_1, time_par_scan_2, time_par_scan_serial; + + int N = 10000000; + { + TestForFunctor f(N); + f.time_par_for(); + time_par_for_1 = f.time_par_for(); +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + kokkos_impl_cuda_set_serial_execution(true); +#endif + time_par_for_serial = 
f.time_par_for(); +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + kokkos_impl_cuda_set_serial_execution(false); +#endif + time_par_for_2 = f.time_par_for(); + + bool passed_par_for = +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + (time_par_for_serial > time_par_for_1 * 20.0) && + (time_par_for_serial > time_par_for_2 * 20.0); +#else + (time_par_for_serial < time_par_for_1 * 2.0) && + (time_par_for_serial < time_par_for_2 * 2.0); +#endif + if (!passed_par_for) + printf("Time For1: %lf For2: %lf ForSerial: %lf\n", time_par_for_1, + time_par_for_2, time_par_for_serial); + ASSERT_TRUE(passed_par_for); + } + { + TestRedFunctor f(N); + f.time_par_red(); + time_par_red_1 = f.time_par_red(); +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + kokkos_impl_cuda_set_serial_execution(true); +#endif + time_par_red_serial = f.time_par_red(); +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + kokkos_impl_cuda_set_serial_execution(false); +#endif + time_par_red_2 = f.time_par_red(); + + bool passed_par_red = +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + (time_par_red_serial > time_par_red_1 * 2.0) && + (time_par_red_serial > time_par_red_2 * 2.0); +#else + (time_par_red_serial < time_par_red_1 * 2.0) && + (time_par_red_serial < time_par_red_2 * 2.0); +#endif + if (!passed_par_red) + printf("Time Red1: %lf Red2: %lf RedSerial: %lf\n", time_par_red_1, + time_par_red_2, time_par_red_serial); + ASSERT_TRUE(passed_par_red); + } + { + TestScanFunctor f(N); + f.time_par_scan(); + time_par_scan_1 = f.time_par_scan(); +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + kokkos_impl_cuda_set_serial_execution(true); +#endif + time_par_scan_serial = f.time_par_scan(); +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + kokkos_impl_cuda_set_serial_execution(false); +#endif + time_par_scan_2 = f.time_par_scan(); + + bool passed_par_scan = +#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION + (time_par_scan_serial > time_par_scan_1 * 2.0) && + (time_par_scan_serial > time_par_scan_2 * 2.0); +#else 
+ (time_par_scan_serial < time_par_scan_1 * 2.0) && + (time_par_scan_serial < time_par_scan_2 * 2.0); +#endif + if (!passed_par_scan) + printf("Time Scan1: %lf Scan2: %lf ScanSerial: %lf\n", time_par_scan_1, + time_par_scan_2, time_par_scan_serial); + ASSERT_TRUE(passed_par_scan); + } +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..77b1e58a1586482b029f89298c7273cfccc95a7d --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Graph.cpp @@ -0,0 +1,47 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCuda_Category.hpp> +#include <TestGraph.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ee7181e1180fdb887a87190605565e42e897409c --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestCuda_Category.hpp> + +#include <array> + +namespace Test { + +__global__ void offset(int* p) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < 100) { + p[idx] += idx; + } +} + +// Test whether allocations survive Kokkos initialize/finalize if done via Raw +// Cuda. 
+TEST(cuda, raw_cuda_interop) { + int* p; + CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100)); + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + + Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, 100); + Kokkos::deep_copy(v, 5); + + Kokkos::finalize(); + + offset<<<100, 64>>>(p); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + std::array<int, 100> h_p; + cudaMemcpy(h_p.data(), p, sizeof(int) * 100, cudaMemcpyDefault); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + int64_t sum = 0; + int64_t sum_expect = 0; + for (int i = 0; i < 100; i++) { + sum += h_p[i]; + sum_expect += 5 + i; + } + + ASSERT_EQ(sum, sum_expect); + CUDA_SAFE_CALL(cudaFree(p)); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp new file mode 100644 index 0000000000000000000000000000000000000000..526b985c00f2eec2eab6cafb8e862eff5024d575 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp @@ -0,0 +1,117 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCuda_Category.hpp> +#include <Test_InterOp_Streams.hpp> + +namespace Test { +// Test Interoperability with Cuda Streams +TEST(cuda, raw_cuda_streams) { + cudaStream_t stream; + cudaStreamCreate(&stream); + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + int* p; + cudaMalloc(&p, sizeof(int) * 100); + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + { + TEST_EXECSPACE space0(stream); + Kokkos::View<int*, TEST_EXECSPACE> v(p, 100); + Kokkos::deep_copy(space0, v, 5); + int sum; + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range", + Kokkos::RangePolicy<TEST_EXECSPACE>(space0, 0, 100), + FunctorRange<MemorySpace>(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce", + Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>( + space0, 0, 100), + 
FunctorRangeReduce<MemorySpace>(v), sum); + space0.fence(); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>( + space0, {0, 0}, {10, 10}), + FunctorMDRange<MemorySpace>(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::MDRangeReduce", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>, + Kokkos::LaunchBounds<128, 2>>(space0, {0, 0}, + {10, 10}), + FunctorMDRangeReduce<MemorySpace>(v), sum); + space0.fence(); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy<TEST_EXECSPACE>(space0, 10, 10), + FunctorTeam<MemorySpace, TEST_EXECSPACE>(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy<TEST_EXECSPACE, Kokkos::LaunchBounds<128, 2>>( + space0, 10, 10), + FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v), sum); + space0.fence(); + ASSERT_EQ(800, sum); + } + Kokkos::finalize(); + offset_streams<<<100, 64, 0, stream>>>(p); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + cudaStreamDestroy(stream); + + int h_p[100]; + cudaMemcpy(h_p, p, sizeof(int) * 100, cudaMemcpyDefault); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + int64_t sum = 0; + int64_t sum_expect = 0; + for (int i = 0; i < 100; i++) { + sum += h_p[i]; + sum_expect += 8 + i; + } + + ASSERT_EQ(sum, sum_expect); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp new file mode 100644 index 0000000000000000000000000000000000000000..646b37908654d2af6327158cb49f7d4257e8f8bf --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -0,0 +1,379 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestCuda_Category.hpp> + +namespace Test { + +__global__ void test_abort() { Kokkos::abort("test_abort"); } + +__global__ void test_cuda_spaces_int_value(int *ptr) { + if (*ptr == 42) { + *ptr = 2 * 42; + } +} + +TEST(cuda, space_access) { + static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::HostSpace>::assignable, + ""); + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::CudaHostPinnedSpace>::assignable, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::CudaSpace>::assignable, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::CudaSpace>::accessible, + ""); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::CudaUVMSpace>::assignable, + ""); + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::CudaUVMSpace>::accessible, + ""); + + //-------------------------------------- + + static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::CudaSpace>::assignable, + ""); + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::CudaUVMSpace>::assignable, + ""); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::CudaHostPinnedSpace>::assignable, + ""); + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::CudaHostPinnedSpace>::accessible, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::HostSpace>::assignable, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaSpace, + Kokkos::HostSpace>::accessible, + ""); + + //-------------------------------------- + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::CudaUVMSpace>::assignable, + ""); + + 
static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::CudaSpace>::assignable, + ""); + + static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::CudaSpace>::accessible, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::HostSpace>::assignable, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::HostSpace>::accessible, + ""); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::CudaHostPinnedSpace>::assignable, + ""); + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaUVMSpace, + Kokkos::CudaHostPinnedSpace>::accessible, + ""); + + //-------------------------------------- + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::CudaHostPinnedSpace>::assignable, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::HostSpace>::assignable, + ""); + + static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::HostSpace>::accessible, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::CudaSpace>::assignable, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::CudaSpace>::accessible, + ""); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::CudaUVMSpace>::assignable, + ""); + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, + Kokkos::CudaUVMSpace>::accessible, + ""); + + //-------------------------------------- + + static_assert( + !Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda, + Kokkos::HostSpace>::accessible, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda, + Kokkos::CudaSpace>::accessible, + ""); + + static_assert( + Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda, + Kokkos::CudaUVMSpace>::accessible, + ""); + + 
static_assert( + Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda, + Kokkos::CudaHostPinnedSpace>::accessible, + ""); + + static_assert( + !Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace, + Kokkos::CudaSpace>::accessible, + ""); + + static_assert( + Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace, + Kokkos::CudaUVMSpace>::accessible, + ""); + + static_assert( + Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace, + Kokkos::CudaHostPinnedSpace>::accessible, + ""); + + static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space, + Kokkos::HostSpace>::value, + ""); + + static_assert( + std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space, + Kokkos::Device<Kokkos::HostSpace::execution_space, + Kokkos::CudaUVMSpace>>::value, + ""); + + static_assert( + std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaHostPinnedSpace>::Space, + Kokkos::CudaHostPinnedSpace>::value, + ""); + + static_assert(std::is_same<Kokkos::Device<Kokkos::HostSpace::execution_space, + Kokkos::CudaUVMSpace>, + Kokkos::Device<Kokkos::HostSpace::execution_space, + Kokkos::CudaUVMSpace>>::value, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility< + Kokkos::Impl::HostMirror<Kokkos::Cuda>::Space, + Kokkos::HostSpace>::accessible, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility< + Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space, + Kokkos::HostSpace>::accessible, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility< + Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space, + Kokkos::HostSpace>::accessible, + ""); + + static_assert( + Kokkos::Impl::SpaceAccessibility< + Kokkos::Impl::HostMirror<Kokkos::CudaHostPinnedSpace>::Space, + Kokkos::HostSpace>::accessible, + ""); +#ifdef KOKKOS_ENABLE_CUDA_UVM + using uvm_view = Kokkos::View<double *, Kokkos::CudaUVMSpace>; + static_assert(std::is_same<uvm_view::HostMirror::execution_space, + Kokkos::DefaultHostExecutionSpace>::value, + "Verify HostMirror execution space is really a host space"); +#endif +} + 
+TEST(cuda, uvm) { + if (Kokkos::CudaUVMSpace::available()) { + int *uvm_ptr = (int *)Kokkos::kokkos_malloc<Kokkos::CudaUVMSpace>( + "uvm_ptr", sizeof(int)); + + *uvm_ptr = 42; + + Kokkos::Cuda().fence(); + test_cuda_spaces_int_value<<<1, 1>>>(uvm_ptr); + Kokkos::Cuda().fence(); + + EXPECT_EQ(*uvm_ptr, int(2 * 42)); + + Kokkos::kokkos_free<Kokkos::CudaUVMSpace>(uvm_ptr); + } +} + +template <class MemSpace, class ExecSpace> +struct TestViewCudaAccessible { + enum { N = 1000 }; + + using V = Kokkos::View<double *, MemSpace>; + + V m_base; + + struct TagInit {}; + struct TagTest {}; + + KOKKOS_INLINE_FUNCTION + void operator()(const TagInit &, const int i) const { m_base[i] = i + 1; } + + KOKKOS_INLINE_FUNCTION + void operator()(const TagTest &, const int i, long &error_count) const { + if (m_base[i] != i + 1) ++error_count; + } + + TestViewCudaAccessible() : m_base("base", N) {} + + static void run() { + TestViewCudaAccessible self; + Kokkos::parallel_for( + Kokkos::RangePolicy<typename MemSpace::execution_space, TagInit>(0, N), + self); + typename MemSpace::execution_space().fence(); + + // Next access is a different execution space, must complete prior kernel. 
+ long error_count = -1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, TagTest>(0, N), self, + error_count); + EXPECT_EQ(error_count, 0); + } +}; + +TEST(cuda, impl_view_accessible) { + TestViewCudaAccessible<Kokkos::CudaSpace, Kokkos::Cuda>::run(); + + TestViewCudaAccessible<Kokkos::CudaUVMSpace, Kokkos::Cuda>::run(); + TestViewCudaAccessible<Kokkos::CudaUVMSpace, + Kokkos::HostSpace::execution_space>::run(); + + TestViewCudaAccessible<Kokkos::CudaHostPinnedSpace, Kokkos::Cuda>::run(); + TestViewCudaAccessible<Kokkos::CudaHostPinnedSpace, + Kokkos::HostSpace::execution_space>::run(); +} + +template <class MemSpace> +struct TestViewCudaTexture { + enum { N = 1000 }; + + using V = Kokkos::View<double *, MemSpace>; + using T = Kokkos::View<const double *, MemSpace, Kokkos::MemoryRandomAccess>; + + V m_base; + T m_tex; + + struct TagInit {}; + struct TagTest {}; + + KOKKOS_INLINE_FUNCTION + void operator()(const TagInit &, const int i) const { m_base[i] = i + 1; } + + KOKKOS_INLINE_FUNCTION + void operator()(const TagTest &, const int i, long &error_count) const { + if (m_tex[i] != i + 1) ++error_count; + } + + TestViewCudaTexture() : m_base("base", N), m_tex(m_base) {} + + static void run() { + EXPECT_TRUE((std::is_same<typename V::reference_type, double &>::value)); + EXPECT_TRUE( + (std::is_same<typename T::reference_type, const double>::value)); + + EXPECT_TRUE(V::reference_type_is_lvalue_reference); // An ordinary view. + EXPECT_FALSE(T::reference_type_is_lvalue_reference); // Texture fetch + // returns by value. 
+ + TestViewCudaTexture self; + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda, TagInit>(0, N), + self); + + long error_count = -1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<Kokkos::Cuda, TagTest>(0, N), + self, error_count); + EXPECT_EQ(error_count, 0); + } +}; + +TEST(cuda, impl_view_texture) { + TestViewCudaTexture<Kokkos::CudaSpace>::run(); + TestViewCudaTexture<Kokkos::CudaUVMSpace>::run(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp new file mode 100644 index 0000000000000000000000000000000000000000..42fa615bc6f65f0661ceaad12c3613781a133a52 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp @@ -0,0 +1,47 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCuda_Category.hpp> +#include <TestTaskScheduler.hpp> diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratchStreams.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eb9077aaf423b2bf9bdfa919d4d45cd18805d069 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratchStreams.cpp @@ -0,0 +1,147 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestCuda_Category.hpp> +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace Impl { + +struct CudaStreamScratchTestFunctor { + using team_t = Kokkos::TeamPolicy<Kokkos::Cuda>::member_type; + using scratch_t = Kokkos::View<int64_t*, Kokkos::Cuda::scratch_memory_space>; + + Kokkos::View<int64_t, Kokkos::CudaSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> + counter; + int N, M; + CudaStreamScratchTestFunctor( + Kokkos::View<int64_t, Kokkos::CudaSpace> counter_, int N_, int M_) + : counter(counter_), N(N_), M(M_) {} + + KOKKOS_FUNCTION + void operator()(const team_t& team) const { + scratch_t scr(team.team_scratch(1), M); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M), + [&](int i) { scr[i] = 0; }); + team.team_barrier(); + for (int i = 0; i < N; i++) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M), + [&](int j) { scr[j] += 1; }); + } + team.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M), [&](int i) { + if (scr[i] != N) counter()++; + }); + } +}; + +void cuda_stream_scratch_test_one( + int N, int T, int M_base, Kokkos::View<int64_t, Kokkos::CudaSpace> counter, + Kokkos::Cuda cuda, int tid) { + int M = M_base + tid * 5; + Kokkos::TeamPolicy<Kokkos::Cuda> p(cuda, T, 64); + using scratch_t = Kokkos::View<int64_t*, Kokkos::Cuda::scratch_memory_space>; + + int bytes = scratch_t::shmem_size(M); + + for (int r = 0; r < 15; r++) { + Kokkos::parallel_for("Run", p.set_scratch_size(1, Kokkos::PerTeam(bytes)), + CudaStreamScratchTestFunctor(counter, N, M)); + } +} + +void cuda_stream_scratch_test( + int N, int T, int M_base, + Kokkos::View<int64_t, Kokkos::CudaSpace> counter) { + int K = 4; + cudaStream_t stream[4]; + Kokkos::Cuda cuda[4]; + for (int i = 0; i < K; i++) { + cudaStreamCreate(&stream[i]); + cuda[i] = Kokkos::Cuda(stream[i]); + } + // Test that growing scratch size in subsequent 
calls doesn't crash things +#if defined(KOKKOS_ENABLE_OPENMP) +#pragma omp parallel + { + int tid = omp_get_thread_num(); + // Limit how many threads submit + if (tid < 4) { + cuda_stream_scratch_test_one(N, T, M_base, counter, cuda[tid], tid); + } + } +#else + for (int tid = 0; tid < K; tid++) { + cuda_stream_scratch_test_one(N, T, M_base, counter, cuda[tid], tid); + } +#endif + // Test that if everything is large enough, multiple launches with different + // scratch sizes don't step on each other + for (int tid = K - 1; tid >= 0; tid--) { + cuda_stream_scratch_test_one(N, T, M_base, counter, cuda[tid], tid); + } + + Kokkos::fence(); + for (int i = 0; i < K; i++) { + cuda[i] = Kokkos::Cuda(); + cudaStreamDestroy(stream[i]); + } +} +} // namespace Impl + +TEST(cuda, team_scratch_1_streams) { + int N = 1000000; + int T = 10; + int M_base = 150; + + Kokkos::View<int64_t, Kokkos::CudaSpace> counter("C"); + + Impl::cuda_stream_scratch_test(N, T, M_base, counter); + + int64_t result; + Kokkos::deep_copy(result, counter); + ASSERT_EQ(0, result); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceDevelop.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceDevelop.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b312f42b24369a725a44bdd1de1a2771e794959f --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceDevelop.cpp @@ -0,0 +1,56 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <TestDefaultDeviceType_Category.hpp> + +namespace Test { + +TEST(defaultdevicetype, development_test) {} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5dcbe566e299c0f013843216b0854dc51582dd6d --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp @@ -0,0 +1,77 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> +#include <TestDefaultDeviceType_Category.hpp> +#include <TestHalfConversion.hpp> +#include <TestHalfOperators.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +namespace Test { + +TEST(TEST_CATEGORY, host_space_access) { + using host_exec_space = Kokkos::HostSpace::execution_space; + using device_space = Kokkos::Device<host_exec_space, Kokkos::HostSpace>; + using mirror_space = + Kokkos::Impl::HostMirror<Kokkos::DefaultExecutionSpace>::Space; + + static_assert(Kokkos::Impl::SpaceAccessibility<host_exec_space, + Kokkos::HostSpace>::accessible, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility<device_space, + Kokkos::HostSpace>::accessible, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility<mirror_space, + Kokkos::HostSpace>::accessible, + ""); +} + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0f53cf8de7908d3e4dc046cc5c9624fe90e3cb76 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 
+#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c4691edd9b70428844462dd27627bc29618556c8 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6d999e6cca50f8765f2216c72625b00942d24cab --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8b60a704af57c32af9cbf479e8b8475ffc69d543 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp new file mode 100644 index 0000000000000000000000000000000000000000..22af90b95225798b8fe4c5456df8fe5d43aa67d8 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp 
b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cbe7aa34c7b48d892528b1e2139b03cd3e979470 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8aefe0b77a277edcf8ef1ca52e7b0e520ebdf638 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0023c903be9762a71077fa56c9ebf7852679051a --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e2c6eed8309cd664e51c1776c3421bb1d40124df --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..282634bc184786aa68537de9817a1bdc5f46b04a --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..da3c5d381c0d97481874661f245fd3202039dea8 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ab55bf93aa91360fdce82248d8c282fc6aa6d32e --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp new file mode 100644 index 0000000000000000000000000000000000000000..02b637d6cdc16953e84dce595675738b9b0e2ab4 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d6dbc52233d0b15618cd5c805cbab60ade6c034 --- /dev/null +++ 
b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp new file mode 100644 index 0000000000000000000000000000000000000000..148153944da29f710c8bd5829a50b1b3723bb44b --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7366577e9e64e078d08cac819c28da3cd26115b --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..72404d4328335b901cd5e65b1ecfc6c375dcf714 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 +#include <TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e327677199a906ed0f48636e384d7a61af0c2e2 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp @@ -0,0 +1,2 @@ +#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 +#include 
<TestDefaultDeviceTypeInit.hpp> diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f53034557dca1c06bcbc6588ff7fdce6ddbb4c4 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include "TestResize.hpp" + +namespace Test { + +TEST(kokkosresize, host_space_access) { + // Test with the default device type. + using TestViewResize::testResize; + using device_type = Kokkos::View<int *>::device_type; + testResize<device_type>(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a1.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a1.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9b57de712183a7ff1fd72533f578c25947901f39 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a1.cpp @@ -0,0 +1,61 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestReduceCombinatorical.hpp> + +namespace Test { + +TEST(defaultdevicetype, reduce_instantiation_a1) { + TestReduceCombinatoricalInstantiation<>::execute_a1(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..314891433693df21689fedb2827dc8d614896383 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a2.cpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestReduceCombinatorical.hpp> + +namespace Test { + +TEST(defaultdevicetype, reduce_instantiation_a2) { + TestReduceCombinatoricalInstantiation<>::execute_a2(); +} + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f9e36e298a227281683ceae0bf5bfe9179a6b8d5 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a3.cpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestReduceCombinatorical.hpp> + +namespace Test { + +TEST(defaultdevicetype, reduce_instantiation_a3) { + TestReduceCombinatoricalInstantiation<>::execute_a3(); +} + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b1.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b1.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1a34bef874f75ecf4a75a99166f7abc66ec1804c --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b1.cpp @@ -0,0 +1,61 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestReduceCombinatorical.hpp> + +namespace Test { + +TEST(defaultdevicetype, reduce_instantiation_b1) { + TestReduceCombinatoricalInstantiation<>::execute_b1(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8bd7628243ab0e3c073d797cc2817ec2a4ba1185 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b2.cpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestReduceCombinatorical.hpp> + +namespace Test { + +TEST(defaultdevicetype, reduce_instantiation_b2) { + TestReduceCombinatoricalInstantiation<>::execute_b2(); +} + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bc1d763437d1f4d23ae688fe94c16fcd7f9367f9 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b3.cpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestReduceCombinatorical.hpp> + +namespace Test { + +TEST(defaultdevicetype, reduce_instantiation_b3) { + TestReduceCombinatoricalInstantiation<>::execute_b3(); +} + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c1.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c1.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ba4cca46fbb9d0883691a40ee53b8a11c739b4c5 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c1.cpp @@ -0,0 +1,61 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestReduceCombinatorical.hpp> + +namespace Test { + +TEST(defaultdevicetype, reduce_instantiation_c1) { + TestReduceCombinatoricalInstantiation<>::execute_c1(); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0459f98dddb20e7cae811502e4ebf5518b011c6b --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c2.cpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestReduceCombinatorical.hpp> + +namespace Test { + +TEST(defaultdevicetype, reduce_instantiation_c2) { + TestReduceCombinatoricalInstantiation<>::execute_c2(); +} + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..801dee83bbe16b6b25398b27068e5d8a3b3d29e2 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c3.cpp @@ -0,0 +1,62 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestReduceCombinatorical.hpp> + +namespace Test { + +TEST(defaultdevicetype, reduce_instantiation_c3) { + TestReduceCombinatoricalInstantiation<>::execute_c3(); +} + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bcd49e69bd3af022ede0ca0a188066288c9b1d35 --- /dev/null +++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp @@ -0,0 +1,68 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) + +#include <TestDefaultDeviceType_Category.hpp> +#include <TestUtilities.hpp> + +namespace Test { + +TEST(defaultdevicetype, malloc) { + int* data = (int*)Kokkos::kokkos_malloc(100 * sizeof(int)); + ASSERT_NO_THROW(data = (int*)Kokkos::kokkos_realloc(data, 120 * sizeof(int))); + Kokkos::kokkos_free(data); + + int* data2 = (int*)Kokkos::kokkos_malloc(0); + ASSERT_TRUE(data2 == nullptr); + Kokkos::kokkos_free(data2); +} + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/diffconfig.sh b/packages/kokkos/core/unit_test/diffconfig.sh new file mode 100755 index 0000000000000000000000000000000000000000..0c8836ff83ca93d5293a986fb68f3a05b2291f51 --- /dev/null +++ b/packages/kokkos/core/unit_test/diffconfig.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# CMake and Make tests run in separate directories +# The mapping of ARCH to #define is very complicated +# so diff is used instead of grepping +if test "`basename $PWD`" = "cmaketest"; then + 
outfile=$1 + resfile=../results/$1 +else + outfile=config/tmpstore/$1 + resfile=config/results/$1 +fi + +diff=`diff $outfile $resfile 2>&1 | grep -e define -e "such file"` +if test -z "$diff"; then + echo Passed +else + echo Failed: $diff +fi diff --git a/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt b/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..20b295650a610a601d73e88b2b116e5dda34c324 --- /dev/null +++ b/packages/kokkos/core/unit_test/headers_self_contained/CMakeLists.txt @@ -0,0 +1,19 @@ +# Create tests that contain each header separately. We do not run these tests +# but we just try to compile them. + +# Globbing all the header filenames to test for self-containment and presence of header guards +SET(BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../") +file(GLOB KOKKOS_CORE_HEADERS RELATIVE ${BASE_DIR}/core/src + ${BASE_DIR}/core/src/*.hpp ${BASE_DIR}/core/src/*.h) +file(GLOB KOKKOS_CONTAINERS_HEADERS RELATIVE ${BASE_DIR}/containers/src + ${BASE_DIR}/containers/src/*.hpp) +file(GLOB KOKKOS_ALGORITHMS_HEADERS RELATIVE ${BASE_DIR}/algorithms/src + ${BASE_DIR}/algorithms/src/*.hpp) + +foreach (_header ${KOKKOS_CORE_HEADERS} ${KOKKOS_CONTAINERS_HEADERS} ${KOKKOS_ALGORITHMS_HEADERS}) + string(REGEX REPLACE "[\./]" "_" header_test_name ${_header}) + set(header_test_name Kokkos_HeaderSelfContained_${header_test_name}) + add_executable(${header_test_name} tstHeader.cpp) + target_link_libraries(${header_test_name} PRIVATE Kokkos::kokkos) + target_compile_definitions(${header_test_name} PRIVATE KOKKOS_HEADER_TEST_NAME=${_header}) +endforeach() diff --git a/packages/kokkos/core/unit_test/headers_self_contained/tstHeader.cpp b/packages/kokkos/core/unit_test/headers_self_contained/tstHeader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d488f0fa36adb2eb3b509a245dcce30b5cc90a76 --- /dev/null +++ 
b/packages/kokkos/core/unit_test/headers_self_contained/tstHeader.cpp @@ -0,0 +1,15 @@ +#define KOKKOS_HEADER_TEST_STRINGIZE_IMPL(x) #x +#define KOKKOS_HEADER_TEST_STRINGIZE(x) KOKKOS_HEADER_TEST_STRINGIZE_IMPL(x) + +#define KOKKOS_HEADER_TO_TEST \ + KOKKOS_HEADER_TEST_STRINGIZE(KOKKOS_HEADER_TEST_NAME) + +// include header twice to see if the include guards are set correctly +#include KOKKOS_HEADER_TO_TEST +#include KOKKOS_HEADER_TO_TEST + +#if !defined(KOKKOS_MACROS_HPP) +#error "This header does not include Kokkos_Macros.hpp" +#endif + +int main() { return 0; } diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp new file mode 100644 index 0000000000000000000000000000000000000000..02157836b3f6075c6c18e2919d93ed4b541dbab8 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewAPI_a.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp new file mode 100644 index 0000000000000000000000000000000000000000..80e2fe3f93716c23979ede23aa81de9b2f694c9e --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewAPI_b.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9694e33ca0ce0f5c2fc6214613f4ae2f03c9750d --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewAPI_c.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0d773494ac6236ce0274cc844fb3369aec81d51d --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewAPI_d.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cbbbc810b0e8e588be2892b83279a4137675de66 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewAPI_e.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp new file mode 100644 index 0000000000000000000000000000000000000000..444a3e6e95d2a62c1ad0e8bedba3767503dd4687 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewCopy_a.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f1f90e7acf13c7aaa4820f5bd50ecc403f2d6f5f --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewCopy_b.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5e83121e341db1da440c65cd5dce84dc1a6f6259 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewMapping_a.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c024143d6c7b735dfa3b897e0a4503ee50e4caec --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewMapping_b.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dcd6c1dc435982fdf44950c3b606847c29c30b37 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIPHostPinned_Category.hpp> +#include <TestViewMapping_subview.hpp> diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a243e0e8e89c0ef5a7cec6195837909d092bc2a --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp @@ -0,0 +1,89 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestHIP_Category.hpp> + +namespace Test { + +struct TestAsyncLauncher { + size_t *m_flag; + size_t m_value; + + KOKKOS_INLINE_FUNCTION + void operator()(const int /*i*/) const { + // and update flag + Kokkos::atomic_add(m_flag, m_value); + } + + TestAsyncLauncher(size_t *flag, int value) : m_flag(flag), m_value(value) {} + + void run() { + Kokkos::parallel_for(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1), *this); + } +}; + +TEST(hip, async_launcher) { + size_t *flag; + HIP_SAFE_CALL(hipMalloc(&flag, sizeof(size_t))); + HIP_SAFE_CALL(hipMemset(flag, 0, sizeof(size_t))); + // launch # of cycles * 1000 kernels w/ distinct values + auto space = Kokkos::Experimental::HIP(); + auto instance = space.impl_internal_space_instance(); + size_t max_cycles = instance->m_maxDriverCycles; + size_t nkernels = max_cycles * 1000; + for (size_t i = 0; i 
< nkernels; ++i) { +    TestAsyncLauncher(flag, i).run(); +  } +  // and check results -- if any of the driver types were overwritten +  // the sum below should fail +  instance->fence(); +  size_t h_flag; +  HIP_SAFE_CALL( +      hipMemcpy(&h_flag, flag, sizeof(size_t), hipMemcpyDeviceToHost)); +  ASSERT_EQ(h_flag, (nkernels * (nkernels - 1)) / 2); +  HIP_SAFE_CALL(hipFree(flag)); +} + +}  // namespace Test diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3a76ca148cf683a83b84d351e4ebd8b2f7cdec94 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestHIP_Category.hpp> + +#include <array> + +namespace Test { + +__global__ void offset(int* p) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < 100) { + p[idx] += idx; + } +} + +// Test whether allocations survive Kokkos initialize/finalize if done via Raw +// HIP. 
+TEST(hip, raw_hip_interop) { + int* p; + HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100)); + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + + Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, 100); + Kokkos::deep_copy(v, 5); + + Kokkos::finalize(); + + offset<<<dim3(100), dim3(100), 0, nullptr>>>(p); + HIP_SAFE_CALL(hipDeviceSynchronize()); + + std::array<int, 100> h_p; + HIP_SAFE_CALL(hipMemcpy(h_p.data(), p, sizeof(int) * 100, hipMemcpyDefault)); + HIP_SAFE_CALL(hipDeviceSynchronize()); + int64_t sum = 0; + int64_t sum_expect = 0; + for (int i = 0; i < 100; i++) { + sum += h_p[i]; + sum_expect += 5 + i; + } + + ASSERT_EQ(sum, sum_expect); + HIP_SAFE_CALL(hipFree(p)); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8e0880ddbd0b15524be75ab97b90044e5315a8ff --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp @@ -0,0 +1,115 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIP_Category.hpp> +#include <Test_InterOp_Streams.hpp> + +namespace Test { +// Test Interoperability with HIP Streams +// The difference with the CUDA tests are: raw HIP vs raw CUDA and no launch +// bound in HIP due to an error when computing the block size. 
+TEST(hip, raw_hip_streams) { + hipStream_t stream; + HIP_SAFE_CALL(hipStreamCreate(&stream)); + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + int* p; + HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100)); + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + { + TEST_EXECSPACE space0(stream); + Kokkos::View<int*, TEST_EXECSPACE> v(p, 100); + Kokkos::deep_copy(space0, v, 5); + int sum; + + Kokkos::parallel_for("Test::hip::raw_hip_stream::Range", + Kokkos::RangePolicy<TEST_EXECSPACE>(space0, 0, 100), + FunctorRange<MemorySpace>(v)); + Kokkos::parallel_reduce("Test::hip::raw_hip_stream::RangeReduce", + Kokkos::RangePolicy<TEST_EXECSPACE>(space0, 0, 100), + FunctorRangeReduce<MemorySpace>(v), sum); + space0.fence(); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::hip::raw_hip_stream::MDRange", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>( + space0, {0, 0}, {10, 10}), + FunctorMDRange<MemorySpace>(v)); + Kokkos::parallel_reduce( + "Test::hip::raw_hip_stream::MDRangeReduce", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space0, {0, 0}, + {10, 10}), + FunctorMDRangeReduce<MemorySpace>(v), sum); + space0.fence(); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::hip::raw_hip_stream::Team", + Kokkos::TeamPolicy<TEST_EXECSPACE>(space0, 10, 10), + FunctorTeam<MemorySpace, TEST_EXECSPACE>(v)); + Kokkos::parallel_reduce("Test::hip::raw_hip_stream::Team", + Kokkos::TeamPolicy<TEST_EXECSPACE>(space0, 10, 10), + FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v), + sum); + space0.fence(); + ASSERT_EQ(800, sum); + } + Kokkos::finalize(); + offset_streams<<<100, 64, 0, stream>>>(p); + HIP_SAFE_CALL(hipDeviceSynchronize()); + HIP_SAFE_CALL(hipStreamDestroy(stream)); + + int h_p[100]; + HIP_SAFE_CALL(hipMemcpy(h_p, p, sizeof(int) * 100, hipMemcpyDefault)); + HIP_SAFE_CALL(hipDeviceSynchronize()); + int64_t sum = 0; + int64_t sum_expect = 0; + for (int i = 0; i < 100; i++) { + sum += h_p[i]; + sum_expect 
+= 8 + i; + } + + ASSERT_EQ(sum, sum_expect); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b759d6f407a791fb3b88b86f502cc956780294f3 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIP_ScanUnit.cpp @@ -0,0 +1,97 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestHIP_Category.hpp> + +struct DummyFunctor { + using value_type = int; + void operator()(const int, value_type &, bool) const {} +}; + +template <int N> +__global__ void start_intra_block_scan() + __attribute__((amdgpu_flat_work_group_size(1, 1024))) { + __shared__ DummyFunctor::value_type values[N]; + const int i = threadIdx.y; + values[i] = i + 1; + __syncthreads(); + + DummyFunctor f; + Kokkos::Impl::hip_intra_block_reduce_scan<true, DummyFunctor, void>(f, + values); + + __syncthreads(); + if (values[i] != ((i + 2) * (i + 1)) / 2) { + printf("Value for %d should be %d but is %d\n", i, ((i + 2) * (i + 1)) / 2, + values[i]); + Kokkos::abort("Test for intra_block_reduce_scan failed!"); + } +} + +template <int N> +void test_intra_block_scan() { + dim3 grid(1, 1, 1); + dim3 block(1, N, 1); + start_intra_block_scan<N><<<grid, block, 0, nullptr>>>(); +} + +TEST(TEST_CATEGORY, scan_unit) { + if (std::is_same< + TEST_EXECSPACE, + typename Kokkos::Experimental::HIPSpace::execution_space>::value) { + test_intra_block_scan<1>(); + test_intra_block_scan<2>(); + test_intra_block_scan<4>(); + test_intra_block_scan<8>(); + test_intra_block_scan<16>(); + test_intra_block_scan<32>(); + test_intra_block_scan<64>(); + test_intra_block_scan<128>(); + 
test_intra_block_scan<256>(); + test_intra_block_scan<512>(); + test_intra_block_scan<1024>(); + } +} diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ae1de8ea2d304e41d672ff2e136d16c86cbb8068 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp @@ -0,0 +1,233 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestHIP_Category.hpp> + +namespace Test { + +__global__ void test_abort() { Kokkos::abort("test_abort"); } + +__global__ void test_hip_spaces_int_value(int *ptr) { + if (*ptr == 42) { + *ptr = 2 * 42; + } +} + +TEST(hip, space_access) { + static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace, + Kokkos::HostSpace>::assignable, + ""); + + static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::HostSpace, + Kokkos::Experimental::HIPHostPinnedSpace>::assignable, + ""); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess< + Kokkos::HostSpace, Kokkos::Experimental::HIPSpace>::assignable, + ""); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess< + Kokkos::HostSpace, Kokkos::Experimental::HIPSpace>::accessible, + ""); + + //-------------------------------------- + + static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIPSpace>::assignable, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIPHostPinnedSpace>::assignable, + ""); + + static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIPHostPinnedSpace>::accessible, + ""); + + 
static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::HIPSpace, + Kokkos::HostSpace>::assignable, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::HIPSpace, + Kokkos::HostSpace>::accessible, + ""); + + //-------------------------------------- + + static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPHostPinnedSpace>::assignable, + ""); + + static_assert( + !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::HostSpace>::assignable, + ""); + + static_assert( + Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::HostSpace>::accessible, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPSpace>::assignable, + ""); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIPSpace>::accessible, + ""); + + //-------------------------------------- + + static_assert( + !Kokkos::Impl::SpaceAccessibility<Kokkos::Experimental::HIP, + Kokkos::HostSpace>::accessible, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility< + Kokkos::Experimental::HIP, + Kokkos::Experimental::HIPSpace>::accessible, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility< + Kokkos::Experimental::HIP, + Kokkos::Experimental::HIPHostPinnedSpace>::accessible, + ""); + + static_assert( + !Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, Kokkos::Experimental::HIPSpace>::accessible, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility< + Kokkos::HostSpace, + Kokkos::Experimental::HIPHostPinnedSpace>::accessible, + ""); + + static_assert( + std::is_same< + Kokkos::Impl::HostMirror<Kokkos::Experimental::HIPSpace>::Space, + Kokkos::HostSpace>::value, + ""); + + static_assert( + std::is_same<Kokkos::Impl::HostMirror< + Kokkos::Experimental::HIPHostPinnedSpace>::Space, + 
Kokkos::Experimental::HIPHostPinnedSpace>::value, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility< + Kokkos::Impl::HostMirror<Kokkos::Experimental::HIP>::Space, + Kokkos::HostSpace>::accessible, + ""); + + static_assert( + Kokkos::Impl::SpaceAccessibility< + Kokkos::Impl::HostMirror<Kokkos::Experimental::HIPSpace>::Space, + Kokkos::HostSpace>::accessible, + ""); + + static_assert(Kokkos::Impl::SpaceAccessibility< + Kokkos::Impl::HostMirror< + Kokkos::Experimental::HIPHostPinnedSpace>::Space, + Kokkos::HostSpace>::accessible, + ""); +} + +template <class MemSpace, class ExecSpace> +struct TestViewHIPAccessible { + enum { N = 1000 }; + + using V = Kokkos::View<double *, MemSpace>; + + V m_base; + + struct TagInit {}; + struct TagTest {}; + + KOKKOS_INLINE_FUNCTION + void operator()(const TagInit &, const int i) const { m_base[i] = i + 1; } + + KOKKOS_INLINE_FUNCTION + void operator()(const TagTest &, const int i, long &error_count) const { + if (m_base[i] != i + 1) ++error_count; + } + + TestViewHIPAccessible() : m_base("base", N) {} + + static void run() { + TestViewHIPAccessible self; + Kokkos::parallel_for( + Kokkos::RangePolicy<typename MemSpace::execution_space, TagInit>(0, N), + self); + typename MemSpace::execution_space().fence(); + + // Next access is a different execution space, must complete prior kernel. 
+ long error_count = -1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, TagTest>(0, N), self, + error_count); + EXPECT_EQ(error_count, 0); + } +}; + +TEST(hip, impl_view_accessible) { + TestViewHIPAccessible<Kokkos::Experimental::HIPSpace, + Kokkos::Experimental::HIP>::run(); + + TestViewHIPAccessible<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::Experimental::HIP>::run(); + TestViewHIPAccessible<Kokkos::Experimental::HIPHostPinnedSpace, + Kokkos::HostSpace::execution_space>::run(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp new file mode 100644 index 0000000000000000000000000000000000000000..db360a99d3d60977cf06479e7662e21350dd5f99 --- /dev/null +++ b/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp @@ -0,0 +1,152 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHIP_Category.hpp> +#include <Kokkos_Core.hpp> + +namespace Test { + +namespace Impl { + +struct HIPStreamScratchTestFunctor { + using team_t = Kokkos::TeamPolicy<Kokkos::Experimental::HIP>::member_type; + using scratch_t = + Kokkos::View<int64_t*, Kokkos::Experimental::HIP::scratch_memory_space>; + + Kokkos::View<int64_t, Kokkos::Experimental::HIPSpace, + Kokkos::MemoryTraits<Kokkos::Atomic>> + counter; + int N, M; + HIPStreamScratchTestFunctor( + Kokkos::View<int64_t, Kokkos::Experimental::HIPSpace> counter_, int N_, + int M_) + : counter(counter_), N(N_), M(M_) {} + + KOKKOS_FUNCTION + void operator()(const team_t& team) const { + scratch_t scr(team.team_scratch(1), M); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M), + [&](int i) { scr[i] = 0; }); + team.team_barrier(); + for (int i = 0; i < N; i++) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M), + [&](int j) { scr[j] += 1; }); + } + team.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M), [&](int i) { + if (scr[i] != N) counter()++; + }); + 
} +}; + +void hip_stream_scratch_test_one( + int N, int T, int M_base, + Kokkos::View<int64_t, Kokkos::Experimental::HIPSpace> counter, + Kokkos::Experimental::HIP hip, int tid) { + int M = M_base + tid * 5; + Kokkos::TeamPolicy<Kokkos::Experimental::HIP> p(hip, T, 64); + using scratch_t = + Kokkos::View<int64_t*, Kokkos::Experimental::HIP::scratch_memory_space>; + + int bytes = scratch_t::shmem_size(M); + + for (int r = 0; r < 15; r++) { + Kokkos::parallel_for("Run", p.set_scratch_size(1, Kokkos::PerTeam(bytes)), + HIPStreamScratchTestFunctor(counter, N, M)); + } +} + +void hip_stream_scratch_test( + int N, int T, int M_base, + Kokkos::View<int64_t, Kokkos::Experimental::HIPSpace> counter) { + int K = 4; + hipStream_t stream[4]; + Kokkos::Experimental::HIP hip[4]; + for (int i = 0; i < K; i++) { + HIP_SAFE_CALL(hipStreamCreate(&stream[i])); + hip[i] = Kokkos::Experimental::HIP(stream[i]); + } +// Test that growing scratch size in subsequent calls doesn't crash things +#if defined(KOKKOS_ENABLE_OPENMP) +#pragma omp parallel + { + int tid = omp_get_thread_num(); + // Limit how many threads submit + if (tid < 4) { + hip_stream_scratch_test_one(N, T, M_base, counter, hip[tid], tid); + } + } +#else + for (int tid = 0; tid < K; tid++) { + hip_stream_scratch_test_one(N, T, M_base, counter, hip[tid], tid); + } +#endif + // Test that if everything is large enough, multiple launches with different + // scratch sizes don't step on each other + for (int tid = K - 1; tid >= 0; tid--) { + hip_stream_scratch_test_one(N, T, M_base, counter, hip[tid], tid); + } + + Kokkos::fence(); + for (int i = 0; i < K; i++) { + hip[i] = Kokkos::Experimental::HIP(); + HIP_SAFE_CALL(hipStreamDestroy(stream[i])); + } +} +} // namespace Impl + +TEST(hip, team_scratch_1_streams) { + int N = 1000000; + int T = 10; + int M_base = 150; + + Kokkos::View<int64_t, Kokkos::Experimental::HIPSpace> counter("C"); + + Impl::hip_stream_scratch_test(N, T, M_base, counter); + + int64_t result; + 
Kokkos::deep_copy(result, counter); + ASSERT_EQ(0, result); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8e89d6d6a5da981b33eea9349ae3ace63ec3f684 --- /dev/null +++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstances.cpp @@ -0,0 +1,188 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestHPX_Category.hpp> + +#include <hpx/config.hpp> +#include <hpx/include/lcos.hpp> + +#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH +#ifndef HPX_COMPUTE_DEVICE_CODE + +namespace Test { + +namespace { +struct FunctorInitConstant { + Kokkos::View<int *, Kokkos::Experimental::HPX> a; + int c; + FunctorInitConstant(Kokkos::View<int *, Kokkos::Experimental::HPX> a_, int c_) + : a(a_), c(c_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { a(i) = c; } +}; + +struct FunctorAdd { + Kokkos::View<int *, Kokkos::Experimental::HPX> a; + Kokkos::View<int *, Kokkos::Experimental::HPX> b; + int c; + FunctorAdd(Kokkos::View<int *, Kokkos::Experimental::HPX> a_, + Kokkos::View<int *, Kokkos::Experimental::HPX> b_, int c_) + : a(a_), b(b_), c(c_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { b(i) += a(i) + c; } +}; + +struct FunctorAddIndex { + Kokkos::View<int *, Kokkos::Experimental::HPX> a; + Kokkos::View<int *, Kokkos::Experimental::HPX> b; + FunctorAddIndex(Kokkos::View<int *, Kokkos::Experimental::HPX> a_, + Kokkos::View<int *, Kokkos::Experimental::HPX> b_) + : a(a_), b(b_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { b(i) += a(i) + i; } +}; + +struct FunctorPointwiseSum { + Kokkos::View<int 
*, Kokkos::Experimental::HPX> a; + Kokkos::View<int *, Kokkos::Experimental::HPX> b; + Kokkos::View<int *, Kokkos::Experimental::HPX> c; + FunctorPointwiseSum(Kokkos::View<int *, Kokkos::Experimental::HPX> a_, + Kokkos::View<int *, Kokkos::Experimental::HPX> b_, + Kokkos::View<int *, Kokkos::Experimental::HPX> c_) + : a(a_), b(b_), c(c_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { c(i) = a(i) + b(i); } +}; + +struct FunctorReduce { + Kokkos::View<int *, Kokkos::Experimental::HPX> a; + FunctorReduce(Kokkos::View<int *, Kokkos::Experimental::HPX> a_) : a(a_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, int &lsum) const { lsum += a(i); } +}; +} // namespace + +TEST(hpx, independent_instances) { + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + + const int n = 100; + const int c = 1; + const int d = 3; + + { + Kokkos::View<int *, Kokkos::Experimental::HPX> v1("v1", n); + Kokkos::View<int *, Kokkos::Experimental::HPX> v2("v2", n); + Kokkos::View<int *, Kokkos::Experimental::HPX> v3("v3", n); + Kokkos::View<int *, Kokkos::Experimental::HPX> v4("v4", n); + Kokkos::View<int, Kokkos::Experimental::HPX> sum_v("sum_v"); + + Kokkos::Experimental::HPX hpx1( + Kokkos::Experimental::HPX::instance_mode::independent); + Kokkos::parallel_for( + "Test::hpx::independent_instances::init", + Kokkos::Experimental::require( + Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx1, 0, n), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorInitConstant(v1, c)); + + Kokkos::Experimental::HPX hpx2(hpx1.impl_get_future()); + Kokkos::parallel_for( + "Test::hpx::independent_instances::add", + Kokkos::Experimental::require( + Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx2, 0, n), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorAdd(v1, v2, d)); + + Kokkos::Experimental::HPX hpx3(hpx1.impl_get_future()); + Kokkos::parallel_for( + "Test::hpx::independent_instances::add_index", + 
Kokkos::Experimental::require( + Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx3, 0, n), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorAddIndex(v1, v3)); + + // NOTE: This monstrosity is used to collapse a future<tuple<future<void>, + // future<void>>> (return type of when_all) into a future<void> which is + // ready whenever the un-collapsed future would've been ready. HPX does not + // currently have the functionality to collapse this automatically. + Kokkos::Experimental::HPX hpx4(hpx::util::get<0>(hpx::split_future( + hpx::when_all(hpx2.impl_get_future(), hpx3.impl_get_future())))); + Kokkos::parallel_for( + "Test::hpx::independent_instances::pointwise_sum", + Kokkos::Experimental::require( + Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx4, 0, n), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorPointwiseSum(v2, v3, v4)); + + Kokkos::parallel_reduce( + "Test::hpx::independent_instances::reduce", + Kokkos::Experimental::require( + Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx4, 0, n), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + FunctorReduce(v4), Kokkos::Sum<int>(sum_v)); + + hpx4.fence(); + + ASSERT_EQ(true, hpx1.impl_get_future().is_ready()); + ASSERT_EQ(true, hpx2.impl_get_future().is_ready()); + ASSERT_EQ(true, hpx3.impl_get_future().is_ready()); + ASSERT_EQ(true, hpx4.impl_get_future().is_ready()); + + const int expected_sum = n * (2 * c + d) + (n * (n - 1) / 2); + ASSERT_EQ(expected_sum, sum_v()); + } + + Kokkos::finalize(); +} +} // namespace Test + +#endif +#endif diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0cedc068e594e70d750c9b515c4e08cbe527a1f4 --- /dev/null +++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesDelayedExecution.cpp @@ -0,0 +1,84 @@ +/* 
+//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestHPX_Category.hpp> + +#include <hpx/include/lcos.hpp> + +#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH + +namespace Test { + +TEST(hpx, delayed_execution) { + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + + { + Kokkos::View<bool, Kokkos::Experimental::HPX> ran("ran"); + hpx::lcos::local::promise<void> p; + hpx::shared_future<void> f = p.get_future(); + + Kokkos::Experimental::HPX hpx(f); + Kokkos::parallel_for( + "Test::hpx::independent_instances::delay_execution", + Kokkos::Experimental::require( + Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + KOKKOS_LAMBDA(int) { ran() = true; }); + + ASSERT_EQ(false, ran()); + ASSERT_EQ(false, hpx.impl_get_future().is_ready()); + + p.set_value(); + + hpx.fence(); + ASSERT_EQ(true, hpx.impl_get_future().is_ready()); + } + + Kokkos::finalize(); +} +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de4cb01a7835d8b4e3d29920ed572edeeb9ef3fb --- /dev/null +++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesInstanceIds.cpp @@ -0,0 +1,99 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestHPX_Category.hpp> + +#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH + +namespace Test { + +TEST(hpx, instance_ids) { + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + + { + Kokkos::Experimental::HPX hpx_global1; + Kokkos::Experimental::HPX hpx_global2 = hpx_global1; + Kokkos::Experimental::HPX hpx_global3{hpx_global1}; + Kokkos::Experimental::HPX hpx_global4( + Kokkos::Experimental::HPX::instance_mode::global); + + ASSERT_EQ(0, hpx_global1.impl_instance_id()); + ASSERT_EQ(0, hpx_global2.impl_instance_id()); + ASSERT_EQ(0, hpx_global3.impl_instance_id()); + ASSERT_EQ(0, hpx_global4.impl_instance_id()); + + Kokkos::Experimental::HPX hpx_independent1( + Kokkos::Experimental::HPX::instance_mode::independent); + Kokkos::Experimental::HPX hpx_independent2 = hpx_independent1; + Kokkos::Experimental::HPX hpx_independent3{hpx_independent1}; + + ASSERT_NE(hpx_global1.impl_instance_id(), + hpx_independent1.impl_instance_id()); + ASSERT_EQ(hpx_independent1.impl_instance_id(), + hpx_independent2.impl_instance_id()); + ASSERT_EQ(hpx_independent1.impl_instance_id(), + hpx_independent3.impl_instance_id()); + + hpx::shared_future<void> f = hpx::make_ready_future<void>(); + Kokkos::Experimental::HPX hpx_independent_future1(f); + Kokkos::Experimental::HPX hpx_independent_future2 = hpx_independent_future1; + Kokkos::Experimental::HPX hpx_independent_future3{hpx_independent_future1}; + + ASSERT_NE(hpx_global1.impl_instance_id(), + hpx_independent1.impl_instance_id()); + ASSERT_NE(hpx_independent1.impl_instance_id(), + hpx_independent_future1.impl_instance_id()); + ASSERT_EQ(hpx_independent_future1.impl_instance_id(), + hpx_independent_future2.impl_instance_id()); + ASSERT_EQ(hpx_independent_future1.impl_instance_id(), + hpx_independent_future3.impl_instance_id()); + } + + 
Kokkos::finalize(); +} +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a98c8b0d62339fa5c2e68124984d5b790b14f692 --- /dev/null +++ b/packages/kokkos/core/unit_test/hpx/TestHPX_IndependentInstancesRefCounting.cpp @@ -0,0 +1,95 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestHPX_Category.hpp> + +#ifdef KOKKOS_ENABLE_HPX_ASYNC_DISPATCH + +namespace Test { +namespace { +std::atomic<int> dummy_count; + +struct dummy { + dummy() { ++dummy_count; } + dummy(dummy const &) { ++dummy_count; } + ~dummy() { --dummy_count; } + void f() const {} +}; +} // namespace +// This test makes sure the independent HPX instances don't hold on to captured +// data after destruction. +TEST(hpx, reference_counting) { + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + + { + dummy d; + Kokkos::Experimental::HPX hpx( + Kokkos::Experimental::HPX::instance_mode::independent); + Kokkos::parallel_for( + "Test::hpx::reference_counting::dummy", + Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1), + KOKKOS_LAMBDA(int) { + // Make sure dummy struct is captured. + d.f(); + }); + + // This attaches a continuation and releases the d captured above from the + // shared state of the internal future. 
+ Kokkos::parallel_for( + "Test::hpx::reference_counting::dummy_clear", + Kokkos::RangePolicy<Kokkos::Experimental::HPX>(hpx, 0, 1), + KOKKOS_LAMBDA(int){}); + + hpx.fence(); + + ASSERT_EQ(1, dummy_count); + } + + Kokkos::finalize(); +} +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31c35ac9a7f0a3425948157cb7f2d3a4239691ad --- /dev/null +++ b/packages/kokkos/core/unit_test/hpx/TestHPX_InterOp.cpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestHPX_Category.hpp> + +namespace Test { + +// Test that Kokkos initialize/finalize work correctly when interoperating +// with the HPX runtime. +TEST(hpx, raw_hpx_interop) { + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + Kokkos::finalize(); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/hpx/TestHPX_Task.cpp b/packages/kokkos/core/unit_test/hpx/TestHPX_Task.cpp new file mode 100644 index 0000000000000000000000000000000000000000..57d0ac803bcd86b5499dd6c29348d88138088c15 --- /dev/null +++ b/packages/kokkos/core/unit_test/hpx/TestHPX_Task.cpp @@ -0,0 +1,47 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestHPX_Category.hpp> +#include <TestTaskScheduler.hpp> diff --git a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp new file mode 100644 index 0000000000000000000000000000000000000000..419486d7a84673dacd48e7bf2513e054106bab4c --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp @@ -0,0 +1,111 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// @Kokkos_Feature_Level_Required:1 + +#include <Kokkos_Core.hpp> +#include <cstdio> +#include <sstream> +#include <type_traits> +#include <gtest/gtest.h> + +namespace Test { + +// Unit test for Execution Space +// Test1 - testing for memory_space, execution_space, scratch space and +// array_layout of an execution space +// Test2 - Test if the is_execution_space evaluation is working correctly + +template <class ExecSpace> +struct TestIncrExecSpaceTypedef { + void testit() { + const bool passed = + (!std::is_same<void, typename ExecSpace::memory_space>::value) && + std::is_same<ExecSpace, typename ExecSpace::execution_space>::value && + !std::is_same<void, typename ExecSpace::scratch_memory_space>::value && + !std::is_same<void, typename ExecSpace::array_layout>::value; + static_assert(passed == true, + "The memory and execution spaces are defined"); + } +}; + +template <class ExecSpace> +struct TestIncrExecSpace { + void testit() { + using device_type = typename ExecSpace::device_type; + using memory_space = typename device_type::memory_space; + using execution_space = typename device_type::execution_space; + + const bool passed = + std::is_same<device_type, + Kokkos::Device<execution_space, memory_space>>::value; + + static_assert(passed == true, + "Checking if the is_execution_space is evaluated correctly"); + + ExecSpace().print_configuration(std::cout); + ExecSpace().fence(); + + auto concurrency = ExecSpace().concurrency(); + ASSERT_TRUE(concurrency > 0); + + int in_parallel = ExecSpace::in_parallel(); + ASSERT_FALSE(in_parallel); + + const char* name = ExecSpace::name(); + std::cout << name << std::endl; + } +}; + +TEST(TEST_CATEGORY, IncrTest_01_execspace_typedef) { + TestIncrExecSpaceTypedef<TEST_EXECSPACE> test; + test.testit(); +} + +TEST(TEST_CATEGORY, IncrTest_01_execspace) { + 
ASSERT_TRUE(Kokkos::is_execution_space<TEST_EXECSPACE>::value); + ASSERT_FALSE(Kokkos::is_execution_space< + TestIncrExecSpaceTypedef<TEST_EXECSPACE>>::value); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp b/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ff4fb6a89f4d380d0693e8697e27fbf5bde2f4d0 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp @@ -0,0 +1,96 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// @Kokkos_Feature_Level_Required:2 +// Unit test for atomic exchange, atomic add and atomic sub. +// Atomic exchange test : we interchange value1 with value2 and check for +// correctness. Atomic add test : we add value2 to value1 and check for +// correctness. Atomic sub test : we subtract value2 from value1 and check for +// correctness. 
+ +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +namespace Test { + +struct TestIncrAtomic { + using value_type = double; + value_type value1 = 1.5, value2 = 0.5; + + void testExchange() { + value_type ret_value = Kokkos::atomic_exchange(&value1, value2); + + ASSERT_EQ(value1, 0.5); + ASSERT_EQ(ret_value, 1.5); + } + + void testAdd() { + Kokkos::atomic_add(&value1, value2); + + ASSERT_EQ(value1, 2.0); + } + + void testSub() { + Kokkos::atomic_sub(&value1, value2); + + ASSERT_EQ(value1, 1.0); + } +}; + +TEST(TEST_CATEGORY, IncrTest_01_AtomicExchange) { + TestIncrAtomic test; + test.testExchange(); +} + +TEST(TEST_CATEGORY, IncrTest_02_AtomicAdd) { + TestIncrAtomic test; + test.testAdd(); +} + +TEST(TEST_CATEGORY, IncrTest_02_AtomicSub) { + TestIncrAtomic test; + test.testSub(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test03a_MemorySpace_malloc.hpp b/packages/kokkos/core/unit_test/incremental/Test03a_MemorySpace_malloc.hpp new file mode 100644 index 0000000000000000000000000000000000000000..da808be21918892b423854db0e162706d9b0672b --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test03a_MemorySpace_malloc.hpp @@ -0,0 +1,80 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +/// @Kokkos_Feature_Level_Required:3 +// Unit Test for Kokkos malloc. +// Allocate memory to a pointer and check if the allocation has not returned a +// null pointer. 
+ +namespace Test { + +using value_type = double; +const int num_elements = 10; + +template <class ExecSpace> +struct TestIncrMemorySpace_malloc { + using memory_space = typename ExecSpace::memory_space; + + void test_malloc() { + // Allocate memory + auto *data = static_cast<value_type *>(Kokkos::kokkos_malloc<memory_space>( + "data", num_elements * sizeof(value_type))); + + // Check if the allocated memory has not returned a NULL + ASSERT_NE(data, nullptr); + + // Free the allocated memory + Kokkos::kokkos_free<memory_space>(data); + } +}; + +TEST(TEST_CATEGORY, IncrTest_03a_memspace_malloc) { + TestIncrMemorySpace_malloc<TEST_EXECSPACE> test; + test.test_malloc(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test03b_MemorySpace_free.hpp b/packages/kokkos/core/unit_test/incremental/Test03b_MemorySpace_free.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f7ee76ec9de45e23b6d7fbd2849ef51ac21443da --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test03b_MemorySpace_free.hpp @@ -0,0 +1,83 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// @Kokkos_Feature_Level_Required:3 +// Unit test for Kokkos free. +// We constantly allocate and free the memory. +// If the kokkos_free does not free the allocated memory, +// we will exceed the available space. + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +namespace Test { + +using value_type = double; + +// Allocate M number of value_type elements N number of times. 
+const int N = 100000; +const int M = 100000; + +template <class ExecSpace> +struct TestIncrMemorySpace_free { + using memory_space = typename ExecSpace::memory_space; + + void test_free() { + for (int i = 0; i < N; ++i) { + auto *data = static_cast<value_type *>( + Kokkos::kokkos_malloc<memory_space>("data", M * sizeof(value_type))); + + ASSERT_NE(data, nullptr); + + Kokkos::kokkos_free<memory_space>(data); + } + } +}; + +TEST(TEST_CATEGORY, IncrTest_03b_memspace_free) { + TestIncrMemorySpace_free<TEST_EXECSPACE> test; + test.test_free(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test04_ParallelFor_RangePolicy.hpp b/packages/kokkos/core/unit_test/incremental/Test04_ParallelFor_RangePolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..85eef21df3db0ba4eeaf59db4f8db187574592e7 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test04_ParallelFor_RangePolicy.hpp @@ -0,0 +1,172 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +/// @Kokkos_Feature_Level_Required:4 +// parallel-for unit test. +// In this test, different elements of an array are updated by different +// threads. 
+ +namespace Test { + +using value_type = double; +int num_elements = 10; + +struct ParallelForFunctor { + value_type *_data; + const value_type _value; + + ParallelForFunctor(value_type *data, const value_type value) + : _data(data), _value(value) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { _data[i] = (i + 1) * _value; } +}; + +template <class ExecSpace> +struct TestParallel_For { + // Memory space type for Device and Host data + using d_memspace_type = typename ExecSpace::memory_space; + using h_memspace_type = Kokkos::HostSpace; + + value_type *deviceData, *hostData; + const value_type value = 0.5; + + // Check if the array values are updated correctly. + void correctness_check(value_type *data) { + for (int i = 0; i < num_elements; ++i) { + ASSERT_EQ((i + 1) * value, data[i]) + << "Values in index " << i << " are incorrect"; + } + } + + // Routine to allocate memory in a specific memory space. + template <class MemSpace> + value_type *allocate_mem(int N) { + return (static_cast<value_type *>( + Kokkos::kokkos_malloc<MemSpace>("deviceData", N * sizeof(value_type)))); + } + + // Routine to free the memory from a specific memory space. + template <class MemSpace> + void free_mem(value_type *data) { + Kokkos::kokkos_free<MemSpace>(data); + } + + void init() { + // Allocate memory on Device space. + deviceData = allocate_mem<d_memspace_type>(num_elements); + ASSERT_NE(deviceData, nullptr); + + // Allocate memory on Host space. 
+ hostData = allocate_mem<h_memspace_type>(num_elements); + ASSERT_NE(hostData, nullptr); + } + + void check_correctness_and_cleanup() { + // Copy the data back to Host memory space + Kokkos::Impl::DeepCopy<h_memspace_type, d_memspace_type>( + hostData, deviceData, num_elements * sizeof(value_type)); + + // Check if all data has been update correctly + correctness_check(hostData); + + // free the allocated memory + free_mem<d_memspace_type>(deviceData); + free_mem<h_memspace_type>(hostData); + } + + // A simple parallel for test with functors + void simple_test() { + // Allocates memory for num_elements number of value_type elements in the + // host and device memory spaces. + init(); + + // parallel-for functor called for num_elements number of iterations. + Kokkos::parallel_for("parallel_for", + Kokkos::RangePolicy<ExecSpace>(0, num_elements), + ParallelForFunctor(deviceData, value)); + + Kokkos::fence(); + // Checks if parallel_for gave the correct results. + // Frees the allocated memory in init(). + check_correctness_and_cleanup(); + } + + // A parallel_for test with user defined RangePolicy + void range_policy() { + // Allocates memory for num_elements number of value_type elements in the + // host and device memory spaces. + init(); + + // Creates a range policy that uses dynamic scheduling. + using range_policy_t = + Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >; + + // parallel-for functor with range-policy from 0 to num_elements iterations. + Kokkos::parallel_for("RangePolicy_ParallelFor", + range_policy_t(0, num_elements), + ParallelForFunctor(deviceData, value)); + + // Checks if parallel_for gave the correct results. + // Free the allocated memory in init(). 
+ check_correctness_and_cleanup(); + } +}; + +TEST(TEST_CATEGORY, IncrTest_04_simple_parallelFor) { + if (std::is_same<Kokkos::DefaultExecutionSpace, TEST_EXECSPACE>::value) { + TestParallel_For<TEST_EXECSPACE> test; + test.simple_test(); + } +} + +TEST(TEST_CATEGORY, IncrTest_04_RangePolicy_parallelFor) { + TestParallel_For<TEST_EXECSPACE> test; + test.range_policy(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp b/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0cc9d6c5d8961730d59365508eb9716bb8a16ba8 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test05_ParallelReduce_RangePolicy.hpp @@ -0,0 +1,146 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +/// @Kokkos_Feature_Level_Required:5 +// Unit test for reduction of native data type. +// Assigns an index based value to elements of an array. +// Performs an reduction over the addition operation. 
+ +namespace Test { + +using value_type = double; +constexpr double value = 0.5; + +struct ReduceFunctor { + KOKKOS_INLINE_FUNCTION + void operator()(const int i, double &UpdateSum) const { + UpdateSum += (i + 1) * value; + } +}; + +struct NonTrivialReduceFunctor { + KOKKOS_INLINE_FUNCTION + void operator()(const int i, double &UpdateSum) const { + UpdateSum += (i + 1) * value; + } + + NonTrivialReduceFunctor() = default; + NonTrivialReduceFunctor(NonTrivialReduceFunctor const &) = default; + NonTrivialReduceFunctor(NonTrivialReduceFunctor &&) = default; + NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor &&) = default; + NonTrivialReduceFunctor &operator=(NonTrivialReduceFunctor const &) = default; + ~NonTrivialReduceFunctor() {} +}; + +template <class ExecSpace> +struct TestReduction { + value_type sum = 0.0; + const int m_num_elements; + + TestReduction(int num_elements) : m_num_elements(num_elements) {} + + // compare and equal + void check_correctness() { + const int sum_local = (m_num_elements * (m_num_elements + 1)) / 2; + + ASSERT_EQ(sum, sum_local * value) + << "The reduced value does not match the expected answer"; + } + + // Routine to allocate memory in a specific memory space. + template <class MemSpace> + value_type *allocate_mem(int N) { + return (static_cast<value_type *>( + Kokkos::kokkos_malloc<MemSpace>("deviceData", N * sizeof(value_type)))); + } + + // Routine to free the memory from a specific memory space. + template <class MemSpace> + void free_mem(value_type *data) { + Kokkos::kokkos_free<MemSpace>(data); + } + + void sum_reduction() { + sum = 0.0; + + // Creates a range policy that uses dynamic schedule. 
+ using range_policy = + Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >; + + // parallel_reduce call with range policy over num_elements number of + // iterations + Kokkos::parallel_reduce("Reduction", range_policy(0, m_num_elements), + ReduceFunctor{}, sum); + + check_correctness(); + } + + void non_trivial_sum_reduction() { + sum = 0.0; + + // Creates a range policy that uses dynamic schedule. + using range_policy = + Kokkos::RangePolicy<ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >; + + // parallel_reduce call with range policy over num_elements number of + // iterations + Kokkos::parallel_reduce("Reduction", range_policy(0, m_num_elements), + NonTrivialReduceFunctor{}, sum); + + check_correctness(); + } +}; + +TEST(TEST_CATEGORY, IncrTest_05_reduction) { + for (unsigned int i = 0; i < 100; ++i) { + TestReduction<TEST_EXECSPACE> test(i); + test.sum_reduction(); + test.non_trivial_sum_reduction(); + } +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp b/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4adf9e058fd5b1a85b3f7e24cac530876b7251f3 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp @@ -0,0 +1,267 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +/// @Kokkos_Feature_Level_Required:6 +// Unit Test for MDRangePolicy without Views up to 4 ranks. +// For each of the MDRangePolicy tests from 2-to-4 ranks, we create an equivalent +// dimensional array implemented in 1D. In each of these arrays we update the +// elements as a product of iterator indexes and a constant. At the end, we +// check for correctness. 
+ +namespace Test06 { + +using value_type = double; + +struct MDFunctor { + value_type *_data; + const value_type _delta; + const int N = 10; + const int M = 10; + + MDFunctor(value_type *data, const value_type delta) + : _data(data), _delta(delta) {} + + // 2D + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j) const { + _data[i * M + j] = i * j * _delta; + } + + // 3D + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k) const { + _data[i * M * N + j * M + k] = i * j * k * _delta; + } + + // 4D + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, const int l) const { + _data[i * M * N * M + j * M * N + k * M + l] = i * j * k * l * _delta; + } +}; + +template <class ExecSpace> +struct TestMDRangePolicy { + // Memory space type for Device and Host data + using d_memspace_type = typename ExecSpace::memory_space; + using h_memspace_type = Kokkos::HostSpace; + + // Index Type for the iterator + using int_index = Kokkos::IndexType<int>; + + // An MDRangePolicy for 2 nested loops + using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy< + ExecSpace, Kokkos::Experimental::Rank<2>, int_index>; + + // An MDRangePolicy for 3 nested loops + using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy< + ExecSpace, Kokkos::Experimental::Rank<3>, int_index>; + + // An MDRangePolicy for 4 nested loops + using MDPolicyType_4D = typename Kokkos::Experimental::MDRangePolicy< + ExecSpace, Kokkos::Experimental::Rank<4>, int_index>; + + // Device and Host Data structure pointer + value_type *deviceData, *hostData; + const value_type delta = 0.5; + const int N = 10; + const int M = 10; + + // Routine to allocate memory in a specific memory space. + template <class MemSpace> + value_type *allocate_mem(int N_) { + return (static_cast<value_type *>( + Kokkos::kokkos_malloc<MemSpace>("Data", N_ * sizeof(value_type)))); + } + + // Routine to free the memory from a specific memory space. 
+ template <class MemSpace> + void free_mem(value_type *data) { + Kokkos::kokkos_free<MemSpace>(data); + } + + // compare and equal + void compare_equal_2D() { + for (int i = 0; i < N; ++i) + for (int j = 0; j < M; ++j) ASSERT_EQ(hostData[i * M + j], i * j * delta); + } + + // compare and equal + void compare_equal_3D() { + for (int i = 0; i < N; ++i) + for (int j = 0; j < M; ++j) + for (int k = 0; k < N; ++k) + ASSERT_EQ(hostData[i * M * N + j * M + k], i * j * k * delta); + } + + // compare and equal + void compare_equal_4D() { + for (int i = 0; i < N; ++i) + for (int j = 0; j < M; ++j) + for (int k = 0; k < N; ++k) + for (int l = 0; l < M; ++l) + ASSERT_EQ(hostData[i * M * N * M + j * M * N + k * M + l], + i * j * k * l * delta); + } + + // A 2-D MDRangePolicy + void mdRange2D() { + MDPolicyType_2D mdPolicy_2D({0, 0}, {N, M}); + + // Total number of elements + int num_elements = N * M; + + // Allocate Memory for both device and host memory spaces + // Data[M*N] + deviceData = allocate_mem<d_memspace_type>(num_elements); + ASSERT_NE(deviceData, nullptr); + + hostData = allocate_mem<h_memspace_type>(num_elements); + ASSERT_NE(hostData, nullptr); + + // parallel_for call + MDFunctor Functor_2D(deviceData, delta); + Kokkos::parallel_for("MDRange2D", mdPolicy_2D, Functor_2D); + + // Copy the data back to Host memory space + Kokkos::Impl::DeepCopy<h_memspace_type, d_memspace_type>( + hostData, deviceData, num_elements * sizeof(value_type)); + + // Check if all data has been update correctly + compare_equal_2D(); + + // free the allocated memory + free_mem<d_memspace_type>(deviceData); + free_mem<h_memspace_type>(hostData); + } + + // A 3-D MDRangePolicy + void mdRange3D() { + MDPolicyType_3D mdPolicy_3D({0, 0, 0}, {N, M, N}); + + // Total number of elements + int num_elements = N * M * N; + + // Allocate Memory for both device and host memory spaces + // Data[M*N*N] + deviceData = allocate_mem<d_memspace_type>(num_elements); + ASSERT_NE(deviceData, nullptr); + + 
hostData = allocate_mem<h_memspace_type>(num_elements); + ASSERT_NE(hostData, nullptr); + + // parallel_for call + MDFunctor Functor_3D(deviceData, delta); + Kokkos::parallel_for("MDRange3D", mdPolicy_3D, Functor_3D); + + // Copy the data back to Host memory space + Kokkos::Impl::DeepCopy<h_memspace_type, d_memspace_type>( + hostData, deviceData, num_elements * sizeof(value_type)); + + // Check if all data has been update correctly + compare_equal_3D(); + + // free the allocated memory + free_mem<d_memspace_type>(deviceData); + free_mem<h_memspace_type>(hostData); + } + + // A 4-D MDRangePolicy + void mdRange4D() { + MDPolicyType_4D mdPolicy_4D({0, 0, 0, 0}, {N, M, N, M}); + + // Total number of elements + int num_elements = N * M * N * M; + + // Allocate Memory for both device and host memory spaces + // Data[M*N*N*M] + deviceData = allocate_mem<d_memspace_type>(num_elements); + ASSERT_NE(deviceData, nullptr); + + hostData = allocate_mem<h_memspace_type>(num_elements); + ASSERT_NE(hostData, nullptr); + + // parallel_for call + MDFunctor Functor_4D(deviceData, delta); + Kokkos::parallel_for("MDRange4D", mdPolicy_4D, Functor_4D); + + // Copy the data back to Host memory space + Kokkos::Impl::DeepCopy<h_memspace_type, d_memspace_type>( + hostData, deviceData, num_elements * sizeof(value_type)); + + // Check if all data has been update correctly + compare_equal_4D(); + + // free the allocated memory + free_mem<d_memspace_type>(deviceData); + free_mem<h_memspace_type>(hostData); + } +}; + +} // namespace Test06 + +namespace Test { + +// 2D MDRangePolicy +TEST(TEST_CATEGORY, IncrTest_06_mdrange2D) { + Test06::TestMDRangePolicy<TEST_EXECSPACE> test; + test.mdRange2D(); +} + +// 3D MDRangePolicy +TEST(TEST_CATEGORY, IncrTest_06_mdrange3D) { + Test06::TestMDRangePolicy<TEST_EXECSPACE> test; + test.mdRange3D(); +} + +// 4D MDRangePolicy +TEST(TEST_CATEGORY, IncrTest_06_mdrange4D) { + Test06::TestMDRangePolicy<TEST_EXECSPACE> test; + test.mdRange4D(); +} + +} // namespace 
Test diff --git a/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp b/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5166f5a9f0de05b24166161654c9eaab4ff2ad82 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp @@ -0,0 +1,207 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +/// @Kokkos_Feature_Level_Required:8 +// Unit Test for MDRangePolicy without Views uptil 4 ranks. +// For each of the MDRangePolicy test from 2-to-4 ranks, we create an equivalent +// dimensional view. In each of these views we update the +// elements as a product of iterator indexes and a constant inside a +// parallel_for lambda. At the end, we check for correctness. 
+ +namespace Test05 { + +using value_type = double; +const int N = 10; +const int M = 10; + +template <class ExecSpace> +struct TestMDRangePolicy { + // 2D View + using View_2D = typename Kokkos::View<value_type **, ExecSpace>; + using Host_View_2D = typename View_2D::HostMirror; + Host_View_2D hostDataView_2D; + + // 3D View + using View_3D = typename Kokkos::View<value_type ***, ExecSpace>; + using Host_View_3D = typename View_3D::HostMirror; + Host_View_3D hostDataView_3D; + + // 4D View + using View_4D = typename Kokkos::View<value_type ****, ExecSpace>; + using Host_View_4D = typename View_4D::HostMirror; + Host_View_4D hostDataView_4D; + + // Memory space type for Device and Host data + using d_memspace_type = typename ExecSpace::memory_space; + using h_memspace_type = Kokkos::HostSpace; + + // Index Type for the iterator + using int_index = Kokkos::IndexType<int>; + + // An MDRangePolicy for 2 nested loops + using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy< + ExecSpace, Kokkos::Experimental::Rank<2>, int_index>; + + // An MDRangePolicy for 3 nested loops + using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy< + ExecSpace, Kokkos::Experimental::Rank<3>, int_index>; + + // An MDRangePolicy for 4 nested loops + using MDPolicyType_4D = typename Kokkos::Experimental::MDRangePolicy< + ExecSpace, Kokkos::Experimental::Rank<4>, int_index>; + + // compare and equal + void compare_equal_2D() { + for (int i = 0; i < N; ++i) + for (int j = 0; j < M; ++j) ASSERT_EQ(hostDataView_2D(i, j), i * M + j); + } + + // compare and equal + void compare_equal_3D() { + for (int i = 0; i < N; ++i) + for (int j = 0; j < M; ++j) + for (int k = 0; k < N; ++k) + ASSERT_EQ(hostDataView_3D(i, j, k), i * M * N + j * N + k); + } + + // compare and equal + void compare_equal_4D() { + for (int i = 0; i < N; ++i) + for (int j = 0; j < M; ++j) + for (int k = 0; k < N; ++k) + for (int l = 0; l < M; ++l) + ASSERT_EQ(hostDataView_4D(i, j, k, l), + i * M * N * M + 
j * N * M + k * M + l); + } + + // A 2-D MDRangePolicy + void mdRange2D() { + View_2D deviceDataView_2D("deviceData_2D", N, M); + hostDataView_2D = create_mirror_view(deviceDataView_2D); + + MDPolicyType_2D mdPolicy_2D({0, 0}, {N, M}); + + Kokkos::parallel_for( + mdPolicy_2D, KOKKOS_LAMBDA(const int i, const int j) { + deviceDataView_2D(i, j) = i * M + j; + }); + + // Copy data back to host view. + Kokkos::deep_copy(hostDataView_2D, deviceDataView_2D); + + // Check if all data has been update correctly + compare_equal_2D(); + } + + // A 3-D MDRangePolicy + void mdRange3D() { + View_3D deviceDataView_3D("deviceData_3D", N, M, N); + hostDataView_3D = create_mirror_view(deviceDataView_3D); + + MDPolicyType_3D mdPolicy_3D({0, 0, 0}, {N, M, N}); + + Kokkos::parallel_for( + mdPolicy_3D, KOKKOS_LAMBDA(const int i, const int j, const int k) { + deviceDataView_3D(i, j, k) = i * M * N + j * N + k; + }); + + // Copy data back to host view. + Kokkos::deep_copy(hostDataView_3D, deviceDataView_3D); + + // Check if all data has been update correctly + compare_equal_3D(); + } + + // A 4-D MDRangePolicy + void mdRange4D() { + View_4D deviceDataView_4D("deviceData_4D", N, M, N, M); + hostDataView_4D = create_mirror_view(deviceDataView_4D); + + MDPolicyType_4D mdPolicy_4D({0, 0, 0, 0}, {N, M, N, M}); + + Kokkos::parallel_for( + mdPolicy_4D, + KOKKOS_LAMBDA(const int i, const int j, const int k, const int l) { + deviceDataView_4D(i, j, k, l) = i * M * N * M + j * N * M + k * M + l; + }); + + Kokkos::deep_copy(hostDataView_4D, deviceDataView_4D); + + // Check if all data has been update correctly + compare_equal_4D(); + } +}; + +} // namespace Test05 + +namespace Test { + +// 2D MDRangePolicy +TEST(TEST_CATEGORY, IncrTest_08_deep_copy_2D) { + { + Test05::TestMDRangePolicy<TEST_EXECSPACE> test; + test.mdRange2D(); + } +} + +// 3D MDRangePolicy +TEST(TEST_CATEGORY, IncrTest_08_deep_copy_3D) { + { + Test05::TestMDRangePolicy<TEST_EXECSPACE> test; + test.mdRange3D(); + } +} + +// 4D 
MDRangePolicy +TEST(TEST_CATEGORY, IncrTest_08_deep_copy_4D) { + { + Test05::TestMDRangePolicy<TEST_EXECSPACE> test; + test.mdRange4D(); + } +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test10_HierarchicalBasics.hpp b/packages/kokkos/core/unit_test/incremental/Test10_HierarchicalBasics.hpp new file mode 100644 index 0000000000000000000000000000000000000000..720197c3545001e3fe2bc56c9b51692f86cac6e4 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test10_HierarchicalBasics.hpp @@ -0,0 +1,110 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// @Kokkos_Feature_Level_Required:10 +// Unit test for hierarchical parallelism +// Create concurrent work hierarchically and verify if +// contributions of paticipating processing units corresponds to expected value + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class ExecSpace> +struct HierarchicalBasics { + using policy_t = Kokkos::TeamPolicy<ExecSpace>; + using team_t = typename policy_t::member_type; + + void run(const int nP, int nT) { + if (nT > ExecSpace::concurrency()) nT = ExecSpace::concurrency(); + + policy_t pol(nP, nT); + + ASSERT_EQ(pol.league_size(), nP); + ASSERT_LE(pol.team_size(), nT); + + nT = pol.team_size(); + + Kokkos::View<int **, ExecSpace> v("Array_A", nP, nT); + Kokkos::parallel_for( + "Teams", pol, KOKKOS_LAMBDA(const team_t &team) { + const int tR = team.team_rank(); + const int tS = team.team_size(); + const int lR = team.league_rank(); + const int lS = team.league_size(); + if (lR < lS) { + v(lR, tR) = lR * tS + tR; + } else { + v(lR, tR) = 100000; + } + }); + Kokkos::fence(); + auto h_v = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); + + size_t check = 0; + size_t ref = nP * nT; + for (int i = 0; i < nP; ++i) + for (int j = 0; j < nT; ++j) check += h_v(i, j); + + ASSERT_EQ(check, ref * (ref - 
1) / 2); + } +}; + +TEST(TEST_CATEGORY, IncrTest_10_Hierarchical_Basics) { + HierarchicalBasics<TEST_EXECSPACE> test; + + // OpenMPTarget backend only accepts >= 32 threads per team +#if defined(KOKKOS_ENABLE_OPENMPTARGET) + test.run(1, 32); + test.run(8, 64); + test.run(11, 128); +#else + test.run(1, 4); + test.run(8, 16); + test.run(11, 13); +#endif +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test11a_ParallelFor_TeamThreadRange.hpp b/packages/kokkos/core/unit_test/incremental/Test11a_ParallelFor_TeamThreadRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fef4d9c6906454568d963f7f9053fc12c39160e8 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test11a_ParallelFor_TeamThreadRange.hpp @@ -0,0 +1,99 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// @Kokkos_Feature_Level_Required:11 +// Unit test for hierarchical parallelism +// Create concurrent work hierarchically and verify if +// contributions of paticipating processing units corresponds to expected value + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class ExecSpace> +struct Hierarchical_ForLoop_A { + void run(const int pN, const int sX, const int sY) { + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + using viewDataType = Kokkos::View<int **, ExecSpace>; + viewDataType v("Matrix", sX, sY); + + Kokkos::parallel_for( + "Team", team_policy(pN, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type &team) { + const int n = team.league_rank(); + const int ls = team.league_size(); + + const int startDim1 = n * (int)(sX / ls); + const int modDim1 = n == ls - 1 ? 
sX % ls : 0; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, v.extent(1)), [=](const int m) { + for (int i = startDim1; + i < (startDim1 + (int)(sX / ls) + modDim1); ++i) + v(i, m) = i * v.extent(1) + m; + }); + }); + + Kokkos::fence(); + auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); + + long long int check = 0; + const long long int s = sY * sX; + for (int i = 0; i < sX; ++i) + for (int j = 0; j < sY; ++j) check += v_H(i, j); + ASSERT_EQ(check, s * (s - 1) / 2); + } +}; + +TEST(TEST_CATEGORY, IncrTest_11a_Hierarchical_ForLoop) { + Hierarchical_ForLoop_A<TEST_EXECSPACE> test; + test.run(4, 5, 200); + test.run(4, 7, 19); + test.run(14, 277, 321); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test11b_ParallelFor_TeamVectorRange.hpp b/packages/kokkos/core/unit_test/incremental/Test11b_ParallelFor_TeamVectorRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a81b474a64f9db4a2dee93ab0a83dc7c067de766 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test11b_ParallelFor_TeamVectorRange.hpp @@ -0,0 +1,99 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// @Kokkos_Feature_Level_Required:11 +// Unit test for hierarchical parallelism +// Create concurrent work hierarchically and verify if +// contributions of paticipating processing units corresponds to expected value + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class ExecSpace> +struct Hierarchical_ForLoop_B { + void run(const int pN, const int sX, const int sY) { + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + using viewDataType = Kokkos::View<int **, ExecSpace>; + viewDataType v("Matrix", sX, sY); + + Kokkos::parallel_for( + "Team", team_policy(pN, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type &team) { + const int n = team.league_rank(); + const int ls = team.league_size(); + + const int startDim1 = n * (int)(sX / ls); + const int modDim1 
= n == ls - 1 ? sX % ls : 0; + + Kokkos::parallel_for( + Kokkos::TeamVectorRange(team, v.extent(1)), [=](const int m) { + for (int i = startDim1; + i < (startDim1 + (int)(sX / ls) + modDim1); ++i) + v(i, m) = i * v.extent(1) + m; + }); + }); + + Kokkos::fence(); + auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); + + long long int check = 0; + const long long int s = sY * sX; + for (int i = 0; i < sX; ++i) + for (int j = 0; j < sY; ++j) check += v_H(i, j); + ASSERT_EQ(check, s * (s - 1) / 2); + } +}; + +TEST(TEST_CATEGORY, IncrTest_11b_Hierarchical_ForLoop) { + Hierarchical_ForLoop_B<TEST_EXECSPACE> test; + test.run(1, 6, 400); + test.run(6, 7, 19); + test.run(12, 277, 321); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test11c_ParallelFor_ThreadVectorRange.hpp b/packages/kokkos/core/unit_test/incremental/Test11c_ParallelFor_ThreadVectorRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..814ab5fda660fd53c3bbe4e71d252c7e63ec9c73 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test11c_ParallelFor_ThreadVectorRange.hpp @@ -0,0 +1,105 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// @Kokkos_Feature_Level_Required:11 +// Unit test for hierarchical parallelism +// Create concurrent work hierarchically and verify if +// contributions of paticipating processing units corresponds to expected value + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class ExecSpace> +struct Hierarchical_ForLoop_C { + void run(const int pN, const int sX, const int sY, const int sZ) { + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + using viewDataType = Kokkos::View<size_t ***, ExecSpace>; + viewDataType v("Matrix", sX, sY, sZ); + + Kokkos::parallel_for( + "Team", team_policy(pN, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type &team) { + int n = team.league_rank(); + int ls = team.league_size(); + + int startDim1 = n * (int)(sX / ls); + int modDim1 = n == ls - 1 ? 
sX % ls : 0; + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, v.extent(1)), [&](const int m) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, v.extent(2)), + [&](const int k) { + for (int i = startDim1; + i < (startDim1 + (int)(sX / ls) + modDim1); ++i) + v(i, m, k) = + i * v.extent(1) * v.extent(2) + m * v.extent(2) + k; + }); + }); + }); + + Kokkos::fence(); + auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); + + size_t check = 0; + const size_t s = sX * sY * sZ; + for (int i = 0; i < sX; ++i) + for (int j = 0; j < sY; ++j) + for (int k = 0; k < sZ; ++k) check += v_H(i, j, k); + ASSERT_EQ(check, s * (s - 1) / 2); + } +}; + +TEST(TEST_CATEGORY, IncrTest_11c_Hierarchical_ForLoop) { + Hierarchical_ForLoop_C<TEST_EXECSPACE> test; + test.run(4, 16, 16, 16); + test.run(8, 12, 333, 16); + test.run(12, 277, 321, 345); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5bf1860d8e4a6bcf739656bdc7e1f790ebf60512 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp @@ -0,0 +1,137 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// @Kokkos_Feature_Level_Required:12 +// Unit test for hierarchical parallelism +// Create concurrent work hierarchically and verify if +// contributions of paticipating processing units corresponds to expected value +// Use a scratch pad memory for each team +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class ExecSpace> +struct ThreadScratch { + using policy_t = Kokkos::TeamPolicy<ExecSpace>; + using team_t = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + using data_t = Kokkos::View<size_t **, ExecSpace>; + + using scratch_t = Kokkos::View<size_t *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >; + + int sX, sY; + data_t v; + + const int scratch_level = 1; + KOKKOS_FUNCTION + void operator()(const team_t &team) const { + // Allocate and use scratch pad memory + scratch_t v_S(team.thread_scratch(scratch_level), sY); + int n = team.league_rank(); + + for (int i = 0; i < sY; ++i) v_S(i) = 0; + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, sX), [&](const int m) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, sY), + [&](const int k) { v_S(k) += sX * sY * n + sY * m + k; }); + }); + + team.team_barrier(); + + for (int i = 0; i < sY; ++i) { + v(n, team.team_rank()) += v_S(i); + } + } + + void run(const int pN, const int sX_, const int sY_) { + sX = sX_; + sY = sY_; + + int scratchSize = scratch_t::shmem_size(sY); + // So this works with deprecated code enabled: + policy_t policy = + policy_t(pN, Kokkos::AUTO) + .set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize)); + + int max_team_size = policy.team_size_max(*this, Kokkos::ParallelForTag()); + v = data_t("Matrix", pN, max_team_size); + + Kokkos::parallel_for( + "Test12a_ThreadScratch", + policy_t(pN, max_team_size) + .set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize)), + *this); + + 
Kokkos::fence(); + auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); + + size_t check = 0; + const size_t s = pN * sX * sY; + for (int n = 0; n < pN; ++n) + for (int m = 0; m < max_team_size; ++m) { + check += v_H(n, m); + } + ASSERT_EQ(s * (s - 1) / 2, check); + } +}; + +TEST(TEST_CATEGORY, IncrTest_12a_ThreadScratch) { + ThreadScratch<TEST_EXECSPACE> test; + // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to + // pass in the Release and RelWithDebInfo builds. Does not need the team_size + // to be a multiple of 32 for the Debug builds. +#ifdef KOKKOS_ENABLE_OPENMPTARGET + test.run(1, 32, 9); + test.run(2, 64, 22); + test.run(14, 128, 321); +#else + test.run(1, 55, 9); + test.run(2, 4, 22); + test.run(14, 277, 321); +#endif +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b34f652e76d919f14c3afed0656b8bcd86dbc27f --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp @@ -0,0 +1,124 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// @Kokkos_Feature_Level_Required:12 +// Unit test for hierarchical parallelism +// Create concurrent work hierarchically and verify if +// contributions of paticipating processing units corresponds to expected value +// Use a scratch pad memory for each team +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace Test { + +template <class ExecSpace> +struct TeamScratch { + void run(const int pN, const int sX, const int sY) { + using policy_t = Kokkos::TeamPolicy<ExecSpace>; + using team_t = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + using data_t = Kokkos::View<size_t **, ExecSpace>; + data_t v("Matrix", pN, sX); + + using scratch_t = Kokkos::View<size_t **, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Unmanaged> >; + int scratchSize = scratch_t::shmem_size(sX, sY); + + const int scratch_level = 1; + + Kokkos::parallel_for( + "Team", + policy_t(pN, Kokkos::AUTO) + .set_scratch_size(scratch_level, Kokkos::PerTeam(scratchSize)), + KOKKOS_LAMBDA(const team_t &team) { + // Allocate and use scratch pad memory + scratch_t v_S(team.team_scratch(scratch_level), sX, sY); + int n = team.league_rank(); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, sX), [&](const int m) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, sY), [&](const int k) { + v_S(m, k) = v_S.extent(0) * v_S.extent(1) * n + + v_S.extent(1) * m + k; + }); + }); + + team.team_barrier(); + + // Sum up contributions and reduce by one dimension + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, sX), + [&](const int m) { + for (int i = 0; i < sY; ++i) + v(n, m) += v_S(m, i); + }); + }); + + Kokkos::fence(); + auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); + + size_t check = 0; + const size_t s = pN * sX * sY; + for (int n = 0; n < pN; ++n) + for (int m = 0; m < sX; ++m) check += v_H(n, m); + ASSERT_EQ(check, s * (s - 1) / 
2); + } +}; + +TEST(TEST_CATEGORY, IncrTest_12b_TeamScratch) { + TeamScratch<TEST_EXECSPACE> test; + // FIXME_OPENMPTARGET - team_size has to be a multiple of 32 for the tests to + // pass in the Release and RelWithDebInfo builds. Does not need the team_size + // to be a multiple of 32 for the Debug builds. +#ifdef KOKKOS_ENABLE_OPENMPTARGET + test.run(1, 32, 4); + test.run(4, 64, 10); + test.run(14, 128, 20); +#else + test.run(1, 4, 4); + test.run(4, 7, 10); + test.run(14, 277, 321); +#endif +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test13a_ParallelRed_TeamThreadRange.hpp b/packages/kokkos/core/unit_test/incremental/Test13a_ParallelRed_TeamThreadRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e32b0ed0fc92684072cf004b64240093e1b981fd --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test13a_ParallelRed_TeamThreadRange.hpp @@ -0,0 +1,105 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// @Kokkos_Feature_Level_Required:13 +// Unit test for hierarchical parallelism +// Create concurrent work hierarchically and verify if +// sum of created processing units corresponds to expected value + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +// Degrees of concurrency per nesting level + +using SCALAR_TYPE = int; + +namespace Test { + +template <class ExecSpace> +struct Hierarchical_Red_A { + void run(const int pN, const int sX) { + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + using viewDataType = Kokkos::View<SCALAR_TYPE *, ExecSpace>; + viewDataType v("Vector", pN); + + Kokkos::parallel_for( + "Team", team_policy(pN, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type &team) { + const int n = team.league_rank(); + SCALAR_TYPE out = 0; + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, sX), + [=](const int i, SCALAR_TYPE &tmp) { + tmp += n * v.extent(0) + i; + }, + out); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { v(n) += out; }); + }); + + 
Kokkos::fence(); + auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); + + SCALAR_TYPE check = 0; + SCALAR_TYPE ref = 0; + for (int i = 0; i < pN; ++i) { + check += v_H(i); + ref += + (sX + i * pN) * (sX + i * pN - 1) / 2 - ((i * pN) * (i * pN - 1) / 2); + } + ASSERT_EQ(check, ref); + } +}; + +TEST(TEST_CATEGORY, IncrTest_13a_Hierarchical_Red) { + Hierarchical_Red_A<TEST_EXECSPACE> test; + test.run(4, 16); + test.run(2, 39); + test.run(39, 3); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test13b_ParallelRed_TeamVectorRange.hpp b/packages/kokkos/core/unit_test/incremental/Test13b_ParallelRed_TeamVectorRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0d37703e2b73d5ca22e73f2bfbd2f553e1fe0225 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test13b_ParallelRed_TeamVectorRange.hpp @@ -0,0 +1,102 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// @Kokkos_Feature_Level_Required:13 +// Unit test for hierarchical parallelism +// Create concurrent work hierarchically and verify if +// sum of created processing units corresponds to expected value + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +using SCALAR_TYPE = int; + +namespace Test { + +template <class ExecSpace> +struct Hierarchical_Red_B { + void run(const int pN, const int sX) { + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + using viewDataType = Kokkos::View<SCALAR_TYPE *, ExecSpace>; + viewDataType v("Vector", pN); + + Kokkos::parallel_for( + "Team", team_policy(pN, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type &team) { + const int n = team.league_rank(); + SCALAR_TYPE out = 0; + + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, sX), + [=](const int i, SCALAR_TYPE &tmp) { + tmp += n * v.extent(0) + i; + }, + out); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { v(n) += out; }); + }); + + Kokkos::fence(); + auto v_H = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); + + SCALAR_TYPE check = 0; + SCALAR_TYPE ref = 0; + for (int i = 0; i < pN; ++i) { + check += v_H(i); + ref += ((sX + i * pN) * (sX + i * pN - 1) - (i * pN * (i * pN - 1))) / 2; + } + ASSERT_EQ(check, ref); + } +}; + +TEST(TEST_CATEGORY, IncrTest_13b_Hierarchical_Red) { + Hierarchical_Red_B<TEST_EXECSPACE> test; + test.run(4, 16); + test.run(2, 39); + test.run(39, 3); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp b/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp new file mode 100644 index 0000000000000000000000000000000000000000..26f9d000914393a8af86d9ba1bc4bb5658a7244e --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test13c_ParallelRed_ThreadVectorRange.hpp @@ -0,0 +1,112 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// @Kokkos_Feature_Level_Required:13 +// Unit test for hierarchical parallelism +// Create concurrent work hierarchically and verify if +// sum of created processing units corresponds to expected value + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +using SCALAR_TYPE = int; + +namespace Test { + +template <class ExecSpace> +struct Hierarchical_Red_C { + void run(const int pN, const int sX, const int sY) { + using team_policy = Kokkos::TeamPolicy<ExecSpace>; + using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type; + + using viewDataType = Kokkos::View<SCALAR_TYPE *, ExecSpace>; + viewDataType v("Vector", pN); + + Kokkos::parallel_for( + "Team", team_policy(pN, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type &team) { + int n = team.league_rank(); + SCALAR_TYPE out = 0; + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, sX), + [=](const int i, SCALAR_TYPE &tmp) { + SCALAR_TYPE out_inner = 0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, sY), + [=](const int k, int &tmp_inner) { + tmp_inner += n * sX * 
v.extent(0) + sX * i + k; + }, + out_inner); + + Kokkos::single(Kokkos::PerThread(team), + [&]() { tmp += out_inner; }); + }, + out); + + Kokkos::single(Kokkos::PerTeam(team), [&]() { v(n) += out; }); + }); + + Kokkos::fence(); + auto v_H = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); + + SCALAR_TYPE check = 0; + SCALAR_TYPE ref = 0; + for (int i = 0; i < pN; ++i) { + check += v_H(i); + for (int j = 0; j < sX; ++j) + for (int k = 0; k < sY; ++k) ref += i * sX * pN + sX * j + k; + } + ASSERT_EQ(check, ref); + } +}; + +TEST(TEST_CATEGORY, IncrTest_13c_Hierarchical_Red) { + Hierarchical_Red_C<TEST_EXECSPACE> test; + test.run(1, 4, 8); + test.run(2, 39, 12); + test.run(39, 3, 235); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d227e834dc64607c4ca01127228527dc71e9e918 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp @@ -0,0 +1,182 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// @Kokkos_Feature_Level_Required:14 +// Incremental test for MDRange reduction . +// Reduction is tested with scalar, view and a customized reduction. + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +namespace Test { +using value_type = double; +const int N = 10; +const int M = 10; + +// A structure for complex number. 
+struct MyComplex { + value_type _re, _im; + + MyComplex() = default; + + KOKKOS_INLINE_FUNCTION + MyComplex(value_type re, value_type im) : _re(re), _im(im) {} + + KOKKOS_INLINE_FUNCTION + MyComplex(const MyComplex& src) : _re(src._re), _im(src._im) {} + + KOKKOS_INLINE_FUNCTION + void operator+=(const MyComplex& src) { + _re += src._re; + _im += src._im; + } + + KOKKOS_INLINE_FUNCTION + void operator+=(const volatile MyComplex& src) volatile { + _re += src._re; + _im += src._im; + } +}; + +template <class ExecSpace> +struct TestMDRangeReduce { + // 1D View of double + using View_1D = typename Kokkos::View<value_type*, ExecSpace>; + + // 2D View of double + using View_2D = typename Kokkos::View<value_type**, ExecSpace>; + + // Index Type for the iterator + using int_index = Kokkos::IndexType<int>; + + // An MDRangePolicy for 2 nested loops + using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy< + ExecSpace, Kokkos::Experimental::Rank<2>, int_index>; + + // 1D - complex View + using Complex_View_1D = typename Kokkos::View<MyComplex*, ExecSpace>; + + // Reduction when ExecPolicy = MDRangePolicy and ReducerArgument = + // scalar/1-element view + void reduce_MDRange() { + View_2D d_data("d_data", N, M); + + MDPolicyType_2D mdPolicy_2D({0, 0}, {N, M}); + + // Store the reduced value. + value_type d_result = 0.0, h_result = 0.0; + Kokkos::View<value_type, ExecSpace> d_resultView("result View"); + + // Compute reference solution on the host. + for (int i = 0; i < N; ++i) + for (int j = 0; j < M; ++j) h_result += i * j; + h_result *= 0.5; + + // Fill data. + Kokkos::parallel_for( + mdPolicy_2D, KOKKOS_LAMBDA(const int i, const int j) { + d_data(i, j) = i * j * 0.5; + }); + + // Parallel reduce on a scalar. + Kokkos::parallel_reduce( + mdPolicy_2D, + KOKKOS_LAMBDA(const int i, const int j, value_type& update_value) { + update_value += d_data(i, j); + }, + d_result); + + // Parallel reduce on a view. 
+ Kokkos::parallel_reduce( + mdPolicy_2D, + KOKKOS_LAMBDA(const int i, const int j, value_type& update_value) { + update_value += d_data(i, j); + }, + d_resultView); + + // Check correctness. + ASSERT_EQ(h_result, d_result); + + // Copy view back to host. + value_type view_result = 0.0; + Kokkos::deep_copy(view_result, d_resultView); + ASSERT_EQ(h_result, view_result); + } + + // Custom Reduction + void reduce_custom() { + Complex_View_1D d_data("complex array", N); + MyComplex result(0.0, 0.0); + int sum = 0; + + // Fill data + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), KOKKOS_LAMBDA(const int i) { + d_data(i) = MyComplex(i * 0.5, -i * 0.5); + }); + + // Reduction for complex number. + Kokkos::parallel_reduce( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int i, MyComplex& update_value) { + update_value += d_data(i); + }, + result); + + // Correctness Check + for (int i = 0; i < N; ++i) sum += i; + + ASSERT_EQ(result._re, sum * 0.5); + ASSERT_EQ(result._im, -sum * 0.5); + } +}; + +// Reductions tests for MDRange policy and customized reduction. +TEST(TEST_CATEGORY, incr_14_MDrangeReduce) { + TestMDRangeReduce<TEST_EXECSPACE> test; + test.reduce_MDRange(); + test.reduce_custom(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp b/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e1f5e3767cbc2d45f52ab41dd7220ba68eb4090b --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test16_ParallelScan.hpp @@ -0,0 +1,99 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <gtest/gtest.h> + +/// @Kokkos_Feature_Level_Required:16 +// Incremental test for parallel_scan. +// perform scan on a 1D view of double's and check for correctness. 
+ +namespace Test { + +using value_type = double; +const int N = 10; + +template <class ExecSpace> +struct TestScan { + // 1D View of double + using View_1D = typename Kokkos::View<value_type *, ExecSpace>; + + void parallel_scan() { + View_1D d_data("data", N); + + // Initialize data. + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int i) { d_data(i) = i * 0.5; }); + + // Exclusive parallel_scan call. + Kokkos::parallel_scan( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int i, value_type &update_value, const bool final) { + const value_type val_i = d_data(i); + if (final) d_data(i) = update_value; + + update_value += val_i; + }); + + // Copy back the data. + auto h_data = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d_data); + + // Check Correctness + ASSERT_EQ(h_data(0), 0.0); + value_type upd = h_data(0); + for (int i = 1; i < N; ++i) { + upd += (i - 1) * 0.5; + ASSERT_EQ(h_data(i), upd); + } + } +}; + +TEST(TEST_CATEGORY, IncrTest_16_parallelscan) { + TestScan<TEST_EXECSPACE> test; + test.parallel_scan(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/incremental/Test17_CompleteAtomic.hpp b/packages/kokkos/core/unit_test/incremental/Test17_CompleteAtomic.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6ba5adc618717647b655c6f9654e291cbbf9cb56 --- /dev/null +++ b/packages/kokkos/core/unit_test/incremental/Test17_CompleteAtomic.hpp @@ -0,0 +1,126 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <array> +#include <random> +#include <gtest/gtest.h> + +/// @Kokkos_Feature_Level_Required:17 +// Incremental test for atomic views. +// In this test we sort N integers into num_buckets number of buckets based on +// their rermainder, i.e., a histogram based on remainder. Since the number of +// integers is greater than the number of buckets, we use atomic views for the +// sorted histogram. 
+ +namespace Test { + +using value_type = int; +const int N = 1000; +const int num_buckets = 10; + +template <class ExecSpace> +struct TestAtomicView { + // 1D View of int + using View = typename Kokkos::View<value_type *, ExecSpace>; + + // 1D atomic view + using atomic_view = + typename Kokkos::View<value_type *, ExecSpace, + Kokkos::MemoryTraits<Kokkos::Atomic> >; + + void atomicView() { + // Use default_random_engine object to introduce randomness. + std::default_random_engine generator; + // Initialize uniform_int_distribution class. + std::uniform_int_distribution<int> distribution(0, N); + + // Device and Host views of N number of integers + View d_data("deviceData_1D", N); + auto h_data = create_mirror_view(d_data); + + // Atomic Device and Host views of histogram + atomic_view d_hist("histogram", num_buckets); + auto h_hist = create_mirror_view(d_hist); + + // An array to store correct results for verification + std::array<int, num_buckets> correct_results; + + // Initialize host side histogram arrays + for (int i = 0; i < num_buckets; ++i) { + h_hist(i) = 0; + correct_results[i] = 0; + } + + // Fill host data with integers from the distribution object. + for (int i = 0; i < N; ++i) h_data(i) = distribution(generator); + + // Copy data from host to device + Kokkos::deep_copy(d_data, h_data); + Kokkos::deep_copy(d_hist, h_hist); + + // Update histogram + Kokkos::parallel_for( + Kokkos::RangePolicy<ExecSpace>(0, N), + KOKKOS_LAMBDA(const int i) { d_hist(d_data(i) % num_buckets)++; }); + + // Perform the same computation on host for correctness test. 
+ for (int i = 0; i < N; ++i) correct_results[h_data(i) % num_buckets]++; + + // Copy the histogram back to host + Kokkos::deep_copy(h_hist, d_hist); + + // Validate results + for (int i = 0; i < num_buckets; ++i) + ASSERT_EQ(correct_results[i], h_hist(i)); + } +}; + +// atomic view tests +TEST(TEST_CATEGORY, incr_17_atomicView) { + TestAtomicView<TEST_EXECSPACE> test; + test.atomicView(); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ce8ee40d454051e0edaff4ba25db390e7dd056bf --- /dev/null +++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp @@ -0,0 +1,109 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_OPENMP_HPP +#define KOKKOS_TEST_OPENMP_HPP + +#include <gtest/gtest.h> + +#include <Kokkos_Macros.hpp> + +#ifdef KOKKOS_LAMBDA +#undef KOKKOS_LAMBDA +#endif +#define KOKKOS_LAMBDA [=] + +#include <Kokkos_Core.hpp> + +#include <TestViewMapping.hpp> +#include <TestViewAPI.hpp> +#include <TestViewOfClass.hpp> +#include <TestViewSubview.hpp> +#include <TestAtomic.hpp> +#include <TestAtomicOperations.hpp> +#include <TestAtomicViews.hpp> +#include <TestRange.hpp> +#include <TestTeam.hpp> +#include <TestReduce.hpp> +#include <TestScan.hpp> +#include <TestAggregate.hpp> +#include <TestCompilerMacros.hpp> +#include <TestTaskScheduler.hpp> +#include <TestMemoryPool.hpp> +#include <TestCXX11.hpp> +#include <TestCXX11Deduction.hpp> +#include <TestTeamVector.hpp> +#include <TestTemplateMetaFunctions.hpp> +#include <TestPolicyConstruction.hpp> +#include <TestMDRange.hpp> +#include <TestConcurrentBitset.hpp> + +namespace Test { + +class openmp : public ::testing::Test { + protected: + static void SetUpTestCase() { + int threads_count = 0; +#pragma omp 
parallel + { +#pragma omp atomic + ++threads_count; + } + + if (threads_count > 3) { + threads_count /= 2; + } + + Kokkos::OpenMP::initialize(threads_count); + Kokkos::print_configuration(std::cout, true); + + srand(10231); + } + + static void TearDownTestCase() { Kokkos::OpenMP::finalize(); } +}; + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e5ba9e8738275b4163a787518678c6615f91f0f7 --- /dev/null +++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Graph.cpp @@ -0,0 +1,47 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestOpenMP_Category.hpp> +#include <TestGraph.hpp> diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c3ee67673912bb8c8f022d03322d6e8b69adfd72 --- /dev/null +++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp @@ -0,0 +1,89 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
+ Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestOpenMP_Category.hpp>
+#include <omp.h>
+
+namespace Test {
+
+// Test that raw OpenMP thread counts agree with Kokkos' reported concurrency
+// before, during, and after Kokkos initialize/finalize.
+TEST(openmp, raw_openmp_interop) { + int count = 0; + int num_threads, concurrency; +#pragma omp parallel + { +#pragma omp atomic + count++; + if (omp_get_thread_num() == 0) num_threads = omp_get_num_threads(); + } + + ASSERT_EQ(count, num_threads); + + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + + count = 0; +#pragma omp parallel + { +#pragma omp atomic + count++; + } + + concurrency = Kokkos::OpenMP::concurrency(); + ASSERT_EQ(count, concurrency); + + Kokkos::finalize(); + + count = 0; +#pragma omp parallel + { +#pragma omp atomic + count++; + } + + ASSERT_EQ(count, concurrency); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp new file mode 100644 index 0000000000000000000000000000000000000000..902150da5806d27768603ac71207ce2aaef5551f --- /dev/null +++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp @@ -0,0 +1,132 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestOpenMP_Category.hpp> +#include <Kokkos_Core.hpp> + +#include <mutex> + +namespace Test { + +TEST(openmp, partition_master) { + using Mutex = Kokkos::Experimental::MasterLock<Kokkos::OpenMP>; + + Mutex mtx; + int errors = 0; + + auto master = [&errors, &mtx](int /*partition_id*/, int /*num_partitions*/) { + const int pool_size = Kokkos::OpenMP::impl_thread_pool_size(); + + { + std::unique_lock<Mutex> lock(mtx); + if (Kokkos::OpenMP::in_parallel()) { + ++errors; + } + if (Kokkos::OpenMP::impl_thread_pool_rank() != 0) { + ++errors; + } + } + + { + int local_errors = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy<Kokkos::OpenMP>(0, 1000), + [pool_size](const int, int& errs) { + if (Kokkos::OpenMP::impl_thread_pool_size() != pool_size) { + ++errs; + } + }, + local_errors); + Kokkos::atomic_add(&errors, local_errors); + } + + 
Kokkos::Experimental::UniqueToken<Kokkos::OpenMP> token; + + Kokkos::View<int*, Kokkos::OpenMP> count("", token.size()); + + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0, 1000), + [=](const int) { + int i = token.acquire(); + ++count[i]; + token.release(i); + }); + + Kokkos::View<int, Kokkos::OpenMP> sum(""); + Kokkos::parallel_for( + Kokkos::RangePolicy<Kokkos::OpenMP>(0, token.size()), + [=](const int i) { Kokkos::atomic_add(sum.data(), count[i]); }); + + if (sum() != 1000) { + Kokkos::atomic_add(&errors, 1); + } + }; + + master(0, 1); + + ASSERT_EQ(errors, 0); + + Kokkos::OpenMP::partition_master(master); + ASSERT_EQ(errors, 0); + + Kokkos::OpenMP::partition_master(master, 4, 0); + ASSERT_EQ(errors, 0); + + Kokkos::OpenMP::partition_master(master, 0, 4); + ASSERT_EQ(errors, 0); + + Kokkos::OpenMP::partition_master(master, 2, 2); + ASSERT_EQ(errors, 0); + + Kokkos::OpenMP::partition_master(master, 8, 0); + ASSERT_EQ(errors, 0); + + Kokkos::OpenMP::partition_master(master, 0, 8); + ASSERT_EQ(errors, 0); + + Kokkos::OpenMP::partition_master(master, 8, 8); + ASSERT_EQ(errors, 0); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2ddc6a58419040f912ebbd0f9d4f60ae113b9368 --- /dev/null +++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp @@ -0,0 +1,47 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestOpenMP_Category.hpp> +#include <TestTaskScheduler.hpp> diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp new file mode 100644 index 0000000000000000000000000000000000000000..edc1c24ddf298f8f00a3a451df8ca75a13cfa46c --- /dev/null +++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp @@ -0,0 +1,108 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_OPENMPTARGET_HPP +#define KOKKOS_TEST_OPENMPTARGET_HPP + +#include <gtest/gtest.h> + +#include <Kokkos_Macros.hpp> + +#ifdef KOKKOS_LAMBDA +#undef KOKKOS_LAMBDA +#endif +#define KOKKOS_LAMBDA [=] + +#include <Kokkos_Core.hpp> + +//#include <TestViewAPI.hpp> +//#include <TestViewOfClass.hpp> +//#include <TestViewSubview.hpp> +//#include <TestAtomic.hpp> +//#include <TestAtomicOperations.hpp> +//#include <TestAtomicViews.hpp> +#include <TestRange.hpp> +#include <TestTeam.hpp> +//#include <TestReduce.hpp> +//#include <TestScan.hpp> +//#include <TestAggregate.hpp> +//#include <TestCompilerMacros.hpp> + +// TODO enable task scheduler tests for openmptarget +//#include <TestTaskScheduler.hpp> + +//#include <TestMemoryPool.hpp> +//#include <TestCXX11.hpp> +//#include <TestCXX11Deduction.hpp> +#include <TestTeamVector.hpp> +//#include <TestTemplateMetaFunctions.hpp> +//#include <TestPolicyConstruction.hpp> +//#include <TestMDRange.hpp> + +namespace Test { + +class openmptarget : public ::testing::Test { + protected: + static void SetUpTestCase() { + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = + Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned openmptarget_per_core = + 
Kokkos::hwloc::get_available_openmptarget_per_core(); + + unsigned openmptarget_count = 0; + + openmptarget_count = std::max(1u, numa_count) * + std::max(2u, cores_per_numa * openmptarget_per_core); + + Kokkos::OpenMPTarget::initialize(openmptarget_count); + Kokkos::print_configuration(std::cout, true /* detailed */); + } + + static void TearDownTestCase() { Kokkos::OpenMPTarget::finalize(); } +}; + +} // namespace Test + +#endif diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2dba1c265cab5cfa4b982bf43f920ec666fcaa5 --- /dev/null +++ b/packages/kokkos/core/unit_test/serial/TestSerial_Graph.cpp @@ -0,0 +1,47 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestSerial_Category.hpp> +#include <TestGraph.hpp> diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c08efbf447b6fe055f7f01e619b2a0b02de0cdf8 --- /dev/null +++ b/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp @@ -0,0 +1,47 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestSerial_Category.hpp> +#include <TestTaskScheduler.hpp> diff --git a/packages/kokkos/core/unit_test/standalone/Makefile b/packages/kokkos/core/unit_test/standalone/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..d60422233d69b51761335caf9c53a1860416aa42 --- /dev/null +++ b/packages/kokkos/core/unit_test/standalone/Makefile @@ -0,0 +1,55 @@ +KOKKOS_DEVICES=OpenMP +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Kepler35" + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../../../ +endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)/../*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = test.cuda +else +CXX = g++ +EXE = test.host +endif + +CXXFLAGS ?= -O0 -g -rdynamic +override CXXFLAGS += -I$(MAKEFILE_PATH) -I$(KOKKOS_PATH)/core/unit_test -I$(KOKKOS_PATH)/tpls/gtest -DTESTFILE=$(TESTFILE) +#SRC += $(KOKKOS_PATH)/tpls/gtest/gtest/gtest-all.cc + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = -rdynamic + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) gtest-all.o + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) gtest-all.o -lpthread -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) + +gtest-all.o:$(KOKKOS_PATH)/tpls/gtest/gtest/gtest-all.cc + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c 
$(KOKKOS_PATH)/tpls/gtest/gtest/gtest-all.cc diff --git a/packages/kokkos/core/unit_test/standalone/UnitTestMainInit.cpp b/packages/kokkos/core/unit_test/standalone/UnitTestMainInit.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c1f7398c166bcf738111b1674a83a919293faf6d --- /dev/null +++ b/packages/kokkos/core/unit_test/standalone/UnitTestMainInit.cpp @@ -0,0 +1,85 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> +#include <cstdlib> + +#include <Kokkos_Core.hpp> + +#ifdef KOKKOS_ENABLE_CUDA +#include <TestCuda_Category.hpp> +#endif +#ifdef KOKKOS_ENABLE_HIP +#include <TestHIP_Category.hpp> +#endif +#ifdef KOKKOS_ENABLE_SYCL +#include <TestSYCL_Category.hpp> +#endif +#ifdef KOKKOS_ENABLE_OPENMP +#include <TestOpenMP_Category.hpp> +#endif +#ifdef KOKKOS_ENABLE_THREADS +#include <TestThreads_Category.hpp> +#endif +#ifdef KOKKOS_ENABLE_HPX +#include <TestHPX_Category.hpp> +#endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET +#include <TestOpenMPTarget_Category.hpp> +#endif +#ifndef TEST_EXECSPACE +#ifdef KOKKOS_ENABLE_SERIAL +#include <TestSerial_Category.hpp> +#endif +#endif +#include <TestReducers_d.hpp> + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + ::testing::InitGoogleTest(&argc, argv); + + int result = RUN_ALL_TESTS(); + Kokkos::finalize(); + return result; +} diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp new file mode 100644 index 0000000000000000000000000000000000000000..018855963d35f8fef81a93985811dcc3d9b239fc --- /dev/null +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestSYCL_Category.hpp> + +#include <array> + +namespace Test { + +// Test whether allocations survive Kokkos initialize/finalize if done via Raw +// SYCL. +TEST(sycl, raw_sycl_interop) { + sycl::default_selector device_selector; + sycl::queue queue(device_selector); + constexpr int n = 100; + int* p = sycl::malloc_device<int>(n, queue); + + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + { + TEST_EXECSPACE space(queue); + Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, n); + Kokkos::deep_copy(space, v, 5); + } + Kokkos::finalize(); + + queue.submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>(n), [=](int idx) { p[idx] += idx; }); + }); + queue.wait_and_throw(); + + std::array<int, n> h_p; + queue.memcpy(h_p.data(), p, sizeof(int) * n); + queue.wait_and_throw(); + sycl::free(p, queue); + + int64_t sum = 0; + int64_t sum_expect = 0; + for (int i = 0; i < n; i++) { + sum += h_p[i]; + sum_expect += 5 + i; + } + + ASSERT_EQ(sum, sum_expect); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c12c5c07295d73ddb0600d366f9c50faa6ba96df --- /dev/null +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp @@ -0,0 +1,120 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <TestSYCL_Category.hpp> + +#include <array> + +namespace Test { + +// Test whether external allocations can be accessed by the default queue. 
+TEST(sycl, raw_sycl_interop_context_1) { + Kokkos::Experimental::SYCL default_space; + sycl::context default_context = default_space.sycl_context(); + + sycl::default_selector device_selector; + sycl::queue queue(default_context, device_selector); + constexpr int n = 100; + int* p = sycl::malloc_device<int>(n, queue); + + Kokkos::Experimental::SYCL space(queue); + Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, n); + Kokkos::deep_copy(v, 5); + + queue.submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>(n), [=](int idx) { p[idx] += idx; }); + }); + queue.wait_and_throw(); + + std::array<int, n> h_p; + queue.memcpy(h_p.data(), p, sizeof(int) * n); + queue.wait_and_throw(); + sycl::free(p, queue); + + int64_t sum = 0; + int64_t sum_expect = 0; + for (int i = 0; i < n; i++) { + sum += h_p[i]; + sum_expect += 5 + i; + } + + ASSERT_EQ(sum, sum_expect); +} + +// Test whether regular View allocations can be accessed by non-default queues. +TEST(sycl, raw_sycl_interop_context_2) { + Kokkos::Experimental::SYCL default_space; + sycl::context default_context = default_space.sycl_context(); + + sycl::default_selector device_selector; + sycl::queue queue(default_context, device_selector); + constexpr int n = 100; + + Kokkos::Experimental::SYCL space(queue); + Kokkos::View<int*, Kokkos::Experimental::SYCLDeviceUSMSpace> v("default_view", + n); + Kokkos::deep_copy(space, v, 5); + + auto* v_ptr = v.data(); + queue.submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>(n), [=](int idx) { v_ptr[idx] += idx; }); + }); + queue.wait_and_throw(); + + std::array<int, n> h_p; + queue.memcpy(h_p.data(), v_ptr, sizeof(int) * n); + queue.wait_and_throw(); + + int64_t sum = 0; + int64_t sum_expect = 0; + for (int i = 0; i < n; i++) { + sum += h_p[i]; + sum_expect += 5 + i; + } + + ASSERT_EQ(sum, sum_expect); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp 
b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f81b7073392cc192318187e2ac31aa632f428489 --- /dev/null +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp @@ -0,0 +1,118 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestSYCL_Category.hpp> +#include <Test_InterOp_Streams.hpp> + +namespace Test { +// Test Interoperability with SYCL Streams +TEST(sycl, raw_sycl_queues) { + sycl::default_selector device_selector; + sycl::queue queue(device_selector); + Kokkos::InitArguments arguments{-1, -1, -1, false}; + Kokkos::initialize(arguments); + int* p = sycl::malloc_device<int>(100, queue); + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + { + TEST_EXECSPACE space0(queue); + Kokkos::View<int*, TEST_EXECSPACE> v(p, 100); + Kokkos::deep_copy(space0, v, 5); + int sum = 0; + + Kokkos::parallel_for("Test::sycl::raw_sycl_queue::Range", + Kokkos::RangePolicy<TEST_EXECSPACE>(space0, 0, 100), + FunctorRange<MemorySpace>(v)); + Kokkos::parallel_reduce("Test::sycl::raw_sycl_queue::RangeReduce", + Kokkos::RangePolicy<TEST_EXECSPACE>(space0, 0, 100), + FunctorRangeReduce<MemorySpace>(v), sum); + space0.fence(); + ASSERT_EQ(6 * 100, sum); + + Kokkos::parallel_for("Test::sycl::raw_sycl_queue::MDRange", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>( + space0, {0, 0}, {10, 10}), + FunctorMDRange<MemorySpace>(v)); + space0.fence(); + Kokkos::parallel_reduce( + "Test::sycl::raw_sycl_queue::MDRangeReduce", + Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>(space0, {0, 0}, + {10, 
10}), + FunctorMDRangeReduce<MemorySpace>(v), sum); + space0.fence(); + ASSERT_EQ(7 * 100, sum); + + Kokkos::parallel_for("Test::sycl::raw_sycl_queue::Team", + Kokkos::TeamPolicy<TEST_EXECSPACE>(space0, 10, 10), + FunctorTeam<MemorySpace, TEST_EXECSPACE>(v)); + space0.fence(); + Kokkos::parallel_reduce("Test::sycl::raw_sycl_queue::Team", + Kokkos::TeamPolicy<TEST_EXECSPACE>(space0, 10, 10), + FunctorTeamReduce<MemorySpace, TEST_EXECSPACE>(v), + sum); + space0.fence(); + ASSERT_EQ(8 * 100, sum); + } + Kokkos::finalize(); + + // Try to use the queue after Kokkos' copy got out-of-scope. + // This kernel corresponds to "offset_streams" in the HIP and CUDA tests. + queue.submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>(100), [=](int idx) { p[idx] += idx; }); + }); + queue.wait_and_throw(); + + int h_p[100]; + queue.memcpy(h_p, p, sizeof(int) * 100); + queue.wait_and_throw(); + int64_t sum = 0; + int64_t sum_expect = 0; + for (int i = 0; i < 100; i++) { + sum += h_p[i]; + sum_expect += 8 + i; + } + + ASSERT_EQ(sum, sum_expect); +} +} // namespace Test diff --git a/packages/kokkos/core/unit_test/testmake.sh b/packages/kokkos/core/unit_test/testmake.sh new file mode 100755 index 0000000000000000000000000000000000000000..b5d4e8874d6bbd632bb7875bb931935018671195 --- /dev/null +++ b/packages/kokkos/core/unit_test/testmake.sh @@ -0,0 +1,18 @@ +#!/bin/bash +if test "`basename $PWD`" = "cmaketest"; then + outfile=$1 +else + outfile=config/tmpstore/$1 +fi + +grep_arch=`grep KOKKOS_ARCH $outfile | grep $2 2>&1` +grep_devs=`grep KOKKOS_DEVICES $outfile | grep $3 2>&1` +if test -n "$grep_arch"; then + if test -n "$grep_devs"; then + echo Passed + else + echo Failed + fi +else + echo Failed +fi diff --git a/packages/kokkos/core/unit_test/tools/TestAllCalls.cpp b/packages/kokkos/core/unit_test/tools/TestAllCalls.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7ee8d68e30dd1de252866ff83c4aed8e07bd2ab5 --- /dev/null +++ 
b/packages/kokkos/core/unit_test/tools/TestAllCalls.cpp @@ -0,0 +1,91 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// This file calls most of the basic Kokkos primitives. When combined with a +// testing library this tests that our shared-library loading based profiling +// mechanisms work + +#include <Kokkos_Core.hpp> +#include <iostream> +#include <sstream> + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + { + // This test only uses host kernel launch mechanisms. This is to allow for + // the test to run on platforms where CUDA lambda launch isn't supported. + // This is safe because this test only seeks to test that the dlsym-based + // tool loading mechanisms work, all of which happens completely + // independently of the enabled backends + using execution_space = Kokkos::DefaultHostExecutionSpace; + using memory_space = typename execution_space::memory_space; + Kokkos::View<int*, memory_space> src_view("source", 10); + Kokkos::View<int*, memory_space> dst_view("destination", 10); + Kokkos::deep_copy(dst_view, src_view); + Kokkos::parallel_for("parallel_for", + Kokkos::RangePolicy<execution_space>(0, 1), + [=](int i) { (void)i; }); + int result; + Kokkos::parallel_reduce( + "parallel_reduce", Kokkos::RangePolicy<execution_space>(0, 1), + [=](int i, int& hold_result) { hold_result += i; }, result); + Kokkos::parallel_scan("parallel_scan", + Kokkos::RangePolicy<execution_space>(0, 1), + [=](const int i, int& hold_result, const bool final) { + if (final) { + hold_result += i; + } + }); + Kokkos::Profiling::pushRegion("push_region"); + Kokkos::Profiling::popRegion(); + uint32_t sectionId; + Kokkos::Profiling::createProfileSection("created_section", §ionId); + Kokkos::Profiling::startSection(sectionId); + Kokkos::Profiling::stopSection(sectionId); + Kokkos::Profiling::destroyProfileSection(sectionId); + Kokkos::Profiling::markEvent("profiling_event"); + Kokkos::Tools::declareMetadata("dogs", "good"); + } + Kokkos::finalize(); +} 
diff --git a/packages/kokkos/core/unit_test/tools/TestCInterface.c b/packages/kokkos/core/unit_test/tools/TestCInterface.c new file mode 100644 index 0000000000000000000000000000000000000000..66e68154e99eb81d963988e038ca1bfa8d48ad1a --- /dev/null +++ b/packages/kokkos/core/unit_test/tools/TestCInterface.c @@ -0,0 +1,2 @@ +#include <impl/Kokkos_Profiling_C_Interface.h> +int main(){} diff --git a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.cpp b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4008fd3d5b54ac2f219b6555210da2aebf2722a1 --- /dev/null +++ b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.cpp @@ -0,0 +1,49 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include "Kokkos_Core.hpp" + +#include <tools/TestLogicalSpaces.hpp> +#include "../UnitTestMainInit.cpp" diff --git a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp new file mode 100644 index 0000000000000000000000000000000000000000..29f6dd7a65e1f1e57769a3453175b484e5077a40 --- /dev/null +++ b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp @@ -0,0 +1,198 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#include <iostream> +#include <gtest/gtest.h> +#include "Kokkos_Core.hpp" + +#include <impl/Kokkos_Stacktrace.hpp> + +namespace Test { + +void debug_print(const Kokkos_Profiling_SpaceHandle hand, const char* name, + const void* ptr, const size_t size) { + std::cout << "Alloc: " << hand.name << ", [" << name << "," << ptr << "] " + << size << std::endl; +} +void debug_dealloc(const Kokkos_Profiling_SpaceHandle hand, const char* name, + const void* ptr, const size_t size) { + std::cout << "Dealloc: " << hand.name << ", [" << name << "," << ptr << "] " + << size << std::endl; +} + +void fail_on_event(const Kokkos::Profiling::SpaceHandle, const char*, + const void*, const uint64_t) { + ASSERT_TRUE(false) << "Unexpected memory event"; +} + +void expect_no_events() { + Kokkos::Tools::Experimental::set_allocate_data_callback(&fail_on_event); + Kokkos::Tools::Experimental::set_deallocate_data_callback(&fail_on_event); +} + +std::string expected_view_name; +std::string expected_space_name; +std::string error_message; +void expect_allocation_event(const std::string evn, const std::string esn, + const std::string em) { + expected_view_name = evn; + expected_space_name = esn; + error_message = em; + Kokkos::Tools::Experimental::set_allocate_data_callback( + [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, + const uint64_t) { + ASSERT_EQ(std::string(hand.name), expected_space_name) + << error_message << " (bad handle)"; + ASSERT_EQ(std::string(name), expected_view_name) + << error_message << " (bad view name)"; + expect_no_events(); + }); +} +void expect_deallocation_event(const std::string& evn, const std::string& esn, + const std::string em) { + expected_view_name = evn; + expected_space_name = esn; + error_message = em; + Kokkos::Tools::Experimental::set_deallocate_data_callback( + [](const Kokkos_Profiling_SpaceHandle hand, 
const char* name, const void*, + const uint64_t) { + ASSERT_EQ(std::string(hand.name), expected_space_name) + << error_message << " (bad handle)"; + ASSERT_EQ(std::string(name), expected_view_name) + << error_message << " (bad view name)"; + expect_no_events(); + }); +} + +struct TestSpaceNamer { + static constexpr const char* get_name() { return "TestSpace"; } +}; +struct TestSpaceNamerTwo { + static constexpr const char* get_name() { return "YoDawg"; } +}; +struct TestSpaceNamerThree { + static constexpr const char* get_name() { return "CustomAccessSpace"; } +}; +using fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< + Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, TestSpaceNamer, + Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; + +void test_view_construct() { + { + expect_allocation_event("puppy_view", "TestSpace", "View allocation"); + Kokkos::View<double*, fake_memory_space> pup_view("puppy_view", 1000); + expect_deallocation_event("puppy_view", "TestSpace", "View free"); + } + Kokkos::Tools::Experimental::pause_tools(); +} +void test_malloc_free() { + expect_allocation_event("does_malloc_work", "TestSpace", + "Error in malloc event"); + auto* temp = + Kokkos::kokkos_malloc<fake_memory_space>("does_malloc_work", 1000); + expect_deallocation_event("does_malloc_work", "TestSpace", "Error in free"); + Kokkos::kokkos_free(temp); + Kokkos::Tools::Experimental::pause_tools(); +} +void test_chained_spaces() { + using doubly_fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< + fake_memory_space, Kokkos::DefaultHostExecutionSpace, TestSpaceNamerTwo, + Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; + { + expect_allocation_event("xzibit_dot_jpeg", "YoDawg", + "Chained space view allocation"); + Kokkos::View<double*, doubly_fake_memory_space> pup_view("xzibit_dot_jpeg", + 1000); + expect_deallocation_event("xzibit_dot_jpeg", "YoDawg", + "Chained space free"); + } + Kokkos::Tools::Experimental::pause_tools(); 
+} +void test_space_allocations() { + fake_memory_space debug_space; + expect_allocation_event("allocation_from_space", "TestSpace", + "Space allocation"); + auto* temp = debug_space.allocate("allocation_from_space", 1000); + expect_deallocation_event("allocation_from_space", "TestSpace", + "Space deallocation"); + debug_space.deallocate("allocation_from_space", temp, 1000); + Kokkos::Tools::Experimental::pause_tools(); +} +template <typename Space> +struct AccessCheckKernel { + Kokkos::View<double*, Space> data; + KOKKOS_FUNCTION void operator()(const int i) const { data[i] = i; } +}; + +template <typename Space> +void test_allowed_access() { + constexpr const int data_size = 1000; + Kokkos::View<double*, Space> test_view("test_view", data_size); + AccessCheckKernel<Space> functor{test_view}; + Kokkos::parallel_for( + "access_allowed", + Kokkos::RangePolicy<typename Space::execution_space>(0, data_size), + functor); +} + +using semantically_independent_logical_space = + Kokkos::Experimental::LogicalMemorySpace< + Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, + TestSpaceNamerThree, + Kokkos::Experimental::LogicalSpaceSharesAccess::no_shared_access>; + +TEST(defaultdevicetype, logical_space_views) { test_view_construct(); } +TEST(defaultdevicetype, logical_space_malloc) { test_malloc_free(); } +TEST(defaultdevicetype, logical_space_alloc) { test_space_allocations(); } +TEST(defaultdevicetype, chained_logical_spaces) { test_chained_spaces(); } +TEST(defaultdevicetype, access_allowed) { + test_allowed_access<fake_memory_space>(); +} +TEST(defaultdevicetype_DeathTest, access_forbidden) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH( + { test_allowed_access<semantically_independent_logical_space>(); }, + "Kokkos::View ERROR: attempt to access inaccessible memory space"); +} + +} // namespace Test diff --git a/packages/kokkos/core/unit_test/tools/TestTuning.cpp b/packages/kokkos/core/unit_test/tools/TestTuning.cpp new file mode 100644 
index 0000000000000000000000000000000000000000..6bc787023efa95a7208b03632279dc0b8beeeb8b --- /dev/null +++ b/packages/kokkos/core/unit_test/tools/TestTuning.cpp @@ -0,0 +1,194 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// This file tests the primitives of the Tuning system + +#include <iostream> +#include <Kokkos_Core.hpp> +#include <stdexcept> +#include <string> +#include <unordered_map> +#include <vector> + +static size_t expectedNumberOfContextVariables; +static int64_t expectedContextVariableValue; + +int main() { + Kokkos::initialize(); + { + auto context = Kokkos::Tools::Experimental::get_new_context_id(); + + Kokkos::Tools::Experimental::VariableInfo contextVariableInfo; + + contextVariableInfo.category = Kokkos::Tools::Experimental:: + StatisticalCategory::kokkos_value_categorical; + contextVariableInfo.type = + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64; + contextVariableInfo.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_unbounded; + + Kokkos::Tools::Experimental::VariableInfo tuningVariableInfo; + + tuningVariableInfo.category = Kokkos::Tools::Experimental:: + StatisticalCategory::kokkos_value_categorical; + tuningVariableInfo.type = + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64; + tuningVariableInfo.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_set; + + std::vector<int64_t> candidate_value_vector = {0, 1, 2, 3, 4, + 5, 6, 7, 8, 9}; + + Kokkos::Tools::Experimental::SetOrRange allowed_values = + Kokkos::Tools::Experimental::make_candidate_set( + candidate_value_vector.size(), candidate_value_vector.data()); + // test that ID's are transmitted to the tool + Kokkos::Tools::Experimental::set_declare_output_type_callback( + [](const char*, const size_t, + Kokkos::Tools::Experimental::VariableInfo* info) { + if (info->type != + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64) { + throw(std::runtime_error("Tuning Variable has wrong type")); + } + }); + 
Kokkos::Tools::Experimental::set_declare_input_type_callback( + [](const char*, const size_t, + Kokkos::Tools::Experimental::VariableInfo* info) { + if (info->type != + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64) { + throw(std::runtime_error("Context Variable has wrong type")); + } + }); + tuningVariableInfo.candidates = allowed_values; + auto contextVariableId = Kokkos::Tools::Experimental::declare_input_type( + "kokkos.testing.context_variable", contextVariableInfo); + auto tuningVariableId = Kokkos::Tools::Experimental::declare_output_type( + "kokkos.testing.tuning_variable", tuningVariableInfo); + + // test that we correctly pass context values, and receive tuning variables + // back in return + Kokkos::Tools::Experimental::VariableValue contextValues[] = { + Kokkos::Tools::Experimental::make_variable_value(contextVariableId, + int64_t(0))}; + Kokkos::Tools::Experimental::set_input_values(context, 1, contextValues); + + Kokkos::Tools::Experimental::set_request_output_values_callback( + [](const size_t, const size_t, + const Kokkos::Tools::Experimental::VariableValue* context_values, + const size_t, + Kokkos::Tools::Experimental::VariableValue* tuning_values) { + auto candidate_values = tuning_values[0].metadata->candidates; + if (context_values[0].value.int_value != + expectedContextVariableValue) { + throw std::runtime_error( + "Context variables not correctly passed to tuning callbacks"); + } + int tuningVariableSetSize = candidate_values.set.size; + std::cout << "Set of size " << tuningVariableSetSize << std::endl; + // tuning methodology via https://xkcd.com/221/ + tuning_values[0].value.int_value = + candidate_values.set.values.int_value[4 % tuningVariableSetSize]; + }); + + Kokkos::Tools::Experimental::VariableValue tuningValues[] = { + Kokkos::Tools::Experimental::make_variable_value(tuningVariableId, + int64_t(0))}; + + Kokkos::Tools::Experimental::request_output_values(context, 1, + tuningValues); + std::cout << 
tuningValues[0].value.int_value << "," + << candidate_value_vector[4] << std::endl; + if (tuningValues[0].value.int_value != candidate_value_vector[4]) { + throw std::runtime_error("Tuning value return is incorrect"); + } + + Kokkos::Tools::Experimental::end_context(context); + + // test nested contexts + auto outerContext = Kokkos::Tools::Experimental::get_new_context_id(); + auto innerContext = Kokkos::Tools::Experimental::get_new_context_id(); + + Kokkos::Tools::Experimental::VariableInfo secondContextVariableInfo; + + secondContextVariableInfo.category = Kokkos::Tools::Experimental:: + StatisticalCategory::kokkos_value_categorical; + secondContextVariableInfo.type = + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64; + secondContextVariableInfo.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_unbounded; + auto secondContextVariableId = + Kokkos::Tools::Experimental::declare_output_type( + "kokkos.testing.second_context_variable", + secondContextVariableInfo); + + Kokkos::Tools::Experimental::VariableValue contextValueTwo[] = { + Kokkos::Tools::Experimental::make_variable_value( + secondContextVariableId, int64_t(1))}; + + Kokkos::Tools::Experimental::set_request_output_values_callback( + [](const size_t, const size_t num_context_variables, + const Kokkos::Tools::Experimental::VariableValue*, const size_t, + Kokkos::Tools::Experimental::VariableValue*) { + std::cout << "Expect " << expectedNumberOfContextVariables + << ", have " << num_context_variables << std::endl; + if (num_context_variables != expectedNumberOfContextVariables) { + throw( + std::runtime_error("Incorrect number of context variables in " + "nested tuning contexts")); + } + }); + Kokkos::Tools::Experimental::set_input_values(outerContext, 1, + contextValues); + expectedNumberOfContextVariables = 1; + Kokkos::Tools::Experimental::request_output_values(outerContext, 1, + tuningValues); + Kokkos::Tools::Experimental::set_input_values(innerContext, 1, + 
contextValueTwo); + expectedNumberOfContextVariables = 2; + Kokkos::Tools::Experimental::request_output_values(innerContext, 1, + tuningValues); + } // end Kokkos block + + Kokkos::finalize(); +} diff --git a/packages/kokkos/core/unit_test/tools/printing-tool.cpp b/packages/kokkos/core/unit_test/tools/printing-tool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..76b7837d0365306201c83eb8e2ae92523d3a6670 --- /dev/null +++ b/packages/kokkos/core/unit_test/tools/printing-tool.cpp @@ -0,0 +1,140 @@ + +#include <inttypes.h> +#include <iostream> + +struct Kokkos_Profiling_KokkosPDeviceInfo; + +// just get the basename for print_help/parse_args +std::string get_basename(char* cmd, int idx = 0) { + if (idx > 0) return cmd; + std::string _cmd = cmd; + auto _pos = _cmd.find_last_of('/'); + if (_pos != std::string::npos) return _cmd.substr(_pos + 1); + return _cmd; +} + +struct SpaceHandle { + char name[64]; +}; + +const int parallel_for_id = 0; +const int parallel_reduce_id = 1; +const int parallel_scan_id = 2; + +extern "C" void kokkosp_init_library( + const int /*loadSeq*/, const uint64_t /*interfaceVer*/, + const uint32_t /*devInfoCount*/, + Kokkos_Profiling_KokkosPDeviceInfo* /* deviceInfo */) { + std::cout << "kokkosp_init_library::"; +} + +extern "C" void kokkosp_finalize_library() { + std::cout << "kokkosp_finalize_library::"; +} + +extern "C" void kokkosp_print_help(char* exe) { + std::cout << "kokkosp_print_help:" << get_basename(exe) << "::"; +} + +extern "C" void kokkosp_parse_args(int argc, char** argv) { + std::cout << "kokkosp_parse_args:" << argc; + for (int i = 0; i < argc; ++i) std::cout << ":" << get_basename(argv[i], i); + std::cout << "::"; +} + +extern "C" void kokkosp_begin_parallel_for(const char* name, + const uint32_t devID, + uint64_t* kID) { + *kID = parallel_for_id; + std::cout << "kokkosp_begin_parallel_for:" << name << ":" << devID << ":" + << *kID << "::"; +} + +extern "C" void kokkosp_end_parallel_for(const uint64_t 
kID) { + std::cout << "kokkosp_end_parallel_for:" << kID << "::"; +} + +extern "C" void kokkosp_begin_parallel_scan(const char* name, + const uint32_t devID, + uint64_t* kID) { + *kID = parallel_scan_id; + std::cout << "kokkosp_begin_parallel_scan:" << name << ":" << devID << ":" + << *kID << "::"; +} + +extern "C" void kokkosp_end_parallel_scan(const uint64_t kID) { + std::cout << "kokkosp_end_parallel_scan:" << kID << "::"; +} + +extern "C" void kokkosp_begin_parallel_reduce(const char* name, + const uint32_t devID, + uint64_t* kID) { + *kID = parallel_reduce_id; + std::cout << "kokkosp_begin_parallel_reduce:" << name << ":" << devID << ":" + << *kID << "::"; +} + +extern "C" void kokkosp_end_parallel_reduce(const uint64_t kID) { + std::cout << "kokkosp_end_parallel_reduce:" << kID << "::"; +} + +extern "C" void kokkosp_push_profile_region(char* regionName) { + std::cout << "kokkosp_push_profile_region:" << regionName << "::"; +} + +extern "C" void kokkosp_pop_profile_region() { + std::cout << "kokkosp_pop_profile_region::"; +} + +extern "C" void kokkosp_allocate_data(SpaceHandle handle, const char* name, + void* ptr, uint64_t size) { + std::cout << "kokkosp_allocate_data:" << handle.name << ":" << name << ":" + << ptr << ":" << size << "::"; +} + +extern "C" void kokkosp_deallocate_data(SpaceHandle handle, const char* name, + void* ptr, uint64_t size) { + std::cout << "kokkosp_deallocate_data:" << handle.name << ":" << name << ":" + << ptr << ":" << size << "::"; +} + +extern "C" void kokkosp_begin_deep_copy(SpaceHandle dst_handle, + const char* dst_name, + const void* dst_ptr, + SpaceHandle src_handle, + const char* src_name, + const void* src_ptr, uint64_t size) { + std::cout << "kokkosp_begin_deep_copy:" << dst_handle.name << ":" << dst_name + << ":" << dst_ptr << ":" << src_handle.name << ":" << src_name + << ":" << src_ptr << ":" << size << "::"; +} + +extern "C" void kokkosp_end_deep_copy() { + std::cout << "kokkosp_end_deep_copy::"; +} + +uint32_t 
section_id = 3; +extern "C" void kokkosp_create_profile_section(const char* name, + uint32_t* sec_id) { + *sec_id = section_id; + std::cout << "kokkosp_create_profile_section:" << name << ":" << *sec_id + << "::"; +} + +extern "C" void kokkosp_start_profile_section(uint32_t sec_id) { + std::cout << "kokkosp_start_profile_section:" << sec_id << "::"; +} + +extern "C" void kokkosp_stop_profile_section(uint32_t sec_id) { + std::cout << "kokkosp_stop_profile_section:" << sec_id << "::"; +} +extern "C" void kokkosp_destroy_profile_section(uint32_t sec_id) { + std::cout << "kokkosp_destroy_profile_section:" << sec_id << "::"; +} + +extern "C" void kokkosp_profile_event(const char* name) { + std::cout << "kokkosp_profile_event:" << name << "::"; +} +extern "C" void kokkosp_declare_metadata(const char* key, const char* value) { + std::cout << "kokkosp_declare_metadata:" << key << ":" << value << "::"; +} diff --git a/packages/kokkos/doc/Doxyfile b/packages/kokkos/doc/Doxyfile new file mode 100644 index 0000000000000000000000000000000000000000..bc5c7486b27fc55ede35359b969af0a8008f960b --- /dev/null +++ b/packages/kokkos/doc/Doxyfile @@ -0,0 +1,127 @@ +# +# Include the global look and feel options +# +@INCLUDE = ../../common/Doxyfile +# +# Package options +# +PROJECT_NAME = "Kokkos Core Kernels Package" +PROJECT_NUMBER = "Version of the Day" +OUTPUT_DIRECTORY = . 
+OUTPUT_LANGUAGE = English + +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_STATIC = YES +HIDE_UNDOC_MEMBERS = YES +HIDE_UNDOC_CLASSES = YES +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ALWAYS_DETAILED_SEC = YES +FULL_PATH_NAMES = NO +STRIP_FROM_PATH = +INTERNAL_DOCS = NO +CLASS_DIAGRAMS = YES +SOURCE_BROWSER = YES +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = NO +REFERENCES_RELATION = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +VERBATIM_HEADERS = YES +SHOW_INCLUDE_FILES = YES +#JAVADOC_AUTOBRIEF = YES +INHERIT_DOCS = YES +INLINE_INHERITED_MEMB = YES +INLINE_INFO = YES +SORT_MEMBER_DOCS = NO +TAB_SIZE = 2 +ENABLED_SECTIONS = +SORT_BRIEF_DOCS = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +QUIET = NO +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_FORMAT = "$file:$line: $text" + +# +# INPUT: Where to find files that Doxygen should process. ../classic +# has a doc/ subdirectory with its own Doxyfile that points to its own +# files. The other Kokkos subpackages don't currently have their own +# Doxyfile files, so we have to do it manually here. +# +# mfh 26 Sep 2013: I've only added those directories in the Core +# subpackage that constitute the "public interface" of that +# subpackage. Please feel free to include additional subdirectories +# of ../core if you want to generate their documentation as well. +# +# mfh 26 Sep 2013: I've only added the Kokkos subpackages here that I +# think are ready for Doxygen documentation generation. Please feel +# free to amend this list as you see fit. 
+# + +INPUT = index.doc ../classic ../core/src ../containers/src ../linalg/src +FILE_PATTERNS = *.hpp *.cpp *.cuh *.cu +RECURSIVE = NO +EXCLUDE_PATTERNS = *.x *.o *.out +EXAMPLE_PATH = +EXAMPLE_RECURSIVE = YES +EXAMPLE_PATTERNS = *.cpp *.hpp +IMAGE_PATH = +INPUT_FILTER = +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 4 +IGNORE_PREFIX = +# +# What diagrams are created +# +CLASS_GRAPH = YES +COLLABORATION_GRAPH = NO +INCLUDE_GRAPH = NO +INCLUDED_BY_GRAPH = NO +GRAPHICAL_HIERARCHY = YES +# +# Preprocessing +# +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = YES +EXPAND_ONLY_PREDEF = YES +SEARCH_INCLUDES = YES +INCLUDE_FILE_PATTERNS = +PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS DOXYGEN_USE_ONLY +INCLUDE_PATH = ../src +EXPAND_AS_DEFINED = +# +# Links to other packages +# +TAGFILES = ../../common/tag_files/teuchos.tag=../../../teuchos/doc/html ../../common/tag_files/epetra.tag=../../../epetra/doc/html \ + ../../common/tag_files/belos.tag=../../../belos/doc/html ../../common/tag_files/anasazi.tag=../../../anasazi/doc/html \ + ../../common/tag_files/kokkos.tag=../../../kokkos/doc/html +GENERATE_TAGFILE = ../../common/tag_files/tpetra.tag +ALLEXTERNALS = NO +EXTERNAL_GROUPS = NO +# +# Environment +# +PERL_PATH = /usr/bin/perl +HAVE_DOT = YES +DOT_PATH = +MAX_DOT_GRAPH_WIDTH = 1024 +MAX_DOT_GRAPH_HEIGHT = 1024 +# +# What kind of documentation is generated +# +#GENERATE_HTML = YES +#HTML_OUTPUT = html +#HTML_HEADER = includes/header.html +#HTML_FOOTER = includes/footer.html +#HTML_STYLESHEET = includes/stylesheet.css +#HTML_ALIGN_MEMBERS = YES +GENERATE_HTMLHELP = NO +DISABLE_INDEX = NO +GENERATE_LATEX = NO +GENERATE_RTF = NO +GENERATE_MAN = NO +GENERATE_XML = NO diff --git a/packages/kokkos/doc/Kokkos-Programming-Guide.md b/packages/kokkos/doc/Kokkos-Programming-Guide.md new file mode 100644 index 0000000000000000000000000000000000000000..3992dd8130eff3d961da94b5fb42e2df3ed03658 --- /dev/null +++ b/packages/kokkos/doc/Kokkos-Programming-Guide.md @@ -0,0 +1 @@ +[Programming 
Guide](https://github.com/kokkos/kokkos/wiki) diff --git a/packages/kokkos/doc/SAND2017-10464-Kokkos-Task-DAG.pdf b/packages/kokkos/doc/SAND2017-10464-Kokkos-Task-DAG.pdf new file mode 100644 index 0000000000000000000000000000000000000000..571ebff401044a4f4c2d32c2b948280e25f745b5 Binary files /dev/null and b/packages/kokkos/doc/SAND2017-10464-Kokkos-Task-DAG.pdf differ diff --git a/packages/kokkos/doc/TuningDesign.md b/packages/kokkos/doc/TuningDesign.md new file mode 100644 index 0000000000000000000000000000000000000000..a058c15bc10b8577a7eadb2f0f3a94148b4fd61c --- /dev/null +++ b/packages/kokkos/doc/TuningDesign.md @@ -0,0 +1,221 @@ +#Kokkos Tuning + +This is a design document describing the motivation, ideas, design, and prototype implementation of the Kokkos Tuning System + +## Motivation + +Currently, Kokkos makes a lot of decisions about tuning parameters (CUDA block sizes, different kernel implementations) +by picking an option that results in the best performance for the widest array of applications and architectures at the +time the choice is made. This approach leaves performance on the table, and appears increasingly untenable as the number +of architectures and applications grows, and as software versions change. + +The Kokkos team would like to instead open up the ability to set the parameters as part of the tooling system so that +these parameters can be tuned for individual applications across all the architectures they might run on. In order to match the +feel of past Kokkos tooling efforts, we'd like to achieve this with a callback system. + +## Ideas + +A Kokkos Tuning system should be as small as is wise while achieving the following goals + +1. Expose to tools enough data about the _context_ of the running application to tune intelligently. In autotuning terms, describe the _features_ +2. Expose to tools enough data about tuning parameters that they might know how to optimize what they're asked to +3.
Expose to applications an interface that they might inform a tool about their current application context +4. Expose to tools the results of their choices +5. No perturbation of Kokkos Core when this system is disabled + +Shared among the first three of these goals is a need for some way to describe the semantics of variables (tuning parameters, context variables) +internal to Kokkos or an application to an outside tool. + +### Semantics of Variables + +I think it's best to talk about the semantics of variables with concrete examples. + +Suppose Kokkos wants a tool to choose a block size for it. Suppose all the application context is perfectly understood, that the tool knows +that the application has 10,000,000 particles active and that it's running a kernel called "make_particles_go," which is a parallel_for in +the "cuda" execution space. Even with this knowledge, the tool needs to know several things about what a block size _is_ for this to be generic and practical + +1. Is it an integer value? A float? A string? (Type) +2. Relatedly, what are the mathematical semantics which are valid for it? Is it something +for which a list can be sorted? Do the distances between items in a sorted list make sense? +If I divide two values, does the ratio have some meaning? (semantics) +3. What are the valid choices for this value? Is a block size of -128 okay? How about 7? (candidates) + +Semantics (as always) are likely the source of the most confusion here, so a bit of detail is good. Here I'm leaning heavily on the field +of statistics to enable tools to do intelligent searching. If ordering doesn't make sense, if a value is "categorical", the only thing +a tool can do is try all possible values for a tuning value. If they're ordered (ordinal), the search can take advantage of this by +using the concept of a directional search. If the distances between elements matter (interval data) you can cheat with things like +bisection. 
Finally if ratios matter you can play games where you increase by a factor of 10 in your searches. Note that one good point in favor of this design is that it matches up nicely with scikit-opt (a happy accident). + +In describing the candidate values in (3), users have two options: sets or ranges. A set has a number of entries of the given type, a range has lower and upper bounds and a step size. + +Claim: the combination of context, candidates, semantics, and types gives a tool enough to intelligently explore the search space of +tuning parameters + +### Context + +Suppose a tool perfectly understands what a block size is. To effectively tune one, it needs to know something about the application. + +In a trivial case, the tool knows absolutely nothing other than candidate values for the block size, and tries to make a choice that optimizes across all +invocations of kernels. This isn't _that_ far from what Kokkos does now, so it's not unreasonable for this to produce decent results. +That said, we could quickly add some context from Kokkos, stuff like the name and type of the kernel, the execution space, all with the semantic information described above. That way a tuning tool could differentiate based on all the information available to Kokkos. Going a little further, we could expose this ability to provide context to our applications. What if the tools wasn't just tuning to the fact that the kernel name was "GEMM", but that "matrix_size" was a million? Or that "live_particles" had a +certain value? The more (relevant) context we provide to a tool, the better it will be able to tune. + + +### Intended Tool Workflow + +Okay, so a tool knows what it's tuning, and it knows the context of the application well enough to do clever ML things, all of this with happy semantic information so that everything make . What should a workflow look like? 
A tool should + +1) Listen to declarations about the semantics of context and tuning variables +2) Make tuning decisions +3) Measure their feedback +4) Get better at (2) + +The easier we make this loop, the better + +## Design + +The design of this system is intended to reflect the above ideas with the minimal necessary additions to make the mechanics work. This section is almost entirely describing the small holes in the above descriptions. Variable declaration works exactly as described above, except that we associate types and associated values using a type_id with each type at declaration time. + +Any time a value of a variable is declared (context) or requested (tuning), it is also associated with a context ID that says how long that declaration is valid for. So if a user sees + +```c++ +startContext(contextId(0)) +declare_value("is_safe_to_push_button",true,contextId(0)); +foo(); +endContext(contextId(0)); +bar(); +``` + +They should know in `bar` that it is no longer safe to push the button. Similarly, if tools have provided tuning values to contextId(0), when contextId(0) ends, that is when the tool takes measurements related to those tuning values and learns things. *For most tools, when they see a call to startContext associated with a contextId, they'll do a starting measurement, and at endContext they'll stop that measurement*. + +One ugly bit of semantic complexity is in variables with complicated sets of candidates. Taking the example of GPU block size, for different kernels an application might have different sets of valid block sizes. This means that while "block size" might make sense as a type, there could be different types, "block_sizes_up_to_1024," "block_sizes_up_to_2048," that cover the concept of block size. In our experience every solution to this problem is ugly, our alternate answers were much uglier. + +## Implementation + +This section describes the implementation. + +If you're writing a tool, you care about tool implementation.
+ +If you want tools to know about information from your application, you care about application implementation + +If you're a Kokkos developer, you care about the application implementation and Kokkos implementation + +### Tool implementation + +In the past, tools have responded to the [profiling hooks in Kokkos](https://github.com/kokkos/kokkos-tools/wiki/Profiling-Hooks). This effort adds to that, there are now a few more functions (note that I'm using the C names for types. In general you can replace Kokkos_Tools_ with Kokkos::Tools:: in C++ tools) + + +```c++ +void kokkosp_declare_output_type(const char* name, const size_t id, Kokkos_Tools_VariableInfo& info); +``` + +Declares a tuning variable named `name` with uniqueId `id` and all the semantic information stored in `info`. Note that the VariableInfo struct has a `void*` field called `toolProvidedInfo`. If you fill this in, every time you get a value of that type you'll also get back that same pointer. + +```c++ +void kokkosp_declare_input_type(const char*, const size_t, Kokkos_Tools_VariableInfo& info); +``` + +This is almost exactly like declaring a tuning variable. The only difference is that in cases where the candidate values aren't known, `info.valueQuantity` will be set to `kokkos_value_unbounded`. This is fairly common, Kokkos can tell you that `kernel_name` is a string, but we can't tell you what strings a user might provide. + +```c++ +void kokkosp_request_values( + const size_t contextId, + const size_t numContextVariables, const Kokkos_Tools_VariableValue* contextVariableValues, + const size_t numTuningVariables, Kokkos_Tools_VariableValue* tuningVariableValues); +``` + +Here Kokkos is requesting the values of tuning variables, and most of the meat is here. The contextId tells us the scope across which these variables were used. + +The next two arguments describe the context you're tuning in. You have the number of context variables, and an array of that size containing their values. 
Note that the Kokkos_Tuning_VariableValue has a field called `metadata` containing all the info (type, semantics, and critically, candidates) about that variable. + +The two arguments following those describe the Tuning Variables. First the number of them, then an array of that size which you can overwrite. *Overwriting those values is how you give values back to the application* + +Critically, as tuningVariableValues comes preloaded with default values, if your function body is `return;` you will not crash Kokkos, only make us use our defaults. If you don't know, you are allowed to punt and let Kokkos do what it would. + +```c++ +void kokkosp_begin_context(size_t contextId); +``` + +This starts the context pointed at by contextId. If tools use measurements to drive tuning, this is where they'll do their starting measurement. + +```c++ +void kokkosp_end_context(const size_t contextId); +``` + +This simply says that the contextId in the argument is now over. If you provided tuning values associated with that context, those values can now be associated with a result. + +### App Implementation + +For 99% of applications, all you need to do to interact with Kokkos Tuning Tools in your code is nothing. The only exceptions are if you want the tuning to be aware of what's happening in your application (number of particles active, whether different physics are active) if +you think that might change what the Tuning decides. If you're feeling especially brave, you can also use the Tuning interface to tune parameters within your own application. For making people aware of your application context, you need to know about a few functions + + +```c++ +size_t Kokkos::Tools::Experimental::declare_input_type(const std::string& variableName + VariableInfo info, + ); +``` + +This function tells a tool that you have some variable they should know about when tuning. The info describes the semantics of your variable. 
This is discussed in great detail under "Semantics of Variables", but you need to say whether the values will be text, int, or float, whether they're categorical, ordinal,interval, or ratio data, and whether the candidate values are "unbounded" (if you don't know the full set of values), a set, or a range. This returns a `size_t` that you should store, it's how you'll later identify what values you're providing or requesting from the tool. Note that this call doesn't actually tell the tools about values, it simply tells the tool about the nature of values you'll provide later. + + +```c++ +size_t Kokkos::Tools::Experimental::get_new_context_id(); +size_t Kokkos::Tools::Experimental::get_current_context_id(); +``` + + In this interface, + you will associate values with + "contexts" in order to decide when a given declaration of a value has gone + out of scope.The first gets you a new context + ID if you 're starting some new set of values. If you need to recover the last context ID so you can append to that context, rather than overwriting it with a new one, you can use `get_current_context_id()`. You' ll + use that context id to start a context in the function + +```c++ void Kokkos::Tools::Experimental::begin_context(size_t context_id); +``` + +This tells the tool that you're beginning a region in which you'll be setting and requesting values. If the tool optimizes for time, you're telling them to start their timer. + + +```c++ +void Kokkos::Tools::Experimental::set_input_values(size_t contextId, size_t count, + VariableValue* values); +``` + +Here you tell tools the values for your context variables. The contextId is used to later tell when this has gone out of scope, the count is how many variables you're declaring, and the values should come from calling `Kokkos::Tools::Experimental::make_variable_value` with the appropriate variable ID and value. 
+ +```c++ +void Kokkos::Tools::Experimental::end_context(size_t contextId); +``` + + This tells the tool that values from this context are no longer valid, + and that the tool should stop their timers. + + For those who want to declare and request tuning variables, + you only need two more functions. + +```c++ void Kokkos::Tools::Experimental::declare_output_type( + const std::string&variableName VariableInfo info); +``` + + This is exactly like declareContextVariable.The only difference is that + the + ID's this returns should be passed to request_output_values, and that the `candidates` field in the info _must_ list valid values for the tool to provide. + +```c++ void Kokkos::Tools::Experimental::request_output_values( + size_t contextId, size_t count, VariableValue* values, ); +``` + +Here is where you request that the tool give you a set of values. You need a contextId so that the tool can know when you're done using the value and measure results. The count tells the tool how many variables it's providing values for. Values is an array of your default values for that parameter, it must not crash your program if unchanged. + +### Kokkos implementation + +In the past, Kokkos and Kokkos-tools didn't share source code. Except for a "SpaceHandle" struct which users manually copied to their tools, nothing from Kokkos hit the tools repo, the interface consisted entirely of basic C types. If you read the ideas section, it translates to a lot of structs and enums. Despite my best efforts to minimize them, I think we now need to share some header files with kokkos-tools. 
Andrew Gaspar did really excellent work making this practical, we have + +1) Kokkos_Profiling_C_Interface.h , which is (shockingly) a C interface that everything in Kokkos tools boils down to +2) Kokkos_Profiling_Interface.hpp, nice C++ wrappers around the C so that the C idioms don't hit Kokkos +3) Kokkos_Profiling.[cpp/hpp], which contain things Kokkos needs to implement tooling, but the tools don't need to know about + +All of our function pointer initialization and all that mess now go into Kokkos_Profiling.[cpp/hpp], all the types are in the Interface files. The interface files will be shared with kokkos/kokkos-tools. + +In terms of build changes, we now install the above .h file, and have a KOKKOS_ENABLE_TUNING option diff --git a/packages/kokkos/doc/build_docs b/packages/kokkos/doc/build_docs new file mode 100755 index 0000000000000000000000000000000000000000..da1d3e4f6e061804b1fb2fe21b356b691494df5d --- /dev/null +++ b/packages/kokkos/doc/build_docs @@ -0,0 +1,15 @@ +#!/bin/sh + +if [ $TRILINOS_HOME ]; then + echo "TRILINOS_HOME has already been set!" +else + echo "TRILINOS_HOME has not been set. Setting it!" + export TRILINOS_HOME=`pwd`/../../.. +fi + +echo +echo "Generating main Kokkos doxygen documentation ..." +echo + +doxygen Doxyfile + diff --git a/packages/kokkos/doc/design_notes_space_instances.md b/packages/kokkos/doc/design_notes_space_instances.md new file mode 100644 index 0000000000000000000000000000000000000000..ce3d242998b87180d5169750cd09915c08bd887d --- /dev/null +++ b/packages/kokkos/doc/design_notes_space_instances.md @@ -0,0 +1,131 @@ +# Design Notes for Execution and Memory Space Instances + +## Objective + + * Enable Kokkos interoperability with coarse-grain tasking models + +## Requirements + + * Backwards compatible with existing Kokkos API + * Support existing Host execution spaces (Serial, Threads, OpenMP) + * Support DARMA threading model (may require a new Host execution space) + * Support Uintah threading model, i.e. 
indepentant worker threadpools working of of shared task queues + + +## Execution Space + + * Parallel work is *dispatched* on an execution space instance + + * Execution space instances are conceptually disjoint/independent from each other + + +## Host Execution Space Instances + + * A host-side *control* thread dispatches work to an instance + + * `main` is the initial control thread + + * A host execution space instance is an organized thread pool + + * All instances are disjoint, i.e. hardware resources are not shared between instances + + * Exactly one control thread is associated with + an instance and only that control thread may + dispatch work to to that instance + + * The control thread is a member of the instance + + * The pool of threads associated with an instances is not mutatable during that instance existence + + * The pool of threads associated with an instance may be masked + + - Allows work to be dispatched to a subset of the pool + + - Example: only one hyperthread per core of the instance + + - A mask can be applied during the policy creation of a parallel algorithm + + - Masking is portable by defining it as ceiling of fraction between [0.0, 1.0] + of the available resources + +``` +class ExecutionSpace { +public: + using execution_space = ExecutionSpace; + using memory_space = ...; + using device_type = Kokkos::Device<execution_space, memory_space>; + using array_layout = ...; + using size_type = ...; + using scratch_memory_space = ...; + + + class Instance + { + int thread_pool_size( int depth = 0 ); + ... 
+ }; + + class InstanceRequest + { + public: + using Control = std::function< void( Instance * )>; + + InstanceRequest( Control control + , unsigned thread_count + , unsigned use_numa_count = 0 + , unsigned use_cores_per_numa = 0 + ); + + }; + + static bool in_parallel(); + + static bool sleep(); + static bool wake(); + + static void fence(); + + static void print_configuration( std::ostream &, const bool detailed = false ); + + static void initialize( unsigned thread_count = 0 + , unsigned use_numa_count = 0 + , unsigned use_cores_per_numa = 0 + ); + + // Partition the current instance into the requested instances + // and run the given functions on the cooresponding instances + // will block until all the partitioned instances complete and + // the original instance will be restored + // + // Requires that the space has already been initialized + // Requires that the request can be statisfied by the current instance + // i.e. the sum of number of requested threads must be less than the + // max_hardware_threads + // + // Each control functor will accept a handle to its new default instance + // Each instance must be independent of all other instances + // i.e. 
no assumption on scheduling between instances + // The user is responible for checking the return code for errors + static int run_instances( std::vector< InstanceRequest> const& requests ); + + static void finalize(); + + static int is_initialized(); + + static int concurrency(); + + static int thread_pool_size( int depth = 0 ); + + static int thread_pool_rank(); + + static int max_hardware_threads(); + + static int hardware_thread_id(); + + }; + +``` + + + + diff --git a/packages/kokkos/doc/develop_builds.md b/packages/kokkos/doc/develop_builds.md new file mode 100644 index 0000000000000000000000000000000000000000..3fe4e6f6721c9985bec6d8dce648c516295846ad --- /dev/null +++ b/packages/kokkos/doc/develop_builds.md @@ -0,0 +1,56 @@ + +# Places to build options: architecture, device, advanced options, cuda options + +These are the files that need to be updated when a new architecture or device is +added: + + + generate_makefile.bash + * Interface for makefile system + + cmake/kokkos_options.cmake + * Interface for cmake system + + Makefile.kokkos + * Main logic for build (make and cmake) and defines (KokkosCore_config.h) + +In general, an architecture is going to be from on of these platforms: + + AMD + + ARM + + IBM + + Intel + + Intel Xeon Phi + + NVIDIA +Although not strictly necessary, it is helpful to keep things organized by +grouping by platform. + +### generate_makefile.sh + +The bash code does not do any error checking on the `--arch=` or `--device=` +arguments thus strictly speaking you do not *need* to do anything to add a +device or architecture; however, you should add it to the help menu. For the +archictectures, please group by one of the platforms listed above. + + +### cmake/kokkos_options.cmake and cmake/kokkos_settings.cmake + +The options for the CMake build system are: `-DKOKKOS_HOST_ARCH:STRING=` and +`-DKOKKOS_ENABLE_<device>:BOOL=`. Although any string can be passed into +KOKKOS_HOST_ARCH option, it is checked against an accepted list. 
Likewise, the +KOKKOS_ENABLE_<device> must have the option added AND it is formed using the +list. Thus: + + A new architecture should be added to the KOKKOS_HOST_ARCH_LIST variable. + + A new device should be added to the KOKKOS_DEVICES_LIST variable **AND** a + KOKKOS_ENABLE_<newdevice> option specified (see KOKKOS_ENABLE_CUDA for + example). + + A new device should be added to the KOKKOS_DEVICES_LIST variable **AND** a + +The translation from option to the `KOKKOS_SETTINGS` is done in +`kokkos_settings.cmake`. This translation is automated for some types if you add +to the list, but for others, it may need to be hand coded. + + +### Makefile.kokkos + +This is the main coding used by both the make and cmake system for defining +the sources (generated makefile and cmake snippets by `core/src/Makefile`), for +setting the defines in KokkosCore_config.h, and defining various internal +variables. To understand how to add to this file, you should work closely with +the Kokkos development team.
diff --git a/packages/kokkos/doc/hardware_identification/query_cuda_arch.cpp b/packages/kokkos/doc/hardware_identification/query_cuda_arch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..879b3ca1e5c4b6a01539ee5997cdb86a68077388 --- /dev/null +++ b/packages/kokkos/doc/hardware_identification/query_cuda_arch.cpp @@ -0,0 +1,22 @@ +#include <cstdio> +#include <cuda_runtime_api.h> +int main() { + cudaDeviceProp prop; + const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0); + if (cudaSuccess != err_code) { + fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", + cudaGetErrorString(err_code)); + return -1; + } + switch (prop.major) { + case 3: printf("Kepler"); break; + case 5: printf("Maxwell"); break; + case 6: printf("Pascal"); break; + default: + fprintf(stderr, "Unsupported Device %d%d\n", (int)prop.major, + (int)prop.minor); + return -1; + } + printf("%d%d\n", (int)prop.major, (int)prop.minor); + return 0; +} diff --git a/packages/kokkos/doc/index.doc b/packages/kokkos/doc/index.doc new file mode 100644 index 0000000000000000000000000000000000000000..27a9e4f2e7b90e11bbcde7309e9bf1544e3b386f --- /dev/null +++ b/packages/kokkos/doc/index.doc @@ -0,0 +1,72 @@ +/*! +\mainpage Trilinos/Kokkos: Shared-memory programming interface and computational kernels + +\section Kokkos_Intro Introduction + +The %Kokkos package has two main components. The first, sometimes +called "%Kokkos Array" or just "%Kokkos," implements a +performance-portable shared-memory parallel programming model and data +containers. The second, called "%Kokkos Classic," consists of +computational kernels that support the %Tpetra package. + +\section Kokkos_Kokkos The %Kokkos programming model + +%Kokkos implements a performance-portable shared-memory parallel +programming model and data containers. It lets you write an algorithm +once, and just change a template parameter to get the optimal data +layout for your hardware. 
%Kokkos has back-ends for the following +parallel programming models: + +- Kokkos::Threads: POSIX Threads (Pthreads) +- Kokkos::OpenMP: OpenMP +- Kokkos::Cuda: NVIDIA's CUDA programming model for graphics + processing units (GPUs) +- Kokkos::Serial: No thread parallelism + +%Kokkos also has optimizations for shared-memory parallel systems with +nonuniform memory access (NUMA). Its containers can hold data of any +primitive ("plain old") data type (and some aggregate types). %Kokkos +Array may be used as a stand-alone programming model. + +%Kokkos' parallel operations include the following: + +- parallel_for: a thread-parallel "for loop" +- parallel_reduce: a thread-parallel reduction +- parallel_scan: a thread-parallel prefix scan operation + +as well as expert-level platform-independent interfaces to thread +"teams," per-team "shared memory," synchronization, and atomic update +operations. + +%Kokkos' data containers include the following: + +- Kokkos::View: A multidimensional array suitable for thread-parallel + operations. Its layout (e.g., row-major or column-major) is + optimized by default for the particular thread-parallel device. +- Kokkos::Vector: A drop-in replacement for std::vector that eases + porting from standard sequential C++ data structures to %Kokkos' + parallel data structures. +- Kokkos::UnorderedMap: A parallel lookup table comparable in + functionality to std::unordered_map. + +%Kokkos also uses the above basic containers to implement higher-level +data structures, like sparse graphs and matrices. + +A good place to start learning about %Kokkos would be <a href="http://trilinos.sandia.gov/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf">these tutorial slides</a> from the 2013 Trilinos Users' Group meeting. + +\section Kokkos_Classic %Kokkos Classic + +"%Kokkos Classic" consists of computational kernels that support the +%Tpetra package. 
These kernels include sparse matrix-vector multiply, +sparse triangular solve, Gauss-Seidel, and dense vector operations. +They are templated on the type of objects (\c Scalar) on which they +operate. This component was not meant to be visible to users; it is +an implementation detail of the %Tpetra distributed linear algebra +package. + +%Kokkos Classic also implements a shared-memory parallel programming +model. This inspired and preceded the %Kokkos programming model +described in the previous section. Users should consider the %Kokkos +Classic programming model deprecated, and prefer the new %Kokkos +programming model. +*/ diff --git a/packages/kokkos/doc/kokkos-promotion.txt b/packages/kokkos/doc/kokkos-promotion.txt new file mode 100644 index 0000000000000000000000000000000000000000..81489e6e132d1499ac488fe603deb6a2d87f45b3 --- /dev/null +++ b/packages/kokkos/doc/kokkos-promotion.txt @@ -0,0 +1,267 @@ +Summary: + +- Step 1: Testing Kokkos + KokkosKernels using test_all_sandia + +- Step 2: Testing of Kokkos + KokkosKernels integrated into Trilinos (scripts/trilinos-integration/*.sh) + +- Step 3: Close all issues labeled "InDevelop" + +- Step 4: Locally update CHANGELOG, merge into master, edit scripts/master_history.txt + +- Step 5: Locally snapshot new master of kokkos and kokkos-kernels into corresponding Trilinos branch (develop or temporary), issue PR to Trilinos + +- Step 6: Push local Kokkos master to GitHub (need Owner approval). Push local KokkosKernels master to GitHub (need Owner approval) + +Steps 1, 2, and 4 include testing that may fail. These failures must be fixed either by pull requests to Kokkos and/or KokkosKernels develop, or by creating a new Trilinos branch for parts of Trilinos that must be updated. This is what usually takes the most time. + + +// -------------------------------------------------------------------------------- // + + +Step 1: The following should be repeated on enough machines to cover all +supported compilers. 
Those machines are: + + kokkos-dev + blake + white + bowman + waterman + ride + + 1.1. Clone kokkos develop branch (or just switch to it) + + git clone -b develop git@github.com:kokkos/kokkos.git + cd kokkos + + 1.2. Create a testing directory + + mkdir testing + cd testing + + 1.3. Run the test_all_sandia script with no options to test all compilers + + nohup ../scripts/test_all_sandia & + tail -f nohup.out # to watch progress + + NOTE: To kill jobs not running in the background (hopefully prevent ghost processes): + + control+z + kill -9 %1 + + 1.4. Clone kokkos-kernels develop branch (or just switch to it) + + git clone -b develop git@github.com:kokkos/kokkos-kernels.git + cd kokkos-kernels + + 1.5. Create a testing directory + + mkdir testing + cd testing + + 1.6. Run the test_all_sandia script with no options to test all compilers + + nohup ../scripts/test_all_sandia & + tail -f nohup.out # to watch progress + + NOTE: To kill jobs not running in the background (hopefully prevent ghost processes): + + control+z + kill -9 %1 + +// -------------------------------------------------------------------------------- // + +Step 2: + 2.1. Build and test Trilinos with the following configurations: + a) serial, openmp, and cuda via the testing scripts in kokkos-kernels/scripts/trilinos-integration (automates the process) + b) various ATDM-supported builds via Trilinos configuration scripts located in kokkos{-kernels}/scripts/trilinos-integration/ATDM_configurations (not yet automated) + + Run scripts for automated testing on white (openmp and cuda) and blake (serial) that are provided in kokkos{-kernels}/scripts/trilinos-integration. + These scripts load their own modules/environment, so don't require preparation. You can run all four at the same time, use separate directories for each. 
+ + mkdir serial + cd serial + nohup KOKKOSKERNELS_PATH/scripts/trilinos-integration/blake_jenkins_run_script_serial_intel & + + Use scripts to configure Trilinos for waterman (cuda, cuda-debug) and ride (cuda-rdc) that are provided in kokkos-kernels/scripts/trilinos-integration/ATDM_configurations. + + These scripts load their own modules/environment, so don't require preparation of the system environment. You can run them all at the same time, just use separate directories for each. Instructions for compute node allocation, building, and testing are included in the scripts. + + The Trilinos configuration scripts include an override of the kokkos and kokkos-kernels packages; this requires that a symbolic link for each be created in the Trilinos base directory: + + cd Trilinos + ln -s <PATH_TO_YOUR_KOKKOS> kokkos + ln -s <PATH_TO_YOUR_KOKKOSKERNELS> kokkos-kernels + + 2.2. Compare the compile errors and test failures between updated and pristine versions; the ATDM configurations scripts should have 0 build errors and 100% passing tests. There may be compile failures that happen in both, tests that fail in both, and there may be tests that only fail some times (thus, rerun tests manually as needed). + +// -------------------------------------------------------------------------------- // + +Step 3: Close all issues labeled "InDevelop" + + Use the GitHub web interface: https://github.com/kokkos/kokkos/issues?q=is%3Aopen+is%3Aissue+label%3AInDevelop + Select all with checkbox in upper left, "Mark as closed" + + Use the GitHub web interface: https://github.com/kokkos/kokkos-kernels/issues?q=is%3Aopen+is%3Aissue+label%3AInDevelop + Select all with checkbox in upper left, "Mark as closed" + +// -------------------------------------------------------------------------------- // + +Step 4: This step should be run on kokkos-dev + + 4.1. 
If you don't have a GitHub token already, generate one for yourself (this will give you TOKEN): + + https://github.com/settings/tokens + + 4.2. Get a clean copy of the kokkos and kokkos-kernels develop branches + + git clone -b develop git@github.com:kokkos/kokkos.git + git clone -b develop git@github.com:kokkos/kokkos-kernels.git + + 4.3. If you haven't already, install Ruby and the "github_changelog_generator" "gem" + The github_changelog_generator is here: https://github.com/skywinder/github-changelog-generator + Its compatible Ruby version can be found here: https://github.com/skywinder/github-changelog-generator/blob/master/.ruby-version + Grab the corresponding Ruby version from here: https://www.ruby-lang.org/en/downloads/ + Follow the usual configure,make,make install process: https://www.ruby-lang.org/en/documentation/installation/#building-from-source + Note that you will likely have to install to a non-default location with "./configure --prefix=/path" + + 4.4. Generate the initial changelog(s). Use the most recent tag as OLDTAG (`git tag -l` can show you all tags). The NEWTAG is the new version number, e.g. "2.04.00". + + RUN THIS OUTSIDE THE KOKKOS SOURCE TREE! + + NOTE: You likely need to set an HTTPS proxy in order for this script to work: + + export https_proxy=http://wwwproxy.sandia.gov:80 + + github_changelog_generator kokkos/kokkos --token TOKEN --no-pull-requests --include-labels 'InDevelop' --exclude-labels 'question,DevelopOnly' --enhancement-labels 'enhancement,Feature Request' --future-release 'NEWTAG' --between-tags 'NEWTAG,OLDTAG' + + github_changelog_generator kokkos/kokkos-kernels --token TOKEN --no-pull-requests --include-labels 'InDevelop' --exclude-labels 'question,DevelopOnly' --enhancement-labels 'enhancement,Feature Request' --future-release 'NEWTAG' --between-tags 'NEWTAG,OLDTAG' + + 4.5. Manually cleanup and commit the change log. 
+ (Copy the new section from the generated CHANGELOG.md to the corresponding KOKKOS_PATH/CHANGELOG.md or KOKKOSKERNELS_PATH/CHANGELOG.md) + (Make desired changes to CHANGELOG.md to enhance clarity (remove issues not noteworthy)) + (Commit the CHANGELOG.md locally to develop) + + The changelog commit message should be: + + Adding Changelog for Release A.B.CD + + Part of Kokkos C++ Performance Portability Programming EcoSystem A.B + + 4.6. Merge develop into master. DO NOT FAST-FORWARD THE MERGE!!!! + DO NOT USE AN OLD GIT VERSION!!!! + + (From kokkos directory): + git checkout master + git merge --no-ff develop + + The merge commit message should be: + + Merge branch 'develop' for A.B.CD + + Part of Kokkos C++ Performance Portability Programming EcoSystem A.B + + (From kokkos-kernels directory): + git checkout master + git merge --no-ff develop + + The merge commit message should be: + + Merge branch 'develop' for A.B.CD + + Part of Kokkos C++ Performance Portability Programming EcoSystem A.B + + 4.7. Update the tag in kokkos/master_history.txt, then update the tag in kokkos-kernels/master_history.txt + + Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate + Tag field widths: #.#.## + date description: month:day:year + date field widths: ##:##:#### + master description: SHA1 of previous master commit (use `git log --first-parent master`) + develop description: SHA1 of merged develop branch (use `git log develop`) + SHA1 field width: ######## (8 chars) + + # Append to scripts/master_history.txt: + + tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a + + git commit --amend -a + + Keep the merge commit as described in 4.6 + + 4.8. Create the new tag (repeat for kokkos and kokkos-kernels): + + git tag -a #.#.## + + (type the following into the tag message (same as for step 4.7)) + tag: #.#.## + date: mm/dd/yyyy + master: sha1 + develop: sha1 + + 4.9. DO NOT PUSH YET !!! 
+ + +// -------------------------------------------------------------------------------- // + +Step 5: This step can be done on any SEMS machine (e.g. kokkos-dev). + + 5.1. Clone the Trilinos corresponding branch (or just switch to it) + + git clone -b kokkos-promotion git@github.com:trilinos/Trilinos.git + TRILINOS_PATH=$PWD/Trilinos + + 5.2. Snapshot Kokkos into Trilinos - this requires python/2.7.9 and that both Trilinos and Kokkos be clean - no untracked or modified files. Run the following outside of the Kokkos and Trilinos source trees. + + * Use the master branch of Kokkos for this. + + module load sems-python/2.7.9 + python $KOKKOS_PATH/scripts/snapshot.py $KOKKOS_PATH $TRILINOS_PATH/packages + python $KOKKOS_PATH/scripts/snapshot.py $KOKKOSKERNELS_PATH $TRILINOS_PATH/packages + + If snapshotting kokkos-kernels, use the snapshot.py in kokkos. + + 5.3. Push this Trilinos branch to GitHub, open a pull request for it. + The pull request title should be: + + Kokkos + KokkosKernels Promotion To X.X.XX + + In the message body, mention @trilinos/kokkos and @trilinos/kokkos-kernels, + + Add a short description of the most significant features and bug fixes, + + Then add: + + ## Kokkos Changelog + + And copy-paste the content for this release from the kokkos CHANGELOG.md file, then do the same for kokkos-kernels under: + + ## KokkosKernels Changelog + + 5.4. Wait for Trilinos Autotester results + + 5.5. If there are failures, fix and backtrack. Otherwise, merge into Trilinos' develop branch and go to next step + +// -------------------------------------------------------------------------------- // + +Step 6: Push Kokkos + KokkosKernels master and develop branches to respective GitHub repos (requires Owner permission). + + 6.1. Master branch: + cd $KOKKOS_PATH + git checkout master + git push --follow-tags origin master + + cd $KOKKOSKERNELS_PATH + git checkout master + git push --follow-tags origin master + + 6.2. 
Develop branch: First merge (--no-ff) master back into develop + cd $KOKKOS_PATH + git checkout develop + git merge --no-ff master + git push origin develop + + cd $KOKKOSKERNELS_PATH + git checkout develop + git merge --no-ff master + git push origin develop + diff --git a/packages/kokkos/example/CMakeLists.txt b/packages/kokkos/example/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3db566f83f923947af5993ccb0156a5836296548 --- /dev/null +++ b/packages/kokkos/example/CMakeLists.txt @@ -0,0 +1,11 @@ + + +# Subpackage name must match what appears in kokkos/cmake/Dependencies.cmake +# +KOKKOS_SUBPACKAGE(Example) + +KOKKOS_ADD_EXAMPLE_DIRECTORIES(query_device) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(tutorial) + +KOKKOS_SUBPACKAGE_POSTPROCESS() + diff --git a/packages/kokkos/example/README b/packages/kokkos/example/README new file mode 100644 index 0000000000000000000000000000000000000000..66860512448a25c0019b862babbbe08d7cf351cf --- /dev/null +++ b/packages/kokkos/example/README @@ -0,0 +1,9 @@ +This directory contains example application proxies that use different +parts of Kokkos. If you are looking for the FENL ("finite element +nonlinear" solve) example, it has moved into the LinAlg subpackage of +Tpetra. 
+ +MANIFEST: + + - query_device: Kokkos' HWLOC wrapper for querying device topology + - tutorial: Kokkos tutorial (START HERE) diff --git a/packages/kokkos/example/build_cmake_in_tree/CMakeLists.txt b/packages/kokkos/example/build_cmake_in_tree/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7217807072c7fa3fe1dd3b9d0656e9733c34bed8 --- /dev/null +++ b/packages/kokkos/example/build_cmake_in_tree/CMakeLists.txt @@ -0,0 +1,19 @@ +# Kokkos minimally requires 3.10 right now, +# but your project can set it higher +cmake_minimum_required(VERSION 3.10) + +# Project can mix languages - must have C++ support +# Kokkos flags are only applied to C++ files +project(Example CXX) + +# We build kokkos as a subdirectory of our project +add_subdirectory(${Example_SOURCE_DIR}/../.. ${Example_BINARY_DIR}/kokkos) + +add_executable(example cmake_example.cpp) + +# This is the only line required to set up all compiler/linker flags +target_link_libraries(example Kokkos::kokkos) + +# Adds a test for the executable +enable_testing() +add_test(NAME KokkosInTree_Verify COMMAND example 10) diff --git a/packages/kokkos/example/build_cmake_in_tree/cmake_example.cpp b/packages/kokkos/example/build_cmake_in_tree/cmake_example.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b0fd9822a492aa55a797dd76277e775a8afcbd24 --- /dev/null +++ b/packages/kokkos/example/build_cmake_in_tree/cmake_example.cpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + Kokkos::DefaultExecutionSpace::print_configuration(std::cout); + + if (argc < 2) { + fprintf(stderr, "Usage: %s [<kokkos_options>] <size>\n", argv[0]); + Kokkos::finalize(); + exit(1); + } + + const long n = strtol(argv[1], nullptr, 10); + + printf("Number of even integers from 0 to %ld\n", n - 1); + + Kokkos::Timer timer; + timer.reset(); + + // Compute the number of even integers from 0 to n-1, in parallel. + long count = 0; + Kokkos::parallel_reduce( + n, KOKKOS_LAMBDA(const long i, long& lcount) { lcount += (i % 2) == 0; }, + count); + + double count_time = timer.seconds(); + printf(" Parallel: %ld %10.6f\n", count, count_time); + + timer.reset(); + + // Compare to a sequential loop. + long seq_count = 0; + for (long i = 0; i < n; ++i) { + seq_count += (i % 2) == 0; + } + + count_time = timer.seconds(); + printf("Sequential: %ld %10.6f\n", seq_count, count_time); + + Kokkos::finalize(); + + return (count == seq_count) ? 
0 : -1; +} diff --git a/packages/kokkos/example/build_cmake_installed/CMakeLists.txt b/packages/kokkos/example/build_cmake_installed/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7998d2914d586a8725d39992a53cfa56ed436ad5 --- /dev/null +++ b/packages/kokkos/example/build_cmake_installed/CMakeLists.txt @@ -0,0 +1,24 @@ +# Kokkos minimally requires 3.10 right now, +# but your project can set it higher +cmake_minimum_required(VERSION 3.10) + +# Projects can safely mix languages - must have C++ support +# Kokkos flags will only apply to C++ files +project(Example CXX Fortran) + +# You need this for using Kokkos_ROOT variable +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.12.0") + message(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables") + cmake_policy(SET CMP0074 NEW) +endif() + +# Look for an installed Kokkos +find_package(Kokkos REQUIRED) + +add_executable(example cmake_example.cpp foo.f) + +# This is the only thing required to set up compiler/linker flags +target_link_libraries(example Kokkos::kokkos) + +enable_testing() +add_test(NAME KokkosInTree_Verify COMMAND example 10) diff --git a/packages/kokkos/example/build_cmake_installed/cmake_example.cpp b/packages/kokkos/example/build_cmake_installed/cmake_example.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fd05172cb83ff3052b0a054e2a72475825555d93 --- /dev/null +++ b/packages/kokkos/example/build_cmake_installed/cmake_example.cpp @@ -0,0 +1,97 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +extern "C" void print_fortran_(); + +struct CountFunctor { + KOKKOS_FUNCTION void operator()(const long i, long& lcount) const { + lcount += (i % 2) == 0; + } +}; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + Kokkos::DefaultExecutionSpace::print_configuration(std::cout); + + if (argc < 2) { + fprintf(stderr, "Usage: %s [<kokkos_options>] <size>\n", argv[0]); + Kokkos::finalize(); + exit(1); + } + + const long n = strtol(argv[1], nullptr, 10); + + printf("Number of even integers from 0 to %ld\n", n - 1); + + Kokkos::Timer timer; + timer.reset(); + + // Compute the number of even integers from 0 to n-1, in parallel. + long count = 0; + CountFunctor functor; + Kokkos::parallel_reduce(n, functor, count); + + double count_time = timer.seconds(); + printf(" Parallel: %ld %10.6f\n", count, count_time); + + timer.reset(); + + // Compare to a sequential loop. + long seq_count = 0; + for (long i = 0; i < n; ++i) { + seq_count += (i % 2) == 0; + } + + count_time = timer.seconds(); + printf("Sequential: %ld %10.6f\n", seq_count, count_time); + + print_fortran_(); + + Kokkos::finalize(); + + return (count == seq_count) ? 
0 : -1; +} diff --git a/packages/kokkos/example/build_cmake_installed/foo.f b/packages/kokkos/example/build_cmake_installed/foo.f new file mode 100644 index 0000000000000000000000000000000000000000..e618455283b65602d98a5de00c8dc2abc6b0f8c2 --- /dev/null +++ b/packages/kokkos/example/build_cmake_installed/foo.f @@ -0,0 +1,4 @@ + FUNCTION print_fortran() + PRINT *, 'Hello World from Fortran' + RETURN + END diff --git a/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt b/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..df16774e742e9f60a116a5a8dcdf93bcc17b0606 --- /dev/null +++ b/packages/kokkos/example/build_cmake_installed_different_compiler/CMakeLists.txt @@ -0,0 +1,29 @@ +# Kokkos minimally requires 3.16 right now, +# but your project can set it higher +cmake_minimum_required(VERSION 3.16) + +# Projects can safely mix languages - must have C++ support +# Kokkos flags will only apply to C++ files +project(Example CXX Fortran) + +# You need this for using Kokkos_ROOT variable +message(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables") +cmake_policy(SET CMP0074 NEW) + +# Look for an installed Kokkos but force using the compiler launcher +# to ensure that targets depending on Kokkos use the same compiler +# as when kokkos was installed, e.g. 
if kokkos was built with +# g++ and the CMAKE_CXX_COMPILER=clang++ then example_with_kokkos +# will be compiled and linked with g++ whereas example_no_kokkos +# will be compiled and linked with clang++ +find_package(Kokkos REQUIRED COMPONENTS launch_compiler) + +add_executable(example_no_kokkos bar.cpp) +add_executable(example_with_kokkos foo.cpp) + +# This is the only thing required to set up compiler/linker flags +target_link_libraries(example_with_kokkos Kokkos::kokkos) + +enable_testing() +add_test(NAME KokkosLauncher_NoKokkos_Verify COMMAND example_no_kokkos 10) +add_test(NAME KokkosLauncher_WithKokkos_Verify COMMAND example_with_kokkos 10) diff --git a/packages/kokkos/example/build_cmake_installed_different_compiler/bar.cpp b/packages/kokkos/example/build_cmake_installed_different_compiler/bar.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e02c2b8c688650fe3c5e0beefb5ea1ce01de2fa8 --- /dev/null +++ b/packages/kokkos/example/build_cmake_installed_different_compiler/bar.cpp @@ -0,0 +1,7 @@ + +#include <cstdio> + +int main() { + puts("hello world!"); + return 0; +} diff --git a/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp b/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fc10366f71bd9b0d421b18e935c2cea86925904b --- /dev/null +++ b/packages/kokkos/example/build_cmake_installed_different_compiler/foo.cpp @@ -0,0 +1,93 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +struct CountFunctor { + KOKKOS_FUNCTION void operator()(const long i, long& lcount) const { + lcount += (i % 2) == 0; + } +}; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + Kokkos::DefaultExecutionSpace::print_configuration(std::cout); + + if (argc < 2) { + fprintf(stderr, "Usage: %s [<kokkos_options>] <size>\n", argv[0]); + Kokkos::finalize(); + exit(1); + } + + const long n = strtol(argv[1], nullptr, 10); + + printf("Number of even integers from 0 to %ld\n", n - 1); + + Kokkos::Timer timer; + timer.reset(); + + // Compute the number of even integers from 0 to n-1, in parallel. + long count = 0; + CountFunctor functor; + Kokkos::parallel_reduce(n, functor, count); + + double count_time = timer.seconds(); + printf(" Parallel: %ld %10.6f\n", count, count_time); + + timer.reset(); + + // Compare to a sequential loop. + long seq_count = 0; + for (long i = 0; i < n; ++i) { + seq_count += (i % 2) == 0; + } + + count_time = timer.seconds(); + printf("Sequential: %ld %10.6f\n", seq_count, count_time); + + Kokkos::finalize(); + + return (count == seq_count) ? 0 : -1; +} diff --git a/packages/kokkos/example/make_buildlink/Makefile b/packages/kokkos/example/make_buildlink/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..80c8d2f260d55987b8a1af502ecbf4fe52b64fe6 --- /dev/null +++ b/packages/kokkos/example/make_buildlink/Makefile @@ -0,0 +1,50 @@ +KOKKOS_DEVICES=OpenMP +KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Kepler35" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = make_buildlink.cuda +else +CXX = g++ +EXE = make_buildlink.host +endif + +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} +LINKFLAGS = + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: build + ./$(EXE) + +$(EXE): $(SRC) $(KOKKOS_LINK_DEPENDS) $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(KOKKOS_CXXLDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(SRC) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + diff --git a/packages/kokkos/example/make_buildlink/README b/packages/kokkos/example/make_buildlink/README new file mode 100644 index 0000000000000000000000000000000000000000..bcbb3dcd3409d7a2af826fe6627ca6aeb6f13cc2 --- /dev/null +++ b/packages/kokkos/example/make_buildlink/README @@ -0,0 +1,2 @@ +This example provides a template and test for compiling and linking in a single command. + diff --git a/packages/kokkos/example/make_buildlink/main.cpp b/packages/kokkos/example/make_buildlink/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2dbfb2687c118ba57a3b57d4dceaae6ffee2bea4 --- /dev/null +++ b/packages/kokkos/example/make_buildlink/main.cpp @@ -0,0 +1,13 @@ +#include <Kokkos_Core.hpp> + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + int N = (argc > 1) ? std::stoi(argv[1]) : 10000; + int M = (argc > 2) ? std::stoi(argv[2]) : 10000; + int R = (argc > 3) ? 
std::stoi(argv[3]) : 10; + + printf("Called with: %i %i %i\n", N, M, R); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/query_device/CMakeLists.txt b/packages/kokkos/example/query_device/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..86956ba3ba4855d0e769d92fd32e7b225c603157 --- /dev/null +++ b/packages/kokkos/example/query_device/CMakeLists.txt @@ -0,0 +1,12 @@ + +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +SET(SOURCES "") +FILE(GLOB SOURCES *.cpp) + +KOKKOS_ADD_EXECUTABLE( + query_device + SOURCES ${SOURCES} +) + diff --git a/packages/kokkos/example/query_device/Makefile b/packages/kokkos/example/query_device/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..42b376ec7c5cf73537bf2d49340ce1ca963e3ad1 --- /dev/null +++ b/packages/kokkos/example/query_device/Makefile @@ -0,0 +1,46 @@ +KOKKOS_PATH ?= ../.. + +MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +SRC_DIR := $(dir $(MAKEFILE_PATH)) + +SRC = $(wildcard $(SRC_DIR)/*.cpp) +OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o) + +#SRC = $(wildcard *.cpp) +#OBJ = $(SRC:%.cpp=%.o) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) + CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper + EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR))) +else + CXX = g++ + EXE = $(addsuffix .host, $(shell basename $(SRC_DIR))) +endif + +CXXFLAGS = -O3 -I$(SRC_DIR) +LINK ?= $(CXX) +LDFLAGS ?= + +include $(KOKKOS_PATH)/Makefile.kokkos + +DEPFLAGS = -M + +LIB = + + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: + rm -f *.a *.o *.cuda *.host + +# Compilation rules + +%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< + diff --git a/packages/kokkos/example/query_device/query_device.cpp 
b/packages/kokkos/example/query_device/query_device.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a563b06b2864d5d0e855a80b836f3ef70f33f3a1 --- /dev/null +++ b/packages/kokkos/example/query_device/query_device.cpp @@ -0,0 +1,100 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <iostream> +#include <sstream> + +#include <Kokkos_Macros.hpp> + +#if defined(KOKKOS_ENABLE_MPI) +#include <mpi.h> +#endif + +#include <Kokkos_Core.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +int main(int argc, char** argv) { + std::ostringstream msg; + + (void)argc; + (void)argv; +#if defined(KOKKOS_ENABLE_MPI) + + MPI_Init(&argc, &argv); + + int mpi_rank = 0; + + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + msg << "MPI rank(" << mpi_rank << ") "; + +#endif + + msg << "{" << std::endl; + + if (Kokkos::hwloc::available()) { + msg << "hwloc( NUMA[" << Kokkos::hwloc::get_available_numa_count() + << "] x CORE[" << Kokkos::hwloc::get_available_cores_per_numa() + << "] x HT[" << Kokkos::hwloc::get_available_threads_per_core() << "] )" + << std::endl; + } + +#if defined(KOKKOS_ENABLE_CUDA) + Kokkos::Cuda::print_configuration(msg); +#endif + + msg << "}" << std::endl; + + std::cout << msg.str(); + +#if defined(KOKKOS_ENABLE_MPI) + + MPI_Finalize(); + +#endif + + return 0; +} diff --git a/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt b/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt new file mode 100644 index 
0000000000000000000000000000000000000000..e1b90b133ddec1dc8a848a9ab8d2253980edd301 --- /dev/null +++ b/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt @@ -0,0 +1,10 @@ + +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_01_hello_world + SOURCES hello_world.cpp +) + diff --git a/packages/kokkos/example/tutorial/01_hello_world/Makefile b/packages/kokkos/example/tutorial/01_hello_world/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..02a0fb10a05d2317fb057d33f5096ce4c5b69131 --- /dev/null +++ b/packages/kokkos/example/tutorial/01_hello_world/Makefile @@ -0,0 +1,48 @@ +KOKKOS_PATH = ../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/01_hello_world/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_hello_world.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_hello_world.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..5810e0ee7a267c79a823d00516937b8ccd43c0d9 --- /dev/null +++ b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp @@ -0,0 +1,135 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> +#include <typeinfo> + +// +// "Hello world" parallel_for example: +// 1. Start up Kokkos +// 2. Execute a parallel for loop in the default execution space, +// using a functor to define the loop body +// 3. Shut down Kokkos +// +// If Kokkos was built with C++11 enabled, try comparing this example +// to 01_hello_world_lambda. The latter uses C++11 lambdas (anonymous +// functions) to define the loop body of the parallel_for. That makes +// the code much more concise and readable. On the other hand, +// breaking out the loop body into an explicit functor makes it easier +// to test the loop independently of the parallel pattern. +// + +// Functor that defines the parallel_for's loop body. +// +// A "functor" is just a class or struct with a public operator() +// instance method. +struct hello_world { + // If a functor has an "execution_space" (or "execution_space", for + // backwards compatibility) public alias, parallel_* will only run + // the functor in that execution space. That's a good way to mark a + // functor as specific to an execution space. 
If the functor lacks + // this alias, parallel_for will run it in the default execution + // space, unless you tell it otherwise (that's an advanced topic; + // see "execution policies"). + + // The functor's operator() defines the loop body. It takes an + // integer argument which is the parallel for loop index. Other + // arguments are possible; see the "hierarchical parallelism" part + // of the tutorial. + // + // The operator() method must be const, and must be marked with the + // KOKKOS_INLINE_FUNCTION macro. If building with CUDA, this macro + // will mark your method as suitable for running on the CUDA device + // (as well as on the host). If not building with CUDA, the macro + // is unnecessary but harmless. + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + // FIXME_SYCL needs workaround for printf +#ifndef __SYCL_DEVICE_ONLY__ + printf("Hello from i = %i\n", i); +#else + (void)i; +#endif + } +}; + +int main(int argc, char* argv[]) { + // You must call initialize() before you may call Kokkos. + // + // With no arguments, this initializes the default execution space + // (and potentially its host execution space) with default + // parameters. You may also pass in argc and argv, analogously to + // MPI_Init(). It reads and removes command-line arguments that + // start with "--kokkos-". + Kokkos::initialize(argc, argv); + + // Print the name of Kokkos' default execution space. We're using + // typeid here, so the name might get a bit mangled by the linker, + // but you should still be able to figure out what it is. + printf("Hello World on Kokkos execution space %s\n", + typeid(Kokkos::DefaultExecutionSpace).name()); + + // Run the above functor on the default Kokkos execution space in + // parallel, with a parallel for loop count of 15. + // + // The Kokkos::DefaultExecutionSpace alias gives the default + // execution space. 
Depending on how Kokkos was configured, this + // could be OpenMP, Threads, Cuda, Serial, or even some other + // execution space. + // + // The following line of code would look like this in OpenMP: + // + // #pragma omp parallel for + // for (int i = 0; i < 15; ++i) { + // printf ("Hello from i = %i\n", i); + // } + // + // You may notice that the printed numbers do not print out in + // order. Parallel for loops may execute in any order. + Kokkos::parallel_for("HelloWorld", 15, hello_world()); + + // You must call finalize() after you are done using Kokkos. + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a939a5f0ded6d2953af557e6c62fe783ba7b559e --- /dev/null +++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt @@ -0,0 +1,10 @@ + +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_01_hello_world_lambda + SOURCES hello_world_lambda.cpp +) + diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/Makefile b/packages/kokkos/example/tutorial/01_hello_world_lambda/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4fe3765c521978c565923f7eac746e08a0042b14 --- /dev/null +++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/01_hello_world_lambda/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_hello_world_lambda.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_CUDA_OPTIONS += "enable_lambda" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_hello_world_lambda.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..06f209774eae10a4a11161d17aae979450d6e850 --- /dev/null +++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp @@ -0,0 +1,118 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> +#include <typeinfo> + +// +// "Hello world" parallel_for example: +// 1. Start up Kokkos +// 2. Execute a parallel for loop in the default execution space, +// using a C++11 lambda to define the loop body +// 3. Shut down Kokkos +// +// This example only builds if C++11 is enabled. Compare this example +// to 01_hello_world, which uses functors (explicitly defined classes) +// to define the loop body of the parallel_for. Both functors and +// lambdas have their places. 
+// + +int main(int argc, char* argv[]) { + // You must call initialize() before you may call Kokkos. + // + // With no arguments, this initializes the default execution space + // (and potentially its host execution space) with default + // parameters. You may also pass in argc and argv, analogously to + // MPI_Init(). It reads and removes command-line arguments that + // start with "--kokkos-". + Kokkos::initialize(argc, argv); + + // Print the name of Kokkos' default execution space. We're using + // typeid here, so the name might get a bit mangled by the linker, + // but you should still be able to figure out what it is. + printf("Hello World on Kokkos execution space %s\n", + typeid(Kokkos::DefaultExecutionSpace).name()); + + // Run lambda on the default Kokkos execution space in parallel, + // with a parallel for loop count of 15. The lambda's argument is + // an integer which is the parallel for's loop index. As you learn + // about different kinds of parallelism, you will find out that + // there are other valid argument types as well. + // + // For a single level of parallelism, we prefer that you use the + // KOKKOS_LAMBDA macro. If CUDA is disabled, this just turns into + // [=]. That captures variables from the surrounding scope by + // value. Do NOT capture them by reference! If CUDA is enabled, + // this macro may have a special definition that makes the lambda + // work correctly with CUDA. Compare to the KOKKOS_INLINE_FUNCTION + // macro, which has a special meaning if CUDA is enabled. + // + // The following parallel_for would look like this if we were using + // OpenMP by itself, instead of Kokkos: + // + // #pragma omp parallel for + // for (int i = 0; i < 15; ++i) { + // printf ("Hello from i = %i\n", i); + // } + // + // You may notice that the printed numbers do not print out in + // order. Parallel for loops may execute in any order. 
+ // We also need to protect the usage of a lambda against compiling + // with a backend which doesn't support it (i.e. Cuda 6.5/7.0). +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + 15, KOKKOS_LAMBDA(const int i) { + // FIXME_SYCL needs workaround for printf +#ifndef __SYCL_DEVICE_ONLY__ + // printf works in a CUDA parallel kernel; std::ostream does not. + printf("Hello from i = %i\n", i); +#else + (void)i; +#endif + }); +#endif + // You must call finalize() after you are done using Kokkos. + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt b/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..21b0c38014b4cd923d9c8ecea07ed645a2775c6e --- /dev/null +++ b/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt @@ -0,0 +1,9 @@ + +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_02_simple_reduce + SOURCES simple_reduce.cpp +) diff --git a/packages/kokkos/example/tutorial/02_simple_reduce/Makefile b/packages/kokkos/example/tutorial/02_simple_reduce/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..72b94bb2655ef05cce2a14af85544eb0bc9eea3a --- /dev/null +++ b/packages/kokkos/example/tutorial/02_simple_reduce/Makefile @@ -0,0 +1,48 @@ +KOKKOS_PATH = ../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/02_simple_reduce/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_simple_reduce.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_simple_reduce.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp b/packages/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2b7668e515bcbb83ec0c073ebb1422df9a85ba7a --- /dev/null +++ b/packages/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp @@ -0,0 +1,105 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// +// First reduction (parallel_reduce) example: +// 1. Start up Kokkos +// 2. Execute a parallel_reduce loop in the default execution space, +// using a functor to define the loop body +// 3. Shut down Kokkos +// +// Compare this example to 02_simple_reduce_lambda, which uses a C++11 +// lambda to define the loop body of the parallel_reduce. +// + +// Reduction functor for computing the sum of squares. 
+// +// More advanced reduction examples will show how to control the +// reduction's "join" operator. If the join operator is not provided, +// it defaults to binary operator+ (adding numbers together). +struct squaresum { + // Specify the type of the reduction value with a "value_type" + // alias. In this case, the reduction value has type int. + using value_type = int; + + // The reduction functor's operator() looks a little different than + // the parallel_for functor's operator(). For the reduction, we + // pass in both the loop index i, and the intermediate reduction + // value lsum. The latter MUST be passed in by nonconst reference. + // (If the reduction type is an array like int[], indicating an + // array reduction result, then the second argument is just int[].) + KOKKOS_INLINE_FUNCTION + void operator()(const int i, int& lsum) const { + lsum += i * i; // compute the sum of squares + } +}; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + const int n = 10; + + // Compute the sum of squares of integers from 0 to n-1, in + // parallel, using Kokkos. + int sum = 0; + Kokkos::parallel_reduce(n, squaresum(), sum); + printf( + "Sum of squares of integers from 0 to %i, " + "computed in parallel, is %i\n", + n - 1, sum); + + // Compare to a sequential loop. + int seqSum = 0; + for (int i = 0; i < n; ++i) { + seqSum += i * i; + } + printf( + "Sum of squares of integers from 0 to %i, " + "computed sequentially, is %i\n", + n - 1, seqSum); + Kokkos::finalize(); + return (sum == seqSum) ? 
0 : -1; +} diff --git a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..82a87be4bdc46baec421c0363a1481e1ae07e001 --- /dev/null +++ b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt @@ -0,0 +1,9 @@ + +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +KOKKOS_ADD_EXECUTABLE( + tutorial_02_simple_reduce_lambda + SOURCES simple_reduce_lambda.cpp +) + diff --git a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ed59338a64a9810493cbd81e4f181d9d49b65cc3 --- /dev/null +++ b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile @@ -0,0 +1,50 @@ +KOKKOS_PATH = ../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/02_simple_reduce_lambda/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_simple_reduce_lambda.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_CUDA_OPTIONS += "enable_lambda" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_simple_reduce_lambda.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) 
$(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f4c356e0ce90896f709922cc10556a6fa08cd04 --- /dev/null +++ b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp @@ -0,0 +1,97 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// +// First reduction (parallel_reduce) example: +// 1. Start up Kokkos +// 2. Execute a parallel_reduce loop in the default execution space, +// using a C++11 lambda to define the loop body +// 3. Shut down Kokkos +// +// This example only builds if C++11 is enabled. Compare this example +// to 02_simple_reduce, which uses a functor to define the loop body +// of the parallel_reduce. +// + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + const int n = 10; + + // Compute the sum of squares of integers from 0 to n-1, in + // parallel, using Kokkos. This time, use a lambda instead of a + // functor. The lambda takes the same arguments as the functor's + // operator(). + int sum = 0; +// The KOKKOS_LAMBDA macro replaces the capture-by-value clause [=]. +// It also handles any other syntax needed for CUDA. +// We also need to protect the usage of a lambda against compiling +// with a backend which doesn't support it (i.e. Cuda 6.5/7.0). 
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_reduce( + n, KOKKOS_LAMBDA(const int i, int& lsum) { lsum += i * i; }, sum); +#endif + printf( + "Sum of squares of integers from 0 to %i, " + "computed in parallel, is %i\n", + n - 1, sum); + + // Compare to a sequential loop. + int seqSum = 0; + for (int i = 0; i < n; ++i) { + seqSum += i * i; + } + printf( + "Sum of squares of integers from 0 to %i, " + "computed sequentially, is %i\n", + n - 1, seqSum); + Kokkos::finalize(); +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + return (sum == seqSum) ? 0 : -1; +#else + return 0; +#endif +} diff --git a/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt b/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..99a7d39c17ad35dc82eaddbe7fe60f8df15544f6 --- /dev/null +++ b/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt @@ -0,0 +1,9 @@ + +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_03_simple_view + SOURCES simple_view.cpp +) diff --git a/packages/kokkos/example/tutorial/03_simple_view/Makefile b/packages/kokkos/example/tutorial/03_simple_view/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..527a1c7329f2cfe160defa2ac00a2982a2856886 --- /dev/null +++ b/packages/kokkos/example/tutorial/03_simple_view/Makefile @@ -0,0 +1,50 @@ +KOKKOS_PATH = ../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/03_simple_view/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_simple_view.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_simple_view.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +#for unit testing only, for best performance with OpenMP 4.0 or better +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp b/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp new file mode 100644 index 0000000000000000000000000000000000000000..46cac62b9df1ab5c9124dffef75089afc1b718ca --- /dev/null +++ b/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp @@ -0,0 +1,142 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// +// First Kokkos::View (multidimensional array) example: +// 1. Start up Kokkos +// 2. Allocate a Kokkos::View +// 3. Execute a parallel_for and a parallel_reduce over that View's data +// 4. Shut down Kokkos +// +// Compare this example to 03_simple_view_lambda, which uses C++11 +// lambdas to define the loop bodies of the parallel_for and +// parallel_reduce. +// + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// A Kokkos::View is an array of zero or more dimensions. 
The number +// of dimensions is specified at compile time, as part of the type of +// the View. This array has two dimensions. The first one +// (represented by the asterisk) is a run-time dimension, and the +// second (represented by [3]) is a compile-time dimension. Thus, +// this View type is an N x 3 array of type double, where N is +// specified at run time in the View's constructor. +// +// The first dimension of the View is the dimension over which it is +// efficient for Kokkos to parallelize. +using view_type = Kokkos::View<double * [3]>; + +// parallel_for functor that fills the View given to its constructor. +// The View must already have been allocated. +struct InitView { + view_type a; + + // Views have "view semantics." This means that they behave like + // pointers, not like std::vector. Their copy constructor and + // operator= only do shallow copies. Thus, you can pass View + // objects around by "value"; they won't do a deep copy unless you + // explicitly ask for a deep copy. + InitView(view_type a_) : a(a_) {} + + // Fill the View with some data. The parallel_for loop will iterate + // over the View's first dimension N. + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + // Access the View just like a Fortran array. The layout depends + // on the View's memory space, so don't rely on the View's + // physical memory layout unless you know what you're doing. + a(i, 0) = 1.0 * i; + a(i, 1) = 1.0 * i * i; + a(i, 2) = 1.0 * i * i * i; + } +}; + +// Reduction functor that reads the View given to its constructor. +struct ReduceFunctor { + view_type a; + + // Constructor takes View by "value"; this does a shallow copy. + ReduceFunctor(view_type a_) : a(a_) {} + + // If you write a functor to do a reduction, you must specify the + // type of the reduction result via a public 'value_type' alias. 
+ using value_type = double; + + KOKKOS_INLINE_FUNCTION + void operator()(int i, double& lsum) const { + lsum += a(i, 0) * a(i, 1) / (a(i, 2) + 0.1); + } +}; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 10; + + // Allocate the View. The first dimension is a run-time parameter + // N. We set N = 10 here. The second dimension is a compile-time + // parameter, 3. We don't specify it here because we already set it + // by declaring the type of the View. + // + // Views get initialized to zero by default. This happens in + // parallel, using the View's memory space's default execution + // space. Parallel initialization ensures first-touch allocation. + // There is a way to shut off default initialization. + // + // You may NOT allocate a View inside of a parallel_{for, reduce, + // scan}. Treat View allocation as a "thread collective." + // + // The string "A" is just the label; it only matters for debugging. + // Different Views may have the same label. + view_type a("A", N); + + Kokkos::parallel_for(N, InitView(a)); + double sum = 0; + Kokkos::parallel_reduce(N, ReduceFunctor(a), sum); + printf("Result: %f\n", sum); + } // use this scope to ensure the lifetime of "A" ends before finalize + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f3d9c52de63f6d864b0961ec59ed1d32c92ff05 --- /dev/null +++ b/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt @@ -0,0 +1,9 @@ + +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. 
+KOKKOS_ADD_EXECUTABLE( + tutorial_03_simple_view_lambda + SOURCES simple_view_lambda.cpp + ) diff --git a/packages/kokkos/example/tutorial/03_simple_view_lambda/Makefile b/packages/kokkos/example/tutorial/03_simple_view_lambda/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..57760b84f7ebe9cb02f7b589f1954b31ae89bae9 --- /dev/null +++ b/packages/kokkos/example/tutorial/03_simple_view_lambda/Makefile @@ -0,0 +1,50 @@ +KOKKOS_PATH = ../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/03_simple_view_lambda/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_simple_view_lambda.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_CUDA_OPTIONS += "enable_lambda" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_simple_view_lambda.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp b/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..33b3a1a7db17740d5f578bace4be2e74e80ed03d --- /dev/null +++ b/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp @@ -0,0 +1,126 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// +// First Kokkos::View (multidimensional array) example: +// 1. Start up Kokkos +// 2. Allocate a Kokkos::View +// 3. Execute a parallel_for and a parallel_reduce over that View's data +// 4. Shut down Kokkos +// +// Compare this example to 03_simple_view, which uses functors to +// define the loop bodies of the parallel_for and parallel_reduce. +// + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// A Kokkos::View is an array of zero or more dimensions. The number +// of dimensions is specified at compile time, as part of the type of +// the View. This array has two dimensions. The first one +// (represented by the asterisk) is a run-time dimension, and the +// second (represented by [3]) is a compile-time dimension. Thus, +// this View type is an N x 3 array of type double, where N is +// specified at run time in the View's constructor. +// +// The first dimension of the View is the dimension over which it is +// efficient for Kokkos to parallelize. +using view_type = Kokkos::View<double * [3]>; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + + { + // Allocate the View. The first dimension is a run-time parameter + // N. We set N = 10 here. The second dimension is a compile-time + // parameter, 3. We don't specify it here because we already set it + // by declaring the type of the View. + // + // Views get initialized to zero by default. This happens in + // parallel, using the View's memory space's default execution + // space. Parallel initialization ensures first-touch allocation. + // There is a way to shut off default initialization. + // + // You may NOT allocate a View inside of a parallel_{for, reduce, + // scan}. Treat View allocation as a "thread collective." + // + // The string "A" is just the label; it only matters for debugging. + // Different Views may have the same label. 
+ view_type a("A", 10); + +// Fill the View with some data. The parallel_for loop will iterate +// over the View's first dimension N. +// +// Note that the View is passed by value into the lambda. The macro +// KOKKOS_LAMBDA includes the "capture by value" clause [=]. This +// tells the lambda to "capture all variables in the enclosing scope +// by value." Views have "view semantics"; they behave like +// pointers, not like std::vector. Passing them by value does a +// shallow copy. A deep copy never happens unless you explicitly +// ask for one. +// We also need to protect the usage of a lambda against compiling +// with a backend which doesn't support it (i.e. Cuda 6.5/7.0). +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + 10, KOKKOS_LAMBDA(const int i) { + // Access the View just like a Fortran array. The layout depends + // on the View's memory space, so don't rely on the View's + // physical memory layout unless you know what you're doing. + a(i, 0) = 1.0 * i; + a(i, 1) = 1.0 * i * i; + a(i, 2) = 1.0 * i * i * i; + }); + // Reduction functor that reads the View given to its constructor. + double sum = 0; + Kokkos::parallel_reduce( + 10, + KOKKOS_LAMBDA(const int i, double& lsum) { + lsum += a(i, 0) * a(i, 1) / (a(i, 2) + 0.1); + }, + sum); + printf("Result: %f\n", sum); +#endif + } + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt b/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..03fb97a133caf6039dd048cb0546f502457ddc34 --- /dev/null +++ b/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt @@ -0,0 +1,9 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. 
+KOKKOS_ADD_EXECUTABLE( + tutorial_04_simple_memoryspaces + SOURCES simple_memoryspaces.cpp +) diff --git a/packages/kokkos/example/tutorial/04_simple_memoryspaces/Makefile b/packages/kokkos/example/tutorial/04_simple_memoryspaces/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..66f6f65a29a3e034f4f0735c51639937d0fd496b --- /dev/null +++ b/packages/kokkos/example/tutorial/04_simple_memoryspaces/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/04_simple_memoryspaces/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 04_simple_memoryspaces.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 04_simple_memoryspaces.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp b/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp new file mode 100644 index 0000000000000000000000000000000000000000..40ad6123e7bb7b1fd8f3b9ad79c01df7105ef5c3 --- /dev/null +++ b/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp @@ -0,0 +1,103 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// The type of a two-dimensional N x 3 array of double. +// It lives in Kokkos' default memory space. +using view_type = Kokkos::View<double * [3]>; + +// The "HostMirror" type corresponding to view_type above is also a +// two-dimensional N x 3 array of double. However, it lives in the +// host memory space corresponding to view_type's memory space. For +// example, if view_type lives in CUDA device memory, host_view_type +// lives in host (CPU) memory. Furthermore, declaring host_view_type +// as the host mirror of view_type means that host_view_type has the +// same layout as view_type. This makes it easier to copy between the +// two Views. +// Advanced issues: If a memory space is accessible from the host without +// performance penalties then it is its own host_mirror_space. This is +// the case for HostSpace, CudaUVMSpace and CudaHostPinnedSpace. + +using host_view_type = view_type::HostMirror; + +struct ReduceFunctor { + view_type a; + ReduceFunctor(view_type a_) : a(a_) {} + using value_type = int; // Specify type for reduction value, lsum + + KOKKOS_INLINE_FUNCTION + void operator()(int i, int &lsum) const { + lsum += a(i, 0) - a(i, 1) + a(i, 2); + } +}; + +int main() { + Kokkos::initialize(); + + { + view_type a("A", 10); + // If view_type and host_mirror_type live in the same memory space, + // a "mirror view" is just an alias, and deep_copy does nothing. + // Otherwise, a mirror view of a device View lives in host memory, + // and deep_copy does a deep copy. + host_view_type h_a = Kokkos::create_mirror_view(a); + + // The View h_a lives in host (CPU) memory, so it's legal to fill + // the view sequentially using ordinary code, like this. 
+ for (int i = 0; i < 10; i++) { + for (int j = 0; j < 3; j++) { + h_a(i, j) = i * 10 + j; + } + } + Kokkos::deep_copy(a, h_a); // Copy from host to device. + + int sum = 0; + Kokkos::parallel_reduce(10, ReduceFunctor(a), sum); + printf("Result is %i\n", sum); + } + + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt b/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..85870e5e504c1b74bfbccaa97e4b9f289d5a28c7 --- /dev/null +++ b/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt @@ -0,0 +1,10 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_05_simple_atomics + SOURCES simple_atomics.cpp +) + diff --git a/packages/kokkos/example/tutorial/05_simple_atomics/Makefile b/packages/kokkos/example/tutorial/05_simple_atomics/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..29f6d9b8fb1acb7fdf5504b76c89fba6a762162e --- /dev/null +++ b/packages/kokkos/example/tutorial/05_simple_atomics/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/05_simple_atomics/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 05_simple_atomics.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 05_simple_atomics.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp b/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp new file mode 100644 index 0000000000000000000000000000000000000000..32b18e4d2047c5f3dcc23614109f4440b4686549 --- /dev/null +++ b/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp @@ -0,0 +1,137 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> +#include <cstdlib> +#include <cmath> + +// Type of a one-dimensional length-N array of int. +using view_type = Kokkos::View<int*>; +using host_view_type = view_type::HostMirror; +// This is a "zero-dimensional" View, that is, a View of a single +// value (an int, in this case). Access the value using operator() +// with no arguments: e.g., 'count()'. 
+// +// Zero-dimensional Views are useful for reduction results that stay +// resident in device memory, as well as for irregularly updated +// shared state. We use it for the latter in this example. +using count_type = Kokkos::View<int>; +using host_count_type = count_type::HostMirror; + +// Functor for finding a list of primes in a given set of numbers. If +// run in parallel, the order of results is nondeterministic, because +// hardware atomic updates do not guarantee an order of execution. +struct findprimes { + view_type data; + view_type result; + count_type count; + + findprimes(view_type data_, view_type result_, count_type count_) + : data(data_), result(result_), count(count_) {} + + // Test if data(i) is prime. If it is, increment the count of + // primes (stored in the zero-dimensional View 'count') and add the + // value to the current list of primes 'result'. + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + const int number = data(i); // the current number + + // Test all numbers from 3 to ceiling(sqrt(data(i))), to see if + // they are factors of data(i). It's not the most efficient prime + // test, but it works. + const int upper_bound = std::sqrt(1.0 * number) + 1; + bool is_prime = !(number % 2 == 0); + int k = 3; + while (k < upper_bound && is_prime) { + is_prime = !(number % k == 0); + k += 2; // don't have to test even numbers + } + + if (is_prime) { + // Use an atomic update both to update the current count of + // primes, and to find a place in the current list of primes for + // the new result. + // + // atomic_fetch_add returns the _current_ count, but increments + // it (by 1 in this case). The current count of primes indexes + // into the first unoccupied position of the 'result' array. 
+ const int idx = Kokkos::atomic_fetch_add(&count(), 1); + result(idx) = number; + } + } +}; + +int main() { + Kokkos::initialize(); + + { + srand(61391); // Set the random seed + + int nnumbers = 100000; + view_type data("RND", nnumbers); + view_type result("Prime", nnumbers); + count_type count("Count"); + + host_view_type h_data = Kokkos::create_mirror_view(data); + host_view_type h_result = Kokkos::create_mirror_view(result); + host_count_type h_count = Kokkos::create_mirror_view(count); + + using size_type = view_type::size_type; + // Fill the 'data' array on the host with random numbers. We assume + // that they come from some process which is only implemented on the + // host, via some library. (That's true in this case.) + for (size_type i = 0; i < static_cast<size_type>(data.extent(0)); ++i) { + h_data(i) = rand() % nnumbers; + } + Kokkos::deep_copy(data, h_data); // copy from host to device + + Kokkos::parallel_for(data.extent(0), findprimes(data, result, count)); + Kokkos::deep_copy(h_count, count); // copy from device to host + + printf("Found %i prime numbers in %i random numbers\n", h_count(), + nnumbers); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a6c3f6c27a3699d0715c8e5ab41448221432aaf --- /dev/null +++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt @@ -0,0 +1,9 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. 
+KOKKOS_ADD_EXECUTABLE( + tutorial_06_simple_mdrangepolicy + SOURCES simple_mdrangepolicy.cpp +) diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/Makefile b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7d3498ed1780281d3f0cbfa827e13fc2c09f1e57 --- /dev/null +++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/Makefile @@ -0,0 +1,48 @@ +KOKKOS_PATH = ../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/06_simple_mdrangepolicy/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 06_simple_mdrangepolicy.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 06_simple_mdrangepolicy.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07b99087d4c310e6cf0d82c026f52bd610dd0ecb --- /dev/null +++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp @@ -0,0 +1,213 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// +// MDRangePolicy example with parallel_for and parallel_reduce: +// 1. Start up Kokkos +// 2. Execute a parallel_for loop in the default execution space, +// using a functor to define the loop body +// 3. Shut down Kokkos +// +// Two examples are provided: +// Example 1: Rank 2 case with minimal default parameters and arguments used +// in the MDRangePolicy +// +// Example 2: Rank 3 case with additional outer/inner iterate pattern parameters +// and tile dims passed to the ctor + +// Simple functor for computing/storing the product of indices in a View v +template <class ViewType> +struct MDFunctor { + using value_type = long; + + ViewType v; + size_t size; + + MDFunctor(const ViewType& v_, const size_t size_) : v(v_), size(size_) {} + + // 2D case - used by parallel_for + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j) const { + v(i, j) = i * j; // compute the product of indices + } + + // 3D case - used by parallel_for + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k) const { + v(i, j, k) = i * j * k; // compute the product of indices + } + + // 2D case - reduction + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, value_type& incorrect_count) const { + if (v(i, j) != i * j) { + incorrect_count += 1; + } + } + + // 3D case - reduction + KOKKOS_INLINE_FUNCTION + void operator()(const int i, const int j, const int k, + value_type& incorrect_count) const { + if (v(i, j, k) != i * j * k) { + incorrect_count += 1; + } + } +}; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + + // Bound(s) for MDRangePolicy + const int n = 100; + + // ViewType aliases for Rank<2>, Rank<3> for example usage + using ScalarType = double; + using ViewType_2D = typename Kokkos::View<ScalarType**>; + using ViewType_3D = 
typename Kokkos::View<ScalarType***>; + + ///////////////////////////////////////////////////////////////////////////// + // Explanation of MDRangePolicy usage, template parameters, constructor + // arguments + // + // MDRangePolicy aliases for Rank<2>, Rank<3> cases + // Required template parameters: + // Kokkos::Rank<N>: where N=rank + // + // Optional template parameters to Rank<...>: + // Kokkos::Iterate::{Default,Left,Right}: Outer iteration pattern across + // tiles; + // defaults based on the execution space similar to Kokkos::Layout + // Kokkos::Iterate::{Default,Left,Right}: Inner iteration pattern within + // tiles; + // defaults based on the execution space similar to Kokkos::Layout + // + // e.g. using rank2ll = Rank<2, Iterate::Left, Iterate::Left>; + // + // + // Optional template parameters to MDRangePolicy: + // ExecutionSpace: Kokkos::Serial, Kokkos::OpenMP, Kokkos::Cuda, etc. + // + // Kokkos::IndexType< T >: where T = int, long, unsigned int, etc. + // + // struct Tag{}: A user-provided tag for tagging functor operators + // + // e.g. 1: MDRangePolicy< Kokkos::Serial, Rank<2, Iterate::Left, + // Iterate::Left>, IndexType<int>, Tag > mdpolicy; e.g. 2: MDRangePolicy< + // Kokkos::Serial, rank2ll, IndexType<int>, Tag > mdpolicy; + // + // + // Required arguments to ctor: + // {{ l0, l1, ... }}: Lower bounds, provided as Kokkos::Array or + // std::initializer_list + // {{ u0, u1, ... }}: Upper bounds, provided as Kokkos::Array or + // std::initializer_list + // + // Optional arguments to ctor: + // {{ t0, t1, ... }}: Tile dimensions, provided as Kokkos::Array or + // std::initializer_list + // defaults based on the execution space + // + // e.g. 
mdpolicy( {{0,0}}, {{u0,u1}}, {{t0,t1}}; + // + ///////////////////////////////////////////////////////////////////////////// + + // Example 1: + long incorrect_count_2d = 0; + { + // Rank<2> Case: Rank is provided, all other parameters are default + using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy< + Kokkos::Experimental::Rank<2> >; + + // Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims + // defaulted + MDPolicyType_2D mdpolicy_2d({{0, 0}}, {{n, n}}); + + // Construct a 2D view to store result of product of indices + ViewType_2D v2("v2", n, n); + + // Execute parallel_for with rank 2 MDRangePolicy + Kokkos::parallel_for("md2d", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n)); + + // Check results with a parallel_reduce using the MDRangePolicy + Kokkos::parallel_reduce("md2dredux", mdpolicy_2d, + MDFunctor<ViewType_2D>(v2, n), incorrect_count_2d); + + printf("Rank 2 MDRangePolicy incorrect count: %ld\n", + incorrect_count_2d); // should be 0 + } + + // Example 2: + long incorrect_count_3d = 0; + { + // Rank<3> Case: Rank, inner iterate pattern, outer iterate pattern provided + using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy< + Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left, + Kokkos::Experimental::Iterate::Left> >; + + // Construct 3D MDRangePolicy: lower, upper bounds, tile dims provided + MDPolicyType_3D mdpolicy_3d({{0, 0, 0}}, {{n, n, n}}, {{4, 4, 4}}); + + // Construct a 3D view to store result of product of indices + ViewType_3D v3("v3", n, n, n); + + // Execute parallel_for with rank 3 MDRangePolicy + Kokkos::parallel_for("md3d", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n)); + + // Check results with a parallel_reduce using the MDRangePolicy + Kokkos::parallel_reduce("md3dredux", mdpolicy_3d, + MDFunctor<ViewType_3D>(v3, n), incorrect_count_3d); + + printf("Rank 3 MDRangePolicy incorrect count: %ld\n", + incorrect_count_3d); // should be 0 + } + + Kokkos::finalize(); + + return 
(incorrect_count_2d == long(0) && incorrect_count_3d == long(0)) ? 0 + : -1; +} diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0db41bf451708c361eb5a0cdeda3413c8532c8e --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt @@ -0,0 +1,9 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_advancedviews_01_data_layouts + SOURCES data_layouts.cpp +) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8d7441d6ebbd19fed8fcc34209945c41b68d4685 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/01_data_layouts/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_data_layouts.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_data_layouts.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp new file mode 100644 index 0000000000000000000000000000000000000000..597d1e3056ece9ef5865a3fb79dfef09ccf50a6a --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp @@ -0,0 +1,177 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <cstdio> + +// These two View types are both 2-D arrays of double. However, they +// have different layouts in memory. left_type has "layout left," +// which means "column major," the same as in Fortran, the BLAS, or +// LAPACK. right_type has "layout right," which means "row major," +// the same as in C, C++, or Java. 
+using left_type = Kokkos::View<double**, Kokkos::LayoutLeft>; +using right_type = Kokkos::View<double**, Kokkos::LayoutRight>; +// This is a one-dimensional View, so the layout matters less. +// However, it still has a layout! Since its layout is not specified +// explicitly in the type, its layout is a function of the memory +// space. For example, the default Cuda layout is LayoutLeft, and the +// default Host layout is LayoutRight. +using view_type = Kokkos::View<double*>; + +// parallel_for functor that fills the given View with some data. It +// expects to access the View by rows in parallel: each call i of +// operator() accesses a row. +template <class ViewType> +struct init_view { + ViewType a; + init_view(ViewType a_) : a(a_) {} + + using size_type = typename ViewType::size_type; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename ViewType::size_type i) const { + // On CPUs this loop could be vectorized so j should do stride 1 + // access on a for optimal performance. I.e. a should be LayoutRight. + // On GPUs threads should do coalesced loads and stores. That means + // that i should be the stride one access for optimal performance. + for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) { + a(i, j) = 1.0 * a.extent(0) * i + 1.0 * j; + } + } +}; + +// Compute a contraction of v1 and v2 into a: +// +// a(i) := sum_j (v1(i,j) * v2(j,i)) +// +// Since the functor is templated on the ViewTypes itself it doesn't matter what +// there layouts are. That means you can use different layouts on different +// architectures. 
+template <class ViewType1, class ViewType2> +struct contraction { + view_type a; + typename ViewType1::const_type v1; + typename ViewType2::const_type v2; + contraction(view_type a_, ViewType1 v1_, ViewType2 v2_) + : a(a_), v1(v1_), v2(v2_) {} + + using size_type = typename view_type::size_type; + + // As with the initialization functor the performance of this operator + // depends on the architecture and the chosen data layouts. + // On CPUs optimal would be to vectorize the inner loop, so j should be the + // stride 1 access. That means v1 should be LayoutRight and v2 LayoutLeft. + // In order to get coalesced access on GPUs where i corresponds closely to + // the thread Index, i must be the stride 1 dimension. That means v1 should be + // LayoutLeft and v2 LayoutRight. + KOKKOS_INLINE_FUNCTION + void operator()(const view_type::size_type i) const { + for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) { + a(i) = v1(i, j) * v2(j, i); + } + } +}; + +// Compute a dot product. This is used for result verification. +struct dot { + view_type a; + dot(view_type a_) : a(a_) {} + using value_type = double; // Specify type for reduction target, lsum + KOKKOS_INLINE_FUNCTION + void operator()(const view_type::size_type i, double& lsum) const { + lsum += a(i) * a(i); + } +}; + +int main(int narg, char* arg[]) { + // When initializing Kokkos, you may pass in command-line arguments, + // just like with MPI_Init(). Kokkos reserves the right to remove + // arguments from the list that start with '--kokkos-'. + Kokkos::initialize(narg, arg); + + { + int size = 10000; + view_type a("A", size); + + // Define two views with LayoutLeft and LayoutRight. + left_type l("L", size, 10000); + right_type r("R", size, 10000); + + // Initialize the data in the views. 
+ Kokkos::parallel_for(size, init_view<left_type>(l)); + Kokkos::parallel_for(size, init_view<right_type>(r)); + Kokkos::fence(); + + // Measure time to execute the contraction kernel when giving it a + // LayoutLeft view for v1 and a LayoutRight view for v2. This should be + // fast on GPUs and slow on CPUs + Kokkos::Timer time1; + Kokkos::parallel_for(size, contraction<left_type, right_type>(a, l, r)); + Kokkos::fence(); + double sec1 = time1.seconds(); + + double sum1 = 0; + Kokkos::parallel_reduce(size, dot(a), sum1); + Kokkos::fence(); + + // Measure time to execute the contraction kernel when giving it a + // LayoutRight view for v1 and a LayoutLeft view for v2. This should be + // fast on CPUs and slow on GPUs + Kokkos::Timer time2; + Kokkos::parallel_for(size, contraction<right_type, left_type>(a, r, l)); + Kokkos::fence(); + double sec2 = time2.seconds(); + + double sum2 = 0; + Kokkos::parallel_reduce(size, dot(a), sum2); + + // Kokkos' reductions are deterministic. + // The results should always be equal. + printf("Result Left/Right %f Right/Left %f (equal result: %i)\n", sec1, + sec2, sum2 == sum1); + } + + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e50968b4bf2d856b0a1e4b1122929db5aab5326 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt @@ -0,0 +1,9 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. 
+KOKKOS_ADD_EXECUTABLE( + tutorial_advancedviews_02_memory_traits + SOURCES memory_traits.cpp +) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..bac40cb26ca602d6d0d3e178b5f415ab56b977ff --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/02_memory_traits/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_memory_traits.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_memory_traits.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp new file mode 100644 index 0000000000000000000000000000000000000000..00bfeea36b972e6ea08ab8c82ec5aaca1a4e2af5 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp @@ -0,0 +1,148 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <cstdio> +#include <cstdlib> + +using view_type = Kokkos::View<double*>; +// Kokkos::Views have an MemoryTraits template parameter which +// allows users to specify usage scenarios of a View. +// Some of those act simply as hints, which can be used to insert +// optimal load and store paths, others change the symantics of the +// access. The trait Kokkos::Atomic is one of the latter. A view with +// that MemoryTrait will perform any access atomicly (read, write, update). +// +// In this example we use a view with a usage hint for RandomAccess. +// Kokkos::RandomAccess means that we expect to use this view +// with indirect indexing. +// +// In CUDA, RandomAccess allows accesses through the texture +// cache. This only works if the View is read-only, which we enforce +// through the first template parameter. +// +// Note that we are still talking about views of the data, its not a new +// allocation. For example you can have an atomic view of a default view. While +// you even could use both in the same kernel, this could lead to undefined +// behaviour because one of your access paths is not atomic. Think of it in the +// same way as you think of pointers to const data and pointers to non-const +// data (i.e. const double* and double*). While these pointers can point to the +// same data you should not use them together if that brakes the const guarantee +// of the first pointer. +using view_type_rnd = + Kokkos::View<const double*, Kokkos::MemoryTraits<Kokkos::RandomAccess> >; +using idx_type = Kokkos::View<int**>; +using idx_type_host = idx_type::HostMirror; + +// We template this functor on the ViewTypes to show the effect of the +// RandomAccess trait. 
+template <class DestType, class SrcType> +struct localsum { + idx_type::const_type idx; + DestType dest; + SrcType src; + localsum(idx_type idx_, DestType dest_, SrcType src_) + : idx(idx_), dest(dest_), src(src_) {} + + // Calculate a local sum of values + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + double tmp = 0.0; + for (int j = 0; j < (int)idx.extent(1); ++j) { + // This is an indirect access on src + const double val = src(idx(i, j)); + tmp += val * val + 0.5 * (idx.extent(0) * val - idx.extent(1) * val); + } + dest(i) = tmp; + } +}; + +int main(int narg, char* arg[]) { + Kokkos::initialize(narg, arg); + + { + int size = 1000000; + + idx_type idx("Idx", size, 64); + idx_type_host h_idx = Kokkos::create_mirror_view(idx); + + view_type dest("Dest", size); + view_type src("Src", size); + + srand(134231); + + using size_type = view_type::size_type; + for (int i = 0; i < size; i++) { + for (size_type j = 0; j < static_cast<size_type>(h_idx.extent(1)); ++j) { + h_idx(i, j) = (size + i + (rand() % 500 - 250)) % size; + } + } + + // Deep copy the initial data to the device + Kokkos::deep_copy(idx, h_idx); + // Run the first kernel to warmup caches + Kokkos::parallel_for(size, + localsum<view_type, view_type_rnd>(idx, dest, src)); + Kokkos::fence(); + + // Run the localsum functor using the RandomAccess trait. On CPUs there + // should not be any different in performance to not using the RandomAccess + // trait. 
On GPUs where can be a dramatic difference + Kokkos::Timer time1; + Kokkos::parallel_for(size, + localsum<view_type, view_type_rnd>(idx, dest, src)); + Kokkos::fence(); + double sec1 = time1.seconds(); + + Kokkos::Timer time2; + Kokkos::parallel_for(size, localsum<view_type, view_type>(idx, dest, src)); + Kokkos::fence(); + double sec2 = time2.seconds(); + + printf("Time with Trait RandomAccess: %f with Plain: %f \n", sec1, sec2); + } + + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..90270740c3c49927221ebca87a7c85f16afea074 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt @@ -0,0 +1,9 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_advancedviews_03_subviews + SOURCES subviews.cpp +) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9eb948932e141d7c3f34cc52efb7831e4d447eb5 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/03_subviews/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_subviews.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_subviews.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp new file mode 100644 index 0000000000000000000000000000000000000000..20e5c5a284f415e7627fd07df20ffbe5856f3428 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp @@ -0,0 +1,195 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +// This example simulates one timestep of an explicit +// finite-difference discretization of a time-dependent partial +// differential equation (PDE). It shows how to take subviews of the +// mesh in order to represent particular boundaries or the interior of +// the mesh. + +#include <Kokkos_Core.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <cstdio> + +using mesh_type = Kokkos::View<double***, Kokkos::LayoutRight>; + +// These View types represent subviews of the mesh. 
Some of the Views +// have layout LayoutStride, meaning that they have run-time "strides" +// in each dimension which may differ from that dimension. For +// example, inner_mesh_type (which represents the interior of the +// mesh) has to skip over the boundaries when computing its stride; +// the dimensions of the interior mesh differ from these strides. You +// may safely always use a LayoutStride layout when taking a subview +// of a LayoutRight or LayoutLeft subview, but strided accesses may +// cost a bit more, especially for 1-D Views. +using xz_plane_type = Kokkos::View<double**, Kokkos::LayoutStride>; +using yz_plane_type = Kokkos::View<double**, Kokkos::LayoutRight>; +using xy_plane_type = Kokkos::View<double**, Kokkos::LayoutStride>; +using inner_mesh_type = Kokkos::View<double***, Kokkos::LayoutStride>; + +// Functor to set all entries of a boundary of the mesh to a constant +// value. The functor is templated on ViewType because different +// boundaries may have different layouts. +template <class ViewType> +struct set_boundary { + ViewType a; + double value; + + set_boundary(ViewType a_, double value_) : a(a_), value(value_) {} + + using size_type = typename ViewType::size_type; + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type i) const { + for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) { + a(i, j) = value; + } + } +}; + +// Functor to set all entries of a boundary of the mesh to a constant +// value. The functor is templated on ViewType because different +// boundaries may have different layouts. 
+template <class ViewType> +struct set_inner { + ViewType a; + double value; + + set_inner(ViewType a_, double value_) : a(a_), value(value_) {} + + using size_type = typename ViewType::size_type; + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type i) const { + for (size_type j = 0; j < static_cast<size_type>(a.extent(1)); ++j) { + for (size_type k = 0; k < static_cast<size_type>(a.extent(2)); ++k) { + a(i, j, k) = value; + } + } + } +}; + +// Update the interior of the mesh. This simulates one timestep of a +// finite-difference method. +template <class ViewType> +struct update { + ViewType a; + const double dt; + + update(ViewType a_, const double dt_) : a(a_), dt(dt_) {} + + using size_type = typename ViewType::size_type; + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const { + i++; + for (size_type j = 1; j < static_cast<size_type>(a.extent(1) - 1); j++) { + for (size_type k = 1; k < static_cast<size_type>(a.extent(2) - 1); k++) { + a(i, j, k) += dt * (a(i, j, k + 1) - a(i, j, k - 1) + a(i, j + 1, k) - + a(i, j - 1, k) + a(i + 1, j, k) - a(i - 1, j, k)); + } + } + } +}; + +int main(int narg, char* arg[]) { + using Kokkos::ALL; + using Kokkos::pair; + using Kokkos::parallel_for; + using Kokkos::subview; + using size_type = mesh_type::size_type; + + Kokkos::initialize(narg, arg); + + { + // The number of mesh points along each dimension of the mesh, not + // including boundaries. + const size_type size = 100; + + // A is the full cubic 3-D mesh, including the boundaries. + mesh_type A("A", size + 2, size + 2, size + 2); + // Ai is the "inner" part of A, _not_ including the boundaries. + // + // A pair of indices in a particular dimension means the contiguous + // zero-based index range in that dimension, including the first + // entry of the pair but _not_ including the second entry. 
+ inner_mesh_type Ai = subview(A, pair<size_type, size_type>(1, size + 1), + pair<size_type, size_type>(1, size + 1), + pair<size_type, size_type>(1, size + 1)); + // A has six boundaries, one for each face of the cube. + // Create a View of each of these boundaries. + // ALL() means "select all indices in that dimension." + xy_plane_type Zneg_halo = subview(A, ALL(), ALL(), 0); + xy_plane_type Zpos_halo = subview(A, ALL(), ALL(), 101); + xz_plane_type Yneg_halo = subview(A, ALL(), 0, ALL()); + xz_plane_type Ypos_halo = subview(A, ALL(), 101, ALL()); + yz_plane_type Xneg_halo = subview(A, 0, ALL(), ALL()); + yz_plane_type Xpos_halo = subview(A, 101, ALL(), ALL()); + + // Set the boundaries to their initial conditions. + parallel_for(Zneg_halo.extent(0), + set_boundary<xy_plane_type>(Zneg_halo, 1)); + parallel_for(Zpos_halo.extent(0), + set_boundary<xy_plane_type>(Zpos_halo, -1)); + parallel_for(Yneg_halo.extent(0), + set_boundary<xz_plane_type>(Yneg_halo, 2)); + parallel_for(Ypos_halo.extent(0), + set_boundary<xz_plane_type>(Ypos_halo, -2)); + parallel_for(Xneg_halo.extent(0), + set_boundary<yz_plane_type>(Xneg_halo, 3)); + parallel_for(Xpos_halo.extent(0), + set_boundary<yz_plane_type>(Xpos_halo, -3)); + + // Set the interior of the mesh to its initial condition. + parallel_for(Ai.extent(0), set_inner<inner_mesh_type>(Ai, 0)); + + // Update the interior of the mesh. + // This simulates one timestep with dt = 0.1. 
+ parallel_for(Ai.extent(0), update<mesh_type>(A, 0.1)); + + Kokkos::fence(); + printf("Done\n"); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f9b9225d2114fa52141322993297a8ac92713c8 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt @@ -0,0 +1,9 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_advancedviews_04_dualviews + SOURCES dual_view.cpp +) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f9db021de5b98c9d54dfc0d1a4510af92a96cee3 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/04_dualviews/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 04_dualviews.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 04_dualviews.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3c0fcd085c7c2afe29a328dfa3f574ab9ac81276 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp @@ -0,0 +1,226 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <Kokkos_DualView.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <cstdio> +#include <cstdlib> + +// DualView helps you manage data and computations that take place on +// two different memory spaces. Examples include CUDA device memory +// and (CPU) host memory (currently implemented), or Intel Knights +// Landing MCDRAM and DRAM (not yet implemented). 
For example, if you +// have ported only some parts of your application to run in CUDA, +// DualView can help manage moving data between the parts of your +// application that work best with CUDA, and the parts that work +// better on the CPU. +// +// A DualView takes the same template parameters as a View, but +// contains two Views: One that lives in the DualView's memory space, +// and one that lives in that memory space's host mirror space. If +// both memory spaces are the same, then the two Views just alias one +// another. This means that you can use DualView all the time, even +// when not running in a memory space like CUDA. DualView's +// operations to help you manage memory take almost no time in that +// case. This makes your code even more performance portable. + +using view_type = Kokkos::DualView<double*>; +using idx_type = Kokkos::DualView<int**>; + +template <class ExecutionSpace> +struct localsum { + // If the functor has a public 'execution_space' alias, that defines + // the functor's execution space (where it runs in parallel). This + // overrides Kokkos' default execution space. + using execution_space = ExecutionSpace; + + using memory_space = typename Kokkos::Impl::if_c< + std::is_same<ExecutionSpace, Kokkos::DefaultExecutionSpace>::value, + idx_type::memory_space, idx_type::host_mirror_space>::type; + + // Get the view types on the particular device for which the functor + // is instantiated. + // + // "const_data_type" is an alias in View (and DualView) which is + // the const version of the first template parameter of the View. + // For example, the const_data_type version of double** is const + // double**. + Kokkos::View<idx_type::const_data_type, idx_type::array_layout, memory_space> + idx; + // "scalar_array_type" is an alias in ViewTraits (and DualView) which is the + // array version of the value(s) stored in the View. 
+ Kokkos::View<view_type::scalar_array_type, view_type::array_layout, + memory_space> + dest; + Kokkos::View<view_type::const_data_type, view_type::array_layout, + memory_space, Kokkos::MemoryRandomAccess> + src; + + // Constructor takes DualViews, synchronizes them to the device, + // then marks them as modified on the device. + localsum(idx_type dv_idx, view_type dv_dest, view_type dv_src) { + // Extract the view on the correct Device (i.e., the correct + // memory space) from the DualView. DualView has a template + // method, view(), which is templated on the memory space. If the + // DualView has a View from that memory space, view() returns the + // View in that space. + idx = dv_idx.view<memory_space>(); + dest = dv_dest.template view<memory_space>(); + src = dv_src.template view<memory_space>(); + + // Synchronize the DualView to the correct Device. + // + // DualView's sync() method is templated on a memory space, and + // synchronizes the DualView in a one-way fashion to that memory + // space. "Synchronizing" means copying, from the other memory + // space to the Device memory space. sync() does _nothing_ if the + // Views on the two memory spaces are in sync. DualView + // determines this by the user manually marking one side or the + // other as modified; see the modify() call below. + + dv_idx.sync<memory_space>(); + dv_dest.template sync<memory_space>(); + dv_src.template sync<memory_space>(); + + // Mark dest as modified on Device. 
+ dv_dest.template modify<memory_space>(); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + double tmp = 0.0; + for (int j = 0; j < (int)idx.extent(1); ++j) { + const double val = src(idx(i, j)); + tmp += val * val + 0.5 * (idx.extent(0) * val - idx.extent(1) * val); + } + dest(i) += tmp; + } +}; + +class ParticleType { + public: + double q; + double m; + double q_over_m; + KOKKOS_INLINE_FUNCTION + ParticleType(double q_ = -1, double m_ = 1) : q(q_), m(m_), q_over_m(q / m) {} + + protected: +}; + +using ParticleTypes = Kokkos::DualView<ParticleType[10]>; +int main(int narg, char* arg[]) { + Kokkos::initialize(narg, arg); + + // If View is non-trivial constructible type then add braces so it is out of + // scope before Kokkos::finalize() call + { + ParticleTypes test("Test"); + Kokkos::fence(); + test.h_view(0) = ParticleType(-1e4, 1); + Kokkos::fence(); + + int size = 1000000; + + // Create DualViews. This will allocate on both the device and its + // host_mirror_device. + idx_type idx("Idx", size, 64); + view_type dest("Dest", size); + view_type src("Src", size); + + srand(134231); + + // Get a reference to the host view of idx directly (equivalent to + // idx.view<idx_type::host_mirror_space>() ) + idx_type::t_host h_idx = idx.h_view; + using size_type = view_type::size_type; + for (int i = 0; i < size; ++i) { + for (size_type j = 0; j < static_cast<size_type>(h_idx.extent(1)); ++j) { + h_idx(i, j) = (size + i + (rand() % 500 - 250)) % size; + } + } + + // Mark idx as modified on the host_mirror_space so that a + // sync to the device will actually move data. The sync happens in + // the functor's constructor. + idx.modify<idx_type::host_mirror_space>(); + + // Run on the device. This will cause a sync of idx to the device, + // since it was marked as modified on the host. 
+ Kokkos::Timer timer; + Kokkos::parallel_for(size, + localsum<view_type::execution_space>(idx, dest, src)); + Kokkos::fence(); + double sec1_dev = timer.seconds(); + + timer.reset(); + Kokkos::parallel_for(size, + localsum<view_type::execution_space>(idx, dest, src)); + Kokkos::fence(); + double sec2_dev = timer.seconds(); + + // Run on the host's default execution space (could be the same as device). + // This will cause a sync back to the host of dest. Note that if the Device + // is CUDA, the data layout will not be optimal on host, so performance is + // lower than what it would be for a pure host compilation. + timer.reset(); + Kokkos::parallel_for( + size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src)); + Kokkos::fence(); + double sec1_host = timer.seconds(); + + timer.reset(); + Kokkos::parallel_for( + size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src)); + Kokkos::fence(); + double sec2_host = timer.seconds(); + + printf("Device Time with Sync: %f without Sync: %f \n", sec1_dev, sec2_dev); + printf("Host Time with Sync: %f without Sync: %f \n", sec1_host, + sec2_host); + } + + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e9af9872c907256f37de501e6c5783fe8074647 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt @@ -0,0 +1,11 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +IF (Kokkos_ENABLE_CUDA_UVM) +# This is a tutorial, not a test, so we don't ask CTest to run it. 
+KOKKOS_ADD_EXECUTABLE( + tutorial_advancedviews_05_nvidia_uvm + SOURCES uvm_example.cpp +) +ENDIF () diff --git a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ffd81843041e169cdea6f719190ab42ad0f261bc --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile @@ -0,0 +1,48 @@ +KOKKOS_PATH = ../../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 05_NVIDIA_UVM.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 05_NVIDIA_UVM.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a906ba1447283f3a5b2517e1f6c21839b458b597 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp @@ -0,0 +1,149 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <Kokkos_DualView.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <cstdio> +#include <cstdlib> + +#ifdef KOKKOS_ENABLE_CUDA +using view_type = Kokkos::View<double*, Kokkos::CudaUVMSpace>; +using idx_type = Kokkos::View<int**, Kokkos::CudaUVMSpace>; +#else +using view_type = Kokkos::View<double*, Kokkos::HostSpace>; +using idx_type = Kokkos::View<int**, Kokkos::HostSpace>; +#endif + +template <class Device> +struct localsum { + // Define the execution space for the functor (overrides the + // DefaultExecutionSpace) + using execution_space = Device; + + // Get the view types on the particular device the functor is instantiated for + idx_type::const_type idx; + view_type dest; + Kokkos::View<view_type::const_data_type, view_type::array_layout, + view_type::device_type, Kokkos::MemoryRandomAccess> + src; + + localsum(idx_type idx_, view_type dest_, view_type src_) + : idx(idx_), dest(dest_), src(src_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + double tmp = 0.0; + for (int j = 0; j < int(idx.extent(1)); j++) { + const double val = src(idx(i, j)); + tmp += val * val + 0.5 * (idx.extent(0) * val - idx.extent(1) * val); + } + dest(i) += tmp; + } +}; + +int main(int narg, char* arg[]) { + Kokkos::initialize(narg, arg); + + { + int size = 1000000; + + // Create Views + idx_type idx("Idx", size, 64); + view_type dest("Dest", size); + view_type src("Src", size); + + srand(134231); + + Kokkos::fence(); + + // When using UVM Cuda views can be accessed on the Host directly + for (int i = 0; i < size; i++) { + for (int j = 0; j < int(idx.extent(1)); j++) + idx(i, j) = (size + i + (rand() % 500 - 250)) % size; + } + + Kokkos::fence(); + // Run on the device + // This will cause a sync of idx to the device since it was modified on the + // host + Kokkos::Timer timer; + Kokkos::parallel_for(size, + 
localsum<view_type::execution_space>(idx, dest, src)); + Kokkos::fence(); + double sec1_dev = timer.seconds(); + + // No data transfer will happen now, since nothing is accessed on the host + timer.reset(); + Kokkos::parallel_for(size, + localsum<view_type::execution_space>(idx, dest, src)); + Kokkos::fence(); + double sec2_dev = timer.seconds(); + + // Run on the host + // This will cause a sync back to the host of dest which was changed on the + // device Compare runtime here with the dual_view example: dest will be + // copied back in 4k blocks when they are accessed the first time during the + // parallel_for. Due to the latency of a memcpy this gives lower effective + // bandwidth when doing a manual copy via dual views + timer.reset(); + Kokkos::parallel_for( + size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src)); + Kokkos::fence(); + double sec1_host = timer.seconds(); + + // No data transfers will happen now + timer.reset(); + Kokkos::parallel_for( + size, localsum<Kokkos::HostSpace::execution_space>(idx, dest, src)); + Kokkos::fence(); + double sec2_host = timer.seconds(); + + printf("Device Time with Sync: %e without Sync: %e \n", sec1_dev, sec2_dev); + printf("Host Time with Sync: %e without Sync: %e \n", sec1_host, + sec2_host); + } + + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..37b2fe2578016c4df410b12f45d4576ffea46d1d --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 06_AtomicViews.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 06_AtomicViews.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8983b46d600eb62b19e6f6b6e661212106f1c509 --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile @@ -0,0 +1,48 @@ +KOKKOS_PATH = ../../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 --default-stream per-thread +LINK = ${CXX} +LDFLAGS = +EXE = 07_Overlapping_DeepCopy.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 07_Overlapping_DeepCopy.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c582fa17043629bd65b253e6afabd76134f1817b --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp @@ -0,0 +1,149 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> +#include <typeinfo> +#include <cmath> +#include <impl/Kokkos_Timer.hpp> + +struct FillDevice { + double value; + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> a; + FillDevice( + const double& val, + const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_a) + : value(val), a(d_a) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { a(i) = value; } +}; + +struct ComputeADevice { + int iter; + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> a; + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> b; + ComputeADevice( + const int& iter_, + const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_a, + const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_b) + : iter(iter_), a(d_a), b(d_b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { + for (int j = 1; j < iter; j++) { + a(i) += std::pow(b(i), 1.0 + 1.0 / iter); + } + } +}; + +struct ComputeAHost { + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> a; + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> b; + ComputeAHost(const Kokkos::View<double*, Kokkos::LayoutLeft, + Kokkos::CudaHostPinnedSpace>& d_a, + const Kokkos::View<double*, Kokkos::LayoutLeft, + Kokkos::CudaHostPinnedSpace>& d_b) + : a(d_a), b(d_b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) const { a(i) += b(i); } +}; + +struct MergeDevice { + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> a; + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> b; + MergeDevice( + const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_a, + const Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace>& d_b) + : a(d_a), b(d_b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int& i) 
const { a(i) += b(i); } +}; + +int main(int argc, char* argv[]) { + int size = 100000000; + Kokkos::initialize(); + int synch = std::stoi(argv[1]); + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> d_a("Device A", + size); + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> d_b("Device B", + size); + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaSpace> d_tmp( + "Device tmp", size); + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> h_a( + "Host A", size); + Kokkos::View<double*, Kokkos::LayoutLeft, Kokkos::CudaHostPinnedSpace> h_b( + "Host B", size); + + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size), + FillDevice(0.0, d_a)); + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size), + FillDevice(1.3513, d_b)); + Kokkos::fence(); + Kokkos::Timer timer; + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size), + ComputeADevice(20, d_a, d_b)); + + if (synch == 1) Kokkos::deep_copy(Kokkos::OpenMP(), h_b, d_b); + if (synch == 2) Kokkos::deep_copy(h_b, d_b); + + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0, size), + [=](const int& i) { h_a(i) = 0.0; }); + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0, size), + ComputeAHost(h_a, h_b)); + Kokkos::OpenMP().fence(); + if (synch == 1) Kokkos::deep_copy(Kokkos::OpenMP(), d_tmp, h_a); + if (synch == 2) Kokkos::deep_copy(d_tmp, h_a); + Kokkos::fence(); + + std::cout << "Time " << timer.seconds() << std::endl; + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, size), + MergeDevice(d_a, d_tmp)); + + Kokkos::deep_copy(h_a, d_a); + std::cout << "h_a(0): " << h_a(0) << " ( Correct: 27.4154 )" << std::endl; + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..11da617b8fa0bab71b80ffadae3d8f22b79dd510 --- /dev/null +++ 
b/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt @@ -0,0 +1,9 @@ + +KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_data_layouts) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_memory_traits) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_subviews) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(04_dualviews) + +IF (Kokkos_ENABLE_CUDA_UVM) + KOKKOS_ADD_EXAMPLE_DIRECTORIES(05_NVIDIA_UVM) +ENDIF () diff --git a/packages/kokkos/example/tutorial/Advanced_Views/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..12ac5652e5798c11f2285e4294fcc88ce771093e --- /dev/null +++ b/packages/kokkos/example/tutorial/Advanced_Views/Makefile @@ -0,0 +1,123 @@ +ifndef KOKKOS_PATH + MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) + KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))../../.. +endif + +ifndef KOKKOS_SETTINGS + KOKKOS_SETTINGS = "KOKKOS_PATH=${KOKKOS_PATH}" + ifdef KOKKOS_ARCH + KOKKOS_SETTINGS += "KOKKOS_ARCH=${KOKKOS_ARCH}" + endif + ifdef KOKKOS_DEVICES + KOKKOS_SETTINGS += "KOKKOS_DEVICES=${KOKKOS_DEVICES}" + endif + ifdef KOKKOS_OPTIONS + KOKKOS_SETTINGS += "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" + endif + ifdef KOKKOS_CUDA_OPTIONS + KOKKOS_SETTINGS += "KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPTIONS}" + endif +endif + +build: + mkdir -p 01_data_layouts + cd ./01_data_layouts; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + mkdir -p 02_memory_traits + cd ./02_memory_traits; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + mkdir -p 03_subviews + cd ./03_subviews; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + mkdir -p 04_dualviews + cd ./04_dualviews; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + mkdir -p 05_NVIDIA_UVM + cd ./05_NVIDIA_UVM; \ + $(MAKE) build -f 
${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + #mkdir -p 06_AtomicViews + #cd ./06_AtomicViews; \ + #$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #mkdir -p 07_Overlapping_DeepCopy + #cd ./07_Overlapping_DeepCopy; \ + #$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + +build-insource: + cd ./01_data_layouts; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./02_memory_traits; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./03_subviews; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./04_dualviews; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./05_NVIDIA_UVM; \ + $(MAKE) build ${KOKKOS_SETTINGS} + #cd ./06_AtomicViews; \ + #$(MAKE) build ${KOKKOS_SETTINGS} + #cd ./07_Overlapping_DeepCopy; \ + #$(MAKE) build ${KOKKOS_SETTINGS} + +test: + cd ./01_data_layouts; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + cd ./02_memory_traits; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + cd ./03_subviews; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + cd ./04_dualviews; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + cd ./05_NVIDIA_UVM; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + #cd ./06_AtomicViews; \ + #$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #cd ./07_Overlapping_DeepCopy; \ + #$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + +test-insource: + cd ./01_data_layouts; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./02_memory_traits; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd 
./03_subviews; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./04_dualviews; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./05_NVIDIA_UVM; \ + $(MAKE) test ${KOKKOS_SETTINGS} + #cd ./06_AtomicViews; \ + #$(MAKE) test ${KOKKOS_SETTINGS} + #cd ./07_Overlapping_DeepCopy; \ + #$(MAKE) test ${KOKKOS_SETTINGS} + +clean: + cd ./01_data_layouts; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + cd ./02_memory_traits; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + cd ./03_subviews; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + cd ./04_dualviews; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + cd ./05_NVIDIA_UVM; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + #cd ./06_AtomicViews; \ + #$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #cd ./07_Overlapping_DeepCopy; \ + #$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + +clean-insource: + cd ./01_data_layouts; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./02_memory_traits; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./03_subviews; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./04_dualviews; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./05_NVIDIA_UVM; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + #cd ./06_AtomicViews; \ + #$(MAKE) clean ${KOKKOS_SETTINGS} + #cd ./07_Overlapping_DeepCopy; \ + #$(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f6b1e42a8b821f865810e394faf45f0c1e167783 --- /dev/null +++ 
b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Algorithms/01_random_numbers/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_random_numbers.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_random_numbers.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9c5f2d62fc58b86cbdd723e3328cf0ba1e38df27 --- /dev/null +++ b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp @@ -0,0 +1,158 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <Kokkos_Random.hpp> +#include <Kokkos_DualView.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <cstdlib> + +using DefaultHostType = Kokkos::HostSpace::execution_space; + +// Kokkos provides two different random number generators with a 64 bit and a +// 1024 bit state. 
These generators are based on Vigna, Sebastiano (2014). "An +// experimental exploration of Marsaglia's xorshift generators, scrambled" See: +// http://arxiv.org/abs/1402.6246 The generators can be used fully independently +// on each thread and have been tested to produce good statistics for both inter +// and intra thread numbers. Note that within a kernel NO random number +// operations are (team) collective operations. Everything can be called within +// branches. This is a difference to the curand library where certain operations +// are required to be called by all threads in a block. +// +// In Kokkos you are required to create a pool of generator states, so that +// threads can grep their own. On CPU architectures the pool size is equal to +// the thread number, on CUDA about 128k states are generated (enough to give +// every potentially simultaneously running thread its own state). With a kernel +// a thread is required to acquire a state from the pool and later return it. On +// CPUs the Random number generator is deterministic if using the same number of +// threads. On GPUs (i.e. using the CUDA backend it is not deterministic because +// threads acquire states via atomics. 
+ +// A Functor for generating uint64_t random numbers templated on the +// GeneratorPool type +template <class GeneratorPool> +struct generate_random { + // Output View for the random numbers + Kokkos::View<uint64_t*> vals; + + // The GeneratorPool + GeneratorPool rand_pool; + + int samples; + + // Initialize all members + generate_random(Kokkos::View<uint64_t*> vals_, GeneratorPool rand_pool_, + int samples_) + : vals(vals_), rand_pool(rand_pool_), samples(samples_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i) const { + // Get a random number state from the pool for the active thread + typename GeneratorPool::generator_type rand_gen = rand_pool.get_state(); + + // Draw samples numbers from the pool as urand64 between 0 and + // rand_pool.MAX_URAND64 Note there are function calls to get other type of + // scalars, and also to specify Ranges or get a normal distributed float. + for (int k = 0; k < samples; k++) + vals(i * samples + k) = rand_gen.urand64(); + + // Give the state back, which will allow another thread to acquire it + rand_pool.free_state(rand_gen); + } +}; + +int main(int argc, char* args[]) { + if (argc != 3) { + printf("Please pass two integers on the command line\n"); + } else { + // Initialize Kokkos + Kokkos::initialize(argc, args); + int size = std::stoi(args[1]); + int samples = std::stoi(args[2]); + + // Create two random number generator pools one for 64bit states and one for + // 1024 bit states Both take an 64 bit unsigned integer seed to initialize a + // Random_XorShift64 generator which is used to fill the generators of the + // pool. 
+ Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857); + Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857); + Kokkos::DualView<uint64_t*> vals("Vals", size * samples); + + // Run some performance comparisons + Kokkos::Timer timer; + Kokkos::parallel_for(size, + generate_random<Kokkos::Random_XorShift64_Pool<> >( + vals.d_view, rand_pool64, samples)); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for(size, + generate_random<Kokkos::Random_XorShift64_Pool<> >( + vals.d_view, rand_pool64, samples)); + Kokkos::fence(); + double time_64 = timer.seconds(); + + Kokkos::parallel_for(size, + generate_random<Kokkos::Random_XorShift1024_Pool<> >( + vals.d_view, rand_pool1024, samples)); + Kokkos::fence(); + + timer.reset(); + Kokkos::parallel_for(size, + generate_random<Kokkos::Random_XorShift1024_Pool<> >( + vals.d_view, rand_pool1024, samples)); + Kokkos::fence(); + double time_1024 = timer.seconds(); + + printf("#Time XorShift64*: %e %e\n", time_64, + 1.0e-9 * samples * size / time_64); + printf("#Time XorShift1024*: %e %e\n", time_1024, + 1.0e-9 * samples * size / time_1024); + + Kokkos::deep_copy(vals.h_view, vals.d_view); + + Kokkos::finalize(); + } + return 0; +} diff --git a/packages/kokkos/example/tutorial/Algorithms/Makefile b/packages/kokkos/example/tutorial/Algorithms/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4e70ba7d976fe5e364049bd46eae3b7f2c9b1153 --- /dev/null +++ b/packages/kokkos/example/tutorial/Algorithms/Makefile @@ -0,0 +1,43 @@ +ifndef KOKKOS_PATH + MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) + KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))../../.. 
+endif + +ifndef KOKKOS_SETTINGS + KOKKOS_SETTINGS = "KOKKOS_PATH=${KOKKOS_PATH}" + ifdef KOKKOS_ARCH + KOKKOS_SETTINGS += "KOKKOS_ARCH=${KOKKOS_ARCH}" + endif + ifdef KOKKOS_DEVICES + KOKKOS_SETTINGS += "KOKKOS_DEVICES=${KOKKOS_DEVICES}" + endif + ifdef KOKKOS_OPTIONS + KOKKOS_SETTINGS += "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" + endif + ifdef KOKKOS_CUDA_OPTIONS + KOKKOS_SETTINGS += "KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPTIONS}" + endif +endif + +build: + mkdir -p 01_random_numbers + cd ./01_random_numbers; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + +build-insource: + cd ./01_random_numbers; \ + $(MAKE) build ${KOKKOS_SETTINGS} +test: + cd ./01_random_numbers; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + +test-insource: + cd ./01_random_numbers; \ + $(MAKE) test ${KOKKOS_SETTINGS} +clean: + cd ./01_random_numbers; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + +clean-insource: + cd ./01_random_numbers; \ + $(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/packages/kokkos/example/tutorial/CMakeLists.txt b/packages/kokkos/example/tutorial/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd471fa6bea712ebf64952c1eedfc8c20c597efa --- /dev/null +++ b/packages/kokkos/example/tutorial/CMakeLists.txt @@ -0,0 +1,16 @@ + +KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_hello_world) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_simple_view) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(04_simple_memoryspaces) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(05_simple_atomics) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(06_simple_mdrangepolicy) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(Advanced_Views) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(Hierarchical_Parallelism) + +KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_hello_world_lambda) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce_lambda) 
+KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_simple_view_lambda) + + + diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7cd6dea07fc86821911e265ee276aee331f97d5 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt @@ -0,0 +1,9 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_hierarchicalparallelism_01_thread_teams + SOURCES thread_teams.cpp +) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c0e7ca02275fa2db1595ff79156746576b1a5f8a --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_thread_teams.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_thread_teams.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp new file mode 100644 index 0000000000000000000000000000000000000000..735de65e056c84a5290105db39d5369a50f16ec7 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp @@ -0,0 +1,113 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// Using default execution space define a TeamPolicy and its member_type +// The member_type is what the operator of a functor or Lambda gets, for +// a simple RangePolicy the member_type is simply an integer +// For a TeamPolicy its a much richer object, since it provides all information +// to identify a thread uniquely and some team related function calls such as a +// barrier (which will be used in a subsequent example). 
+// A ThreadTeam consists of 1 to n threads where the maxmimum value of n is +// determined by the hardware. On a dual socket CPU machine with 8 cores per +// socket the maximum size of a team is 8. The number of teams (i.e. the +// league_size) is not limited by physical constraints. Its a pure logical +// number. + +using team_policy = Kokkos::TeamPolicy<>; +using team_member = team_policy::member_type; + +// Define a functor which can be launched using the TeamPolicy +struct hello_world { + using value_type = int; // Specify value type for reduction target, sum + + // This is a reduction operator which now takes as first argument the + // TeamPolicy member_type. Every member of the team contributes to the + // total sum. + // It is helpful to think of this operator as a parallel region for a team + // (i.e. every team member is active and will execute the code). + KOKKOS_INLINE_FUNCTION + void operator()(const team_member& thread, int& sum) const { + sum += 1; + // The TeamPolicy<>::member_type provides functions to query the multi + // dimensional index of a thread as well as the number of thread-teams and + // the size of each team. 
+#ifndef __SYCL_DEVICE_ONLY__ + // FIXME_SYCL needs printf workaround + printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), thread.team_size()); +#else + (void)thread; +#endif + } +}; + +int main(int narg, char* args[]) { + Kokkos::initialize(narg, args); + + // Launch 12 teams of the maximum number of threads per team + const int team_size_max = team_policy(1, 1).team_size_max( + hello_world(), Kokkos::ParallelReduceTag()); + const team_policy policy_a(12, team_size_max); + + int sum = 0; + Kokkos::parallel_reduce(policy_a, hello_world(), sum); + + // The result will be 12*team_size_max + printf("Result A: %i == %i\n", sum, team_size_max * 12); + + // In practice it is often better to let Kokkos decide on the team_size + const team_policy policy_b(12, Kokkos::AUTO); + + Kokkos::parallel_reduce(policy_b, hello_world(), sum); + // The result will be 12*policy_b.team_size_recommended( hello_world(), + // Kokkos::ParallelReduceTag()) + const int team_size_recommended = policy_b.team_size_recommended( + hello_world(), Kokkos::ParallelReduceTag()); + printf("Result B: %i %i\n", sum, team_size_recommended * 12); + + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c7f3853a014cfe71750060e6d077e1e4f0d777b --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt @@ -0,0 +1,10 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. 
+KOKKOS_ADD_EXECUTABLE( + tutorial_hierarchical_01_thread_teams_lambda + SOURCES thread_teams_lambda.cpp +) + diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f285692e9827c04cf3e59307202b3cc3b7c9da7d --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile @@ -0,0 +1,50 @@ +KOKKOS_PATH = ../../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_thread_teams_lambda.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_CUDA_OPTIONS += "enable_lambda" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_thread_teams_lambda.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..dcb1e0561bca8b096b528d61128f85c6254c221c --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp @@ -0,0 +1,105 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// Demonstrate a parallel reduction using thread teams (TeamPolicy). +// +// A thread team consists of 1 to n threads. The hardware determines +// the maxmimum value of n. On a dual-socket CPU machine with 8 cores +// per socket, the maximum size of a team is 8. The number of teams +// (the league_size) is not limited by physical constraints (up to +// some reasonable bound, which eventually depends upon the hardware +// and programming model implementation). + +int main(int narg, char* args[]) { + using Kokkos::parallel_reduce; + using team_policy = Kokkos::TeamPolicy<>; + using team_member = typename team_policy::member_type; + + Kokkos::initialize(narg, args); + + // Set up a policy that launches 12 teams, with the maximum number + // of threads per team. + + const team_policy policy(12, Kokkos::AUTO); + + // This is a reduction with a team policy. The team policy changes + // the first argument of the lambda. Rather than an integer index + // (as with RangePolicy), it's now TeamPolicy::member_type. This + // object provides all information to identify a thread uniquely. + // It also provides some team-related function calls such as a team + // barrier (which a subsequent example will use). 
+ // + // Every member of the team contributes to the total sum. It is + // helpful to think of the lambda's body as a "team parallel + // region." That is, every team member is active and will execute + // the body of the lambda. + int sum = 0; +// We also need to protect the usage of a lambda against compiling +// with a backend which doesn't support it (i.e. Cuda 6.5/7.0). +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + parallel_reduce( + policy, + KOKKOS_LAMBDA(const team_member& thread, int& lsum) { + lsum += 1; + // TeamPolicy<>::member_type provides functions to query the + // multidimensional index of a thread, as well as the number of + // thread teams and the size of each team. +#ifndef __SYCL_DEVICE_ONLY__ + // FIXME_SYCL needs workaround for printf + printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), thread.team_size()); +#else + (void)thread; +#endif + }, + sum); +#endif + // The result will be 12*team_policy::team_size_max([=]{}) + printf("Result %i\n", sum); + + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..92b701e4f430f1335c9f478ede4955b1b28626f6 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt @@ -0,0 +1,9 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. 
+KOKKOS_ADD_EXECUTABLE( + tutorial_hierarchicalparallelism_02_nested_parallel_for + SOURCES nested_parallel_for.cpp +) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4c0139554bd33777cdf27e04a60885440d73181d --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_nested_parallel_for.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_nested_parallel_for.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a528b71fe33f817b03dc32bacdbe8cd96271eab7 --- /dev/null 
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp @@ -0,0 +1,100 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// See 01_thread_teams for an explanation of a basic TeamPolicy +using team_policy = Kokkos::TeamPolicy<>; +using team_member = typename team_policy::member_type; + +struct hello_world { + using value_type = int; // Specify value type for reduction target, sum + KOKKOS_INLINE_FUNCTION + void operator()(const team_member& thread, int& sum) const { + sum += 1; + // When using the TeamPolicy Kokkos allows for nested parallel loops. + // All three Kokkos parallel patterns are allowed (for, reduce, scan) and + // they largely follow the same syntax as on the global level. The execution + // policy for the Thread level nesting (the Vector level is in the next + // tutorial example) is Kokkos::TeamThreadRange. This means the loop will be + // executed by all members of the team and the loop count will be split + // between threads of the team. Its arguments are the team_member, and a + // loop count. Not every thread will do the same amount of iterations. On a + // GPU for example with a team_size() larger than 31 only the first 31 + // threads would actually do anything. On a CPU with 8 threads 7 would + // execute 4 loop iterations, and 1 thread would do + // 3. Note also that the mode of splitting the count is architecture + // dependent similar to what the RangePolicy on a global level does. The + // call itself is not guaranteed to be synchronous. Also keep in mind that + // the operator using a team_policy acts like a parallel region for the + // team. That means that everything outside of the nested parallel_for is + // also executed by all threads of the team. 
+ Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31), + [&](const int& i) { +#ifndef __SYCL_DEVICE_ONLY__ + // FIXME_SYCL needs printf workaround + printf("Hello World: (%i , %i) executed loop %i \n", + thread.league_rank(), thread.team_rank(), i); +#else + (void) i; +#endif + }); + } +}; + +int main(int narg, char* args[]) { + Kokkos::initialize(narg, args); + + // Launch 3 teams of the maximum number of threads per team + const int team_size_max = team_policy(3, 1).team_size_max( + hello_world(), Kokkos::ParallelReduceTag()); + const team_policy policy(3, team_size_max); + + int sum = 0; + Kokkos::parallel_reduce(policy, hello_world(), sum); + printf("Result %i\n", sum); + + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3907d1666486036b512440ea745beb75165104e8 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt @@ -0,0 +1,10 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_hierarchicalparallelism_03_vectorization + SOURCES vectorization.cpp +) + diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..3093c272a5f336041da09bbd85b9e73e886c9f95 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_vectorization.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_vectorization.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f76110f086e4ca0e7b11d2fc998fc4354f7008e --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp @@ -0,0 +1,176 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <Kokkos_Random.hpp> +#include <cstdio> + +// The TeamPolicy actually supports 3D parallelism: Teams, Threads, Vector +// Kokkos::parallel_{for/reduce/scan} calls can be completely free nested. +// The execution policies for the nested layers are TeamThreadRange and +// ThreadVectorRange. +// The only restriction on nesting is that a given level can only be nested in a +// higher one. e.g. 
a ThreadVectorRange can be nested inside a TeamPolicy +// operator and inside a TeamThreadRange, but you can not nest a +// ThreadVectorRange or a TeamThreadRange inside another ThreadVectorRange. As +// with the 2D execution of TeamPolicy the operator has to be considered as a +// parallel region even with respect to VectorLanes. That means even outside a +// TeamThread or VectorThread loop all threads of a team and all vector lanes of +// a thread execute every line of the operator as long as there are no +// restrictions on them. Code lines can be restricted using Kokkos::single to +// either execute once PerThread or execute once PerTeam. +using team_member = typename Kokkos::TeamPolicy<>::member_type; + +struct SomeCorrelation { + using value_type = int;  // Specify value type for reduction target, sum + using shared_space = Kokkos::DefaultExecutionSpace::scratch_memory_space; + using shared_1d_int = + Kokkos::View<int*, shared_space, Kokkos::MemoryUnmanaged>; + + Kokkos::View<const int***, Kokkos::LayoutRight> data; + Kokkos::View<int> gsum; + + SomeCorrelation(Kokkos::View<int***, Kokkos::LayoutRight> data_in, + Kokkos::View<int> sum) + : data(data_in), gsum(sum) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member& thread) const { + int i = thread.league_rank(); + + // Allocate a shared array for the team. + shared_1d_int count(thread.team_shmem(), data.extent(1)); + + // With each team run a parallel_for with its threads + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, data.extent(1)), [=](const int& j) { + int tsum; + // Run a vector loop reduction over the inner dimension of data + // Count how many values are multiples of 4 + // Every vector lane gets the same reduction value (tsum) back, it is + // broadcast to all vector lanes + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(thread, data.extent(2)), + [=](const int& k, int& vsum) { + vsum += (data(i, j, k) % 4 == 0) ? 
1 : 0; + }, + tsum); + + // Make sure only one vector lane adds the reduction value to the + // shared array, i.e. execute the next line only once PerThread + Kokkos::single(Kokkos::PerThread(thread), [=]() { count(j) = tsum; }); + }); + + // Wait for all threads to finish the parallel_for so that all shared memory + // writes are done + thread.team_barrier(); + + // Check with one vector lane from each thread how many consecutive + // data segments have the same number of values divisible by 4 + // The team reduction value is again broadcast to every team member (and + // every vector lane) + int team_sum = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(thread, data.extent(1) - 1), + [=](const int& j, int& thread_sum) { + // It is not valid to directly add to thread_sum + // Use a single function with broadcast instead + // team_sum will be used as input to the operator (i.e. it is used to + // initialize sum) the end value of sum will be broadcast to all + // vector lanes in the thread. + Kokkos::single( + Kokkos::PerThread(thread), + [=](int& sum) { + if (count(j) == count(j + 1)) sum++; + }, + thread_sum); + }, + team_sum); + + // Add with one thread and vectorlane of the team the team_sum to the global + // value + Kokkos::single(Kokkos::PerTeam(thread), + [=]() { Kokkos::atomic_add(&gsum(), team_sum); }); + } + + // The functor needs to define how much shared memory it requests given a + // team_size. 
+ size_t team_shmem_size(int /*team_size*/) const { + return shared_1d_int::shmem_size(data.extent(1)); + } +}; + +int main(int narg, char* args[]) { + Kokkos::initialize(narg, args); + + { + // Produce some 3D random data (see Algorithms/01_random_numbers for more + // info) + Kokkos::View<int***, Kokkos::LayoutRight> data("Data", 512, 512, 32); + Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857); + Kokkos::fill_random(data, rand_pool64, 100); + + // A global value to put the result in + Kokkos::View<int> gsum("Sum"); + + // Each team handles a slice of the data + // Set up TeamPolicy with 512 teams with maximum number of threads per team + // and 16 vector lanes. Kokkos::AUTO will determine the number of threads + // The maximum vector length is hardware dependent but can always be smaller + // than the hardware allows. The vector length must be a power of 2. + + const Kokkos::TeamPolicy<> policy(512, Kokkos::AUTO, 16); + + Kokkos::parallel_for(policy, SomeCorrelation(data, gsum)); + + Kokkos::fence(); + + // Copy result value back + int sum = 0; + Kokkos::deep_copy(sum, gsum); + printf("Result %i\n", sum); + } + + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2f83a25eab38a0ed6386f97bf6efec915230593 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt @@ -0,0 +1,10 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. 
+KOKKOS_ADD_EXECUTABLE( + tutorial_hierarchicalparallelism_04_team_scan + SOURCES team_scan.cpp +) + diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f30670db72e15f6659b7eaeff0b84fee7b928075 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile @@ -0,0 +1,49 @@ +KOKKOS_PATH = ../../../.. +KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 04_team_scan.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 04_team_scan.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d36010892597bbcc9d1be710cae06574e7410ba7 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp @@ -0,0 +1,152 @@ +/* 
+//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <Kokkos_DualView.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <cstdio> +#include <cstdlib> + +using Device = Kokkos::DefaultExecutionSpace; +using Host = Kokkos::HostSpace::execution_space; + +using team_policy = Kokkos::TeamPolicy<Device>; +using team_member = team_policy::member_type; + +static const int TEAM_SIZE = 16; + +struct find_2_tuples { + int chunk_size; + Kokkos::View<const int*> data; + Kokkos::View<int**> histogram; + + find_2_tuples(int chunk_size_, Kokkos::DualView<int*> data_, + Kokkos::DualView<int**> histogram_) + : chunk_size(chunk_size_), + data(data_.d_view), + histogram(histogram_.d_view) { + data_.sync<Device>(); + histogram_.sync<Device>(); + histogram_.modify<Device>(); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member& dev) const { + Kokkos::View<int**, Kokkos::MemoryUnmanaged> l_histogram( + dev.team_shmem(), TEAM_SIZE, TEAM_SIZE); + Kokkos::View<int*, Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(), + chunk_size + 1); + + const int i = dev.league_rank() * chunk_size; + for (int j = dev.team_rank(); j < chunk_size + 1; j += dev.team_size()) + l_data(j) = data(i + j); + + for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size()) + for (int l = 0; l < TEAM_SIZE; l++) l_histogram(k, l) = 0; + dev.team_barrier(); + + for (int j = 0; j < chunk_size; j++) { + for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size()) + for (int l = 0; l < TEAM_SIZE; l++) { + if ((l_data(j) == k) && (l_data(j + 1) == l)) l_histogram(k, l)++; + } + } + + for (int k = dev.team_rank(); k < TEAM_SIZE; k += dev.team_size()) + for (int l = 0; l < TEAM_SIZE; l++) { + Kokkos::atomic_fetch_add(&histogram(k, l), l_histogram(k, l)); + } + dev.team_barrier(); + } + size_t team_shmem_size(int team_size) const { + return Kokkos::View<int**, 
Kokkos::MemoryUnmanaged>::shmem_size(TEAM_SIZE, + TEAM_SIZE) + + Kokkos::View<int*, Kokkos::MemoryUnmanaged>::shmem_size(chunk_size + + 1); + } +}; + +int main(int narg, char* args[]) { + Kokkos::initialize(narg, args); + + { + int chunk_size = 1024; + int nchunks = 100000; // 1024*1024; + Kokkos::DualView<int*> data("data", nchunks * chunk_size + 1); + + srand(1231093); + + for (int i = 0; i < (int)data.extent(0); i++) { + data.h_view(i) = rand() % TEAM_SIZE; + } + data.modify<Host>(); + data.sync<Device>(); + + Kokkos::DualView<int**> histogram("histogram", TEAM_SIZE, TEAM_SIZE); + + Kokkos::Timer timer; + // threads/team is automatically limited to maximum supported by the device. + int team_size = TEAM_SIZE; + if (team_size > Device::execution_space::concurrency()) + team_size = Device::execution_space::concurrency(); + Kokkos::parallel_for(team_policy(nchunks, team_size), + find_2_tuples(chunk_size, data, histogram)); + Kokkos::fence(); + double time = timer.seconds(); + + histogram.sync<Host>(); + + printf("Time: %f \n\n", time); + int sum = 0; + for (int k = 0; k < TEAM_SIZE; k++) { + for (int l = 0; l < TEAM_SIZE; l++) { + printf("%i ", histogram.h_view(k, l)); + sum += histogram.h_view(k, l); + } + printf("\n"); + } + printf("Result: %i %i\n", sum, chunk_size * nchunks); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c892df34cd5067fa54b728624a654b9d65711761 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt @@ -0,0 +1,6 @@ + +KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams_lambda) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(02_nested_parallel_for) +KOKKOS_ADD_EXAMPLE_DIRECTORIES(03_vectorization) + diff --git 
a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4bf6d487ae977ca6bd42e9f5787314bf4fd8bbe7 --- /dev/null +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile @@ -0,0 +1,95 @@ +ifndef KOKKOS_PATH + MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) + KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))../../.. +endif + +ifndef KOKKOS_SETTINGS + KOKKOS_SETTINGS = "KOKKOS_PATH=${KOKKOS_PATH}" + ifdef KOKKOS_ARCH + KOKKOS_SETTINGS += "KOKKOS_ARCH=${KOKKOS_ARCH}" + endif + ifdef KOKKOS_DEVICES + KOKKOS_SETTINGS += "KOKKOS_DEVICES=${KOKKOS_DEVICES}" + endif + ifdef KOKKOS_OPTIONS + KOKKOS_SETTINGS += "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" + endif + ifdef KOKKOS_CUDA_OPTIONS + KOKKOS_SETTINGS += "KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPTIONS}" + endif +endif + +build: + mkdir -p 01_thread_teams + cd ./01_thread_teams; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + mkdir -p 01_thread_teams_lambda + cd ./01_thread_teams_lambda; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + mkdir -p 02_nested_parallel_for + cd ./02_nested_parallel_for; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + mkdir -p 03_vectorization + cd ./03_vectorization; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + mkdir -p 04_team_scan + cd ./04_team_scan; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + +build-insource: + cd ./01_thread_teams; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./01_thread_teams_lambda; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd 
./02_nested_parallel_for; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./03_vectorization; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./04_team_scan; \ + $(MAKE) build ${KOKKOS_SETTINGS} +test: + cd ./01_thread_teams; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + cd ./01_thread_teams_lambda; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + cd ./02_nested_parallel_for; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + cd ./03_vectorization; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + cd ./04_team_scan; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + +test-insource: + cd ./01_thread_teams; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./01_thread_teams_lambda; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./02_nested_parallel_for; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./03_vectorization; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./04_team_scan; \ + $(MAKE) test ${KOKKOS_SETTINGS} +clean: + cd ./01_thread_teams; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + cd ./01_thread_teams_lambda; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + cd ./02_nested_parallel_for; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + cd ./03_vectorization; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + cd ./04_team_scan; \ + $(MAKE) clean -f 
${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + +clean-insource: + cd ./01_thread_teams; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./01_thread_teams_lambda; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./02_nested_parallel_for; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./03_vectorization; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./04_team_scan; \ + $(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/packages/kokkos/example/tutorial/Makefile b/packages/kokkos/example/tutorial/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7b2732eeedc2c91f5648aeacfb2aa27817e1fae0 --- /dev/null +++ b/packages/kokkos/example/tutorial/Makefile @@ -0,0 +1,174 @@ + +ifndef KOKKOS_PATH + MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) + KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))../.. +endif + +ifndef KOKKOS_SETTINGS + KOKKOS_SETTINGS = "KOKKOS_PATH=${KOKKOS_PATH}" + ifdef KOKKOS_ARCH + KOKKOS_SETTINGS += "KOKKOS_ARCH=${KOKKOS_ARCH}" + endif + ifdef KOKKOS_DEVICES + KOKKOS_SETTINGS += "KOKKOS_DEVICES=${KOKKOS_DEVICES}" + endif + ifdef KOKKOS_OPTIONS + KOKKOS_SETTINGS += "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" + endif + ifdef KOKKOS_CUDA_OPTIONS + KOKKOS_SETTINGS += "KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPTIONS}" + endif +endif + +build: + mkdir -p 01_hello_world + cd ./01_hello_world; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + mkdir -p 01_hello_world_lambda + cd ./01_hello_world_lambda; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + mkdir -p 02_simple_reduce + cd ./02_simple_reduce; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + mkdir -p 02_simple_reduce_lambda + cd ./02_simple_reduce_lambda; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + mkdir -p 03_simple_view + cd ./03_simple_view; \ 
+ $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + mkdir -p 03_simple_view_lambda + cd ./03_simple_view_lambda; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + mkdir -p 04_simple_memoryspaces + cd ./04_simple_memoryspaces; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + mkdir -p 05_simple_atomics + cd ./05_simple_atomics; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + mkdir -p Advanced_Views + cd ./Advanced_Views; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + mkdir -p Algorithms + cd ./Algorithms; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + mkdir -p Hierarchical_Parallelism + cd ./Hierarchical_Parallelism; \ + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + +build-insource: + cd ./01_hello_world; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./01_hello_world_lambda; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./02_simple_reduce; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./02_simple_reduce_lambda; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./03_simple_view; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./03_simple_view_lambda; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./04_simple_memoryspaces; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./05_simple_atomics; \ + $(MAKE) build ${KOKKOS_SETTINGS} + cd ./Advanced_Views; \ + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd ./Algorithms; \ + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd ./Hierarchical_Parallelism; \ + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' +test: + cd ./01_hello_world; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + cd 
./01_hello_world_lambda; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + cd ./02_simple_reduce; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + cd ./02_simple_reduce_lambda; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + cd ./03_simple_view; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + cd ./03_simple_view_lambda; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + cd ./04_simple_memoryspaces; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + cd ./05_simple_atomics; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + cd ./Advanced_Views; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd ./Algorithms; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd ./Hierarchical_Parallelism; \ + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + +test-insource: + cd ./01_hello_world; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./01_hello_world_lambda; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./02_simple_reduce; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./02_simple_reduce_lambda; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./03_simple_view; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./03_simple_view_lambda; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./04_simple_memoryspaces; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./05_simple_atomics; \ + $(MAKE) test ${KOKKOS_SETTINGS} + cd ./Advanced_Views; \ + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd ./Algorithms; \ + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd 
./Hierarchical_Parallelism; \ + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' +clean: + cd ./01_hello_world; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + cd ./01_hello_world_lambda; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + cd ./02_simple_reduce; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + cd ./02_simple_reduce_lambda; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + cd ./03_simple_view; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + cd ./03_simple_view_lambda; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + cd ./04_simple_memoryspaces; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + cd ./05_simple_atomics; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + cd ./Advanced_Views; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd ./Algorithms; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd ./Hierarchical_Parallelism; \ + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + +clean-insource: + cd ./01_hello_world; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./01_hello_world_lambda; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./02_simple_reduce; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./02_simple_reduce_lambda; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./03_simple_view; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./03_simple_view_lambda; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./04_simple_memoryspaces; \ + $(MAKE) clean 
${KOKKOS_SETTINGS} + cd ./05_simple_atomics; \ + $(MAKE) clean ${KOKKOS_SETTINGS} + cd ./Advanced_Views; \ + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd ./Algorithms; \ + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + cd ./Hierarchical_Parallelism; \ + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' diff --git a/packages/kokkos/example/tutorial/README b/packages/kokkos/example/tutorial/README new file mode 100644 index 0000000000000000000000000000000000000000..c46ace0d1150760832899457e586f738e875c49f --- /dev/null +++ b/packages/kokkos/example/tutorial/README @@ -0,0 +1,12 @@ +Build the examples by typing in each directory: +make -j 16 + +To specify a target device: +KOKKOS_DEVICES=OpenMP make -j 16 +KOKKOS_DEVICES=Pthread make -j 16 +KOKKOS_DEVICES=Serial make -j 16 +KOKKOS_DEVICES=Cuda make -j 16 + +Some of the advanced topics try to highlight performance impacts by timing +different variants of doing the same thing. + diff --git a/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt b/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d9683500dd472c8f8a8dd95c6a641f861f94545 --- /dev/null +++ b/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt @@ -0,0 +1,9 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +KOKKOS_ADD_EXECUTABLE( + tutorial_02_simple_reduce + SOURCES simple_reduce.cpp +) diff --git a/packages/kokkos/example/tutorial/launch_bounds/Makefile b/packages/kokkos/example/tutorial/launch_bounds/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7df48f23cb79f6aa5c2e5d00fe0e3f9834243d7b --- /dev/null +++ b/packages/kokkos/example/tutorial/launch_bounds/Makefile @@ -0,0 +1,57 @@ +KOKKOS_PATH = ../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/launch_bounds/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = launch_bounds.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = launch_bounds.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + + +# WAR for "undefined memcpy" w/ Ubuntu + CUDA 7.5 +CXXFLAGS += -D_FORCE_INLINES +# Additional compile-time information +CXXFLAGS += -Xptxas=-v + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +temp: + echo $(KOKKOS_INTERNAL_USE_CUDA) $(CUDA_PATH) + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp b/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp new file mode 100644 index 0000000000000000000000000000000000000000..92f82111f98c6afc966520f58b1709197bedf429 --- /dev/null +++ b/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp @@ -0,0 +1,172 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// +// First reduction (parallel_reduce) example: +// 1. Start up Kokkos +// 2. Execute a parallel_reduce loop in the default execution space, +// using a functor to define the loop body +// 3. 
Shut down Kokkos +// +struct collision { + // Reduction functor + // For each i, we generate 10 hashes, look for and count collisions + // We use parallel_reduce to count the total collisions + // Note that we're just counting collisions within the 10 generated + // one i. + // This function was chosen as one that very simply can increase the + // register count. + using value_type = int; + + KOKKOS_INLINE_FUNCTION + int hash(int q) const { + // A simple hash by Justin Sobel + // Thanks to Arash Partow (partow.net) + char* fourchars = (char*)&q; + int hash = 1315423911; + for (int i = 0; i < 4; fourchars++, i++) { + hash ^= ((hash << 5) + *fourchars + (hash >> 2)); + } + return hash; + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, int& lsum) const { + // This is a silly function which generates 10 hashes + // then checks for collisions + int a = hash(i) % 64; + int b = hash(i * 3) % 64; + int c = hash(i * 5) % 64; + int d = hash(i * 7) % 64; + int e = hash(i * 11) % 64; + int f = hash(i * 17) % 64; + int g = hash(i * 23) % 64; + int h = hash(i * 29) % 64; + int j = hash(i * 31) % 64; + int k = hash(i * 37) % 64; + + if (a == b) lsum++; + if (a == c) lsum++; + if (a == d) lsum++; + if (a == e) lsum++; + if (a == f) lsum++; + if (a == g) lsum++; + if (a == h) lsum++; + if (a == j) lsum++; + if (a == k) lsum++; + if (b == c) lsum++; + if (b == d) lsum++; + if (b == e) lsum++; + if (b == f) lsum++; + if (b == g) lsum++; + if (b == h) lsum++; + if (b == j) lsum++; + if (b == k) lsum++; + if (c == d) lsum++; + if (c == e) lsum++; + if (c == f) lsum++; + if (c == g) lsum++; + if (c == h) lsum++; + if (c == j) lsum++; + if (c == k) lsum++; + if (d == e) lsum++; + if (d == f) lsum++; + if (d == g) lsum++; + if (d == h) lsum++; + if (d == j) lsum++; + if (d == k) lsum++; + if (e == f) lsum++; + if (e == g) lsum++; + if (e == h) lsum++; + if (e == j) lsum++; + if (e == k) lsum++; + if (f == g) lsum++; + if (f == h) lsum++; + if (f == j) lsum++; + if (f == k) 
lsum++; + if (g == h) lsum++; + if (g == j) lsum++; + if (g == k) lsum++; + if (h == j) lsum++; + if (h == k) lsum++; + if (j == k) lsum++; + } +}; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + const int n = 10000; + + // Compute and count hash collisions in + // parallel, using Kokkos. + // This is not really a useful algorithm, but it demonstrates the + // LaunchBounds functionality + int sum1 = 0; + int sum2 = 0; + + // Without LaunchBounds, the kernel uses 56 registers + Kokkos::parallel_reduce(n, collision(), sum1); + + // With LaunchBounds, we can reduce the register usage to 32 + Kokkos::parallel_reduce( + Kokkos::RangePolicy<Kokkos::LaunchBounds<512, 4>>(0, n), collision(), + sum2); + + printf( + "Number of collisions, " + "computed in parallel, is %i\n", + sum1); + + if (sum1 != sum2) { + printf("Uh-oh! Results do not match\n"); + return -1; + } + + Kokkos::finalize(); + + return 0; +} diff --git a/packages/kokkos/example/virtual_functions/Makefile b/packages/kokkos/example/virtual_functions/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..fbc54510da0f795ed2976ea7ed9e2d6a8a4774d8 --- /dev/null +++ b/packages/kokkos/example/virtual_functions/Makefile @@ -0,0 +1,55 @@ +KOKKOS_DEVICES=Cuda +KOKKOS_CUDA_OPTIONS=enable_lambda,rdc +KOKKOS_ARCH = "SNB,Kepler35" + +#KOKKOS_DEVICES=OpenMP +#KOKKOS_CUDA_OPTIONS=enable_lambda +#KOKKOS_ARCH = "SNB" + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +LINKFLAGS = +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +EXE = virtual.cuda +override LINKFLAGS += --remove-duplicate-link-files +else +CXX = g++ +EXE = virtual.host +endif + +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) + +DEPFLAGS = -M +LINK = ${CXX} + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/packages/kokkos/example/virtual_functions/classes.cpp b/packages/kokkos/example/virtual_functions/classes.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9041f980ed46e7b05cfe3f2f8c5432fba49890d8 --- /dev/null +++ b/packages/kokkos/example/virtual_functions/classes.cpp @@ -0,0 +1,16 @@ +#include <classes.hpp> + +KOKKOS_FUNCTION +Foo::Foo() { val = 0; } + +KOKKOS_FUNCTION +Foo_1::Foo_1() { val = 1; } + +KOKKOS_FUNCTION +int Foo_1::value() { return val; } + +KOKKOS_FUNCTION +Foo_2::Foo_2() { val = 2; } + +KOKKOS_FUNCTION +int Foo_2::value() { return val; } diff --git a/packages/kokkos/example/virtual_functions/classes.hpp b/packages/kokkos/example/virtual_functions/classes.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4fa9f595024fb5a18fce62e85edaf10e34ddc0c3 --- /dev/null +++ b/packages/kokkos/example/virtual_functions/classes.hpp @@ -0,0 +1,39 @@ +#ifndef KOKKOS_EXAMPLE_VIRTUAL_FUNCTIONS_CLASSES_HPP +#define KOKKOS_EXAMPLE_VIRTUAL_FUNCTIONS_CLASSES_HPP + +#include <Kokkos_Core.hpp> + +class 
Foo { + protected: + int val; + + public: + KOKKOS_FUNCTION + Foo(); + + KOKKOS_FUNCTION + virtual int value() { return 0; }; + + KOKKOS_FUNCTION + virtual ~Foo() {} +}; + +class Foo_1 : public Foo { + public: + KOKKOS_FUNCTION + Foo_1(); + + KOKKOS_FUNCTION + int value(); +}; + +class Foo_2 : public Foo { + public: + KOKKOS_FUNCTION + Foo_2(); + + KOKKOS_FUNCTION + int value(); +}; + +#endif // KOKKOS_EXAMPLE_VIRTUAL_FUNCTIONS_CLASSES_HPP diff --git a/packages/kokkos/example/virtual_functions/main.cpp b/packages/kokkos/example/virtual_functions/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..aaa7afb0340ba308caa84021300fec9e72643aee --- /dev/null +++ b/packages/kokkos/example/virtual_functions/main.cpp @@ -0,0 +1,38 @@ +#include <classes.hpp> + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + + { + Foo* f_1 = (Foo*)Kokkos::kokkos_malloc(sizeof(Foo_1)); + Foo* f_2 = (Foo*)Kokkos::kokkos_malloc(sizeof(Foo_2)); + + Kokkos::parallel_for( + "CreateObjects", 1, KOKKOS_LAMBDA(const int&) { + new ((Foo_1*)f_1) Foo_1(); + new ((Foo_2*)f_2) Foo_2(); + }); + + int value_1, value_2; + Kokkos::parallel_reduce( + "CheckValues", 1, + KOKKOS_LAMBDA(const int&, int& lsum) { lsum = f_1->value(); }, value_1); + + Kokkos::parallel_reduce( + "CheckValues", 1, + KOKKOS_LAMBDA(const int&, int& lsum) { lsum = f_2->value(); }, value_2); + + printf("Values: %i %i\n", value_1, value_2); + + Kokkos::parallel_for( + "DestroyObjects", 1, KOKKOS_LAMBDA(const int&) { + f_1->~Foo(); + f_2->~Foo(); + }); + + Kokkos::kokkos_free(f_1); + Kokkos::kokkos_free(f_2); + } + + Kokkos::finalize(); +} diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash new file mode 100755 index 0000000000000000000000000000000000000000..e9871b436971a551c82751756b2b18de9175839a --- /dev/null +++ b/packages/kokkos/generate_makefile.bash @@ -0,0 +1,481 @@ +#!/bin/bash + +update_kokkos_devices() { + SEARCH_TEXT="*$1*" + if [[ 
$KOKKOS_DEVICES == $SEARCH_TEXT ]]; then + echo kokkos devices already includes $SEARCH_TEXT + else + if [ "$KOKKOS_DEVICES" = "" ]; then + KOKKOS_DEVICES="$1" + echo reseting kokkos devices to $KOKKOS_DEVICES + else + KOKKOS_DEVICES="${KOKKOS_DEVICES},$1" + echo appending to kokkos devices $KOKKOS_DEVICES + fi + fi +} + +get_kokkos_device_list() { + KOKKOS_DEVICE_CMD= + PARSE_DEVICES_LST=$(echo $KOKKOS_DEVICES | tr "," "\n") + PARSE_DEVICES_LST=$(echo $PARSE_DEVICES_LST | tr "_" "\n") + for DEVICE_ in $PARSE_DEVICES_LST + do + UC_DEVICE=$(echo $DEVICE_ | tr "[:lower:]" "[:upper:]") + if [ "${UC_DEVICE}" == "CUDA" ]; then + WITH_CUDA_BACKEND=ON + fi + if [ "${UC_DEVICE}" == "HIP" ]; then + WITH_HIP_BACKEND=ON + fi + if [ "${UC_DEVICE}" == "OPENMPTARGET" ]; then + WITH_OMPT_BACKEND=ON + fi + KOKKOS_DEVICE_CMD="-DKokkos_ENABLE_${UC_DEVICE}=ON ${KOKKOS_DEVICE_CMD}" + done + if [ "${WITH_CUDA_BACKEND}" == "ON" ] && [ "${WITH_HIP_BACKEND}" == "ON" ]; then + echo "Invalid configuration - Cuda and Hip cannot be simultaneously enabled" + exit + fi + if [ "${WITH_CUDA_BACKEND}" == "ON" ] && [ "${WITH_OMPT_BACKEND}" == "ON" ]; then + echo "Invalid configuration - Cuda and OpenMPTarget cannot be simultaneously enabled" + exit + fi + if [ "${WITH_OMPT_BACKEND}" == "ON" ] && [ "${WITH_HIP_BACKEND}" == "ON" ]; then + echo "Invalid configuration - OpenMPTarget and Hip cannot be simultaneously enabled" + exit + fi +} + +get_kokkos_arch_list() { + KOKKOS_ARCH_CMD= + PARSE_ARCH_LST=$(echo $KOKKOS_ARCH | tr "," "\n") + for ARCH_ in $PARSE_ARCH_LST + do + UC_ARCH=$(echo $ARCH_ | tr "[:lower:]" "[:upper:]") + KOKKOS_ARCH_CMD="-DKokkos_ARCH_${UC_ARCH}=ON ${KOKKOS_ARCH_CMD}" + done +} + +get_kokkos_cuda_option_list() { + echo parsing KOKKOS_CUDA_OPTIONS=$KOKKOS_CUDA_OPTIONS + KOKKOS_CUDA_OPTION_CMD= + PARSE_CUDA_LST=$(echo $KOKKOS_CUDA_OPTIONS | tr "," "\n") + for CUDA_ in $PARSE_CUDA_LST + do + CUDA_OPT_NAME= + if [ "${CUDA_}" == "enable_lambda" ]; then + CUDA_OPT_NAME=CUDA_LAMBDA + 
elif [ "${CUDA_}" == "rdc" ]; then + CUDA_OPT_NAME=CUDA_RELOCATABLE_DEVICE_CODE + elif [ "${CUDA_}" == "force_uvm" ]; then + CUDA_OPT_NAME=CUDA_UVM + elif [ "${CUDA_}" == "use_ldg" ]; then + CUDA_OPT_NAME=CUDA_LDG_INTRINSIC + else + echo "${CUDA_} is not a valid cuda options..." + fi + if [ "${CUDA_OPT_NAME}" != "" ]; then + KOKKOS_CUDA_OPTION_CMD="-DKokkos_ENABLE_${CUDA_OPT_NAME}=ON ${KOKKOS_CUDA_OPTION_CMD}" + fi + done +} + +get_kokkos_hip_option_list() { + echo parsing KOKKOS_HIP_OPTIONS=$KOKKOS_HIP_OPTIONS + KOKKOS_HIP_OPTION_CMD= + PARSE_HIP_LST=$(echo $KOKKOS_HIP_OPTIONS | tr "," "\n") + for HIP_ in $PARSE_HIP_LST + do + HIP_OPT_NAME= + if [ "${HIP_}" == "rdc" ]; then + HIP_OPT_NAME=HIP_RELOCATABLE_DEVICE_CODE + else + echo "${HIP_} is not a valid hip option..." + fi + if [ "${HIP_OPT_NAME}" != "" ]; then + KOKKOS_HIP_OPTION_CMD="-DKokkos_ENABLE_${HIP_OPT_NAME}=ON ${KOKKOS_HIP_OPTION_CMD}" + fi + done +} + +get_kokkos_ompt_option_list() { + echo parsing KOKKOS_OMPT_OPTIONS=$KOKKOS_OMPT_OPTIONS + KOKKOS_OMPT_OPTION_CMD= + PARSE_OMPT_LST=$(echo $KOKKOS_OMPT_OPTIONS | tr "," "\n") +# Stub for eventual OpenMPTarget options +# for OMPT_ in $PARSE_OMPT_LST +# do +# OMPT_OPT_NAME= +# if [ "${OMPT_}" == "?" ]; then +# OMPT_OPT_NAME=OMPT_? +# else +# echo "${OMPT_} is not a valid openmptarget option..." 
+# fi +# if [ "${OMPT_OPT_NAME}" != "" ]; then +# KOKKOS_OMPT_OPTION_CMD="-DKokkos_ENABLE_${OMPT_OPT_NAME}=ON ${KOKKOS_OMPT_OPTION_CMD}" +# fi +# done +} + +get_kokkos_option_list() { + echo parsing KOKKOS_OPTIONS=$KOKKOS_OPTIONS + KOKKOS_OPTION_CMD= + PARSE_OPTIONS_LST=$(echo $KOKKOS_OPTIONS | tr "," "\n") + for OPT_ in $PARSE_OPTIONS_LST + do + UC_OPT_=$(echo $OPT_ | tr "[:lower:]" "[:upper:]") + if [[ "$UC_OPT_" == *DISABLE* ]]; then + FLIP_OPT_=${UC_OPT_/DISABLE/ENABLE} + KOKKOS_OPTION_CMD="-DKokkos_${FLIP_OPT_}=OFF ${KOKKOS_OPTION_CMD}" + elif [[ "$UC_OPT_" == *ENABLE* ]]; then + KOKKOS_OPTION_CMD="-DKokkos_${UC_OPT_}=ON ${KOKKOS_OPTION_CMD}" + else + KOKKOS_OPTION_CMD="-DKokkos_ENABLE_${UC_OPT_}=ON ${KOKKOS_OPTION_CMD}" + fi + done +} + +display_help_text() { + + echo "Kokkos configure options:" + echo "" + echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory." + echo "--prefix=/Install/Path: Path to install the Kokkos library." + echo "" + echo "--with-cuda[=/Path/To/Cuda]: Enable Cuda and set path to Cuda Toolkit." + echo "--with-hip[=/Path/To/Hip]: Enable Hip and set path to ROCM Toolkit." + echo "--with-openmptarget: Enable OpenMPTarget backend." + echo "--with-sycl: Enable Sycl backend." + echo "--with-openmp: Enable OpenMP backend." + echo "--with-pthread: Enable Pthreads backend." + echo "--with-serial: Enable Serial backend." + echo "--with-devices: Explicitly add a set of backends." + echo "" + echo "--arch=[OPT]: Set target architectures. 
Options are:" + echo " [AMD: CPU]" + echo " AMDAVX = AMD CPU" + echo " ZEN = AMD Zen-Core CPU" + echo " ZEN2 = AMD Zen2-Core CPU" + echo " [AMD: GPU]" + echo " VEGA900 = AMD GPU MI25 GFX900" + echo " VEGA906 = AMD GPU MI50/MI60 GFX906" + echo " VEGA908 = AMD GPU MI100 GFX908" + echo " [ARM]" + echo " ARMV80 = ARMv8.0 Compatible CPU" + echo " ARMV81 = ARMv8.1 Compatible CPU" + echo " ARMV8_THUNDERX = ARMv8 Cavium ThunderX CPU" + echo " ARMV8_THUNDERX2 = ARMv8 Cavium ThunderX2 CPU" + echo " [IBM]" + echo " BGQ = IBM Blue Gene Q" + echo " Power7 = IBM POWER7 and POWER7+ CPUs" + echo " Power8 = IBM POWER8 CPUs" + echo " Power9 = IBM POWER9 CPUs" + echo " [Intel]" + echo " WSM = Intel Westmere CPUs" + echo " SNB = Intel Sandy/Ivy Bridge CPUs" + echo " HSW = Intel Haswell CPUs" + echo " BDW = Intel Broadwell Xeon E-class CPUs" + echo " SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)" + echo " [Intel Xeon Phi]" + echo " KNC = Intel Knights Corner Xeon Phi" + echo " KNL = Intel Knights Landing Xeon Phi" + echo " [NVIDIA]" + echo " Kepler30 = NVIDIA Kepler generation CC 3.0" + echo " Kepler32 = NVIDIA Kepler generation CC 3.2" + echo " Kepler35 = NVIDIA Kepler generation CC 3.5" + echo " Kepler37 = NVIDIA Kepler generation CC 3.7" + echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" + echo " Maxwell52 = NVIDIA Maxwell generation CC 5.2" + echo " Maxwell53 = NVIDIA Maxwell generation CC 5.3" + echo " Pascal60 = NVIDIA Pascal generation CC 6.0" + echo " Pascal61 = NVIDIA Pascal generation CC 6.1" + echo " Volta70 = NVIDIA Volta generation CC 7.0" + echo " Volta72 = NVIDIA Volta generation CC 7.2" + echo "" + echo "--compiler=/Path/To/Compiler Set the compiler." + echo "--debug,-dbg: Enable Debugging." + echo "--boundscheck: Enable Kokkos_ENABLE_DEBUG_BOUNDS_CHECK to check View accesses within bounds." + echo "--disable-tests Disable compilation of unit tests (enabled by default)" + echo "--cxxflags=[FLAGS] Overwrite CXXFLAGS for library build and test" + echo " build. 
This will still set certain required" + echo " flags via KOKKOS_CXXFLAGS (such as -fopenmp," + echo " -std=c++14, etc.)." + echo "--cxxstandard=[FLAGS] Set CMAKE_CXX_STANDARD for library build and test" + echo " c++14 (default), c++17, c++1y, c++1z, c++2a" + echo "--ldflags=[FLAGS] Overwrite LDFLAGS for library build and test" + echo " build. This will still set certain required" + echo " flags via KOKKOS_LDFLAGS (such as -fopenmp," + echo " -lpthread, etc.)." + echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" + echo " tests.)" + echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." + echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." + echo "--with-options=[OPT]: Additional options to Kokkos:" + echo " compiler_warnings" + echo " aggressive_vectorization = add ivdep on loops" + echo " disable_profiling = do not compile with profiling hooks" + echo " " + echo "--with-cuda-options=[OPT]: Additional options to CUDA:" + echo " force_uvm, use_ldg, enable_lambda, rdc" + echo "--with-hip-options=[OPT]: Additional options to HIP:" + echo " rdc" + echo "--with-hpx-options=[OPT]: Additional options to HPX:" + echo " enable_async_dispatch" + echo "--gcc-toolchain=/Path/To/GccRoot: Set the gcc toolchain to use with clang (e.g. 
/usr)" + echo "--cmake-flags=[CMAKE Command options]: Set cmake options not handled by script" + echo "--make-j=[NUM]: DEPRECATED: call make with appropriate" + echo " -j flag" + +} + +KOKKOS_DO_TESTS=ON +KOKKOS_DO_EXAMPLES=OFF + +# For tracking if Cuda and Hip devices are enabled simultaneously +WITH_CUDA_BACKEND=OFF +WITH_HIP_BACKEND=OFF +WITH_OMPT_BACKEND=OFF + +while [[ $# > 0 ]] +do + key="$1" + + case $key in + --kokkos-path*) + KOKKOS_PATH="${key#*=}" + ;; + --hpx-path*) + HPX_PATH="${key#*=}" + ;; + --prefix*) + PREFIX="${key#*=}" + ;; + --with-cuda) + update_kokkos_devices Cuda + CUDA_PATH_NVCC=$(command -v nvcc) + CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc} + ;; + # Catch this before '--with-cuda*' + --with-cuda-options*) + KOKKOS_CUDA_OPTIONS="${key#*=}" + ;; + --with-cuda*) + update_kokkos_devices Cuda + CUDA_PATH="${key#*=}" + ;; + --with-hip) + update_kokkos_devices Hip + HIP_PATH_HIPCC=$(command -v hipcc) + HIP_PATH=${HIP_PATH_HIPCC%/bin/hipcc} + ;; + # Catch this before '--with-hip*' + --with-hip-options*) + KOKKOS_HIP_OPTIONS="${key#*=}" + ;; + --with-hip*) + update_kokkos_devices Hip + HIP_PATH="${key#*=}" + ;; + --with-openmptarget) + update_kokkos_devices OpenMPTarget + ;; + --with-openmptarget-options*) + KOKKOS_OMPT_OPTIONS="${key#*=}" + ;; + --with-openmp) + update_kokkos_devices OpenMP + ;; + --with-sycl) + update_kokkos_devices Sycl + ;; + --with-pthread) + update_kokkos_devices Pthread + ;; + --with-serial) + update_kokkos_devices Serial + ;; + --with-hpx-options*) + KOKKOS_HPX_OPT="${key#*=}" + ;; + --with-hpx*) + update_kokkos_devices HPX + if [ -z "$HPX_PATH" ]; then + HPX_PATH="${key#*=}" + fi + ;; + --with-devices*) + DEVICES="${key#*=}" + PARSE_DEVICES=$(echo $DEVICES | tr "," "\n") + for DEVICE_ in $PARSE_DEVICES + do + update_kokkos_devices $DEVICE_ + done + ;; + --with-gtest*) + GTEST_PATH="${key#*=}" + ;; + --with-hwloc*) + KOKKOS_HWLOC=ON + HWLOC_PATH="${key#*=}" + ;; + --with-memkind*) + KOKKOS_MEMKIND=ON + MEMKIND_PATH="${key#*=}" + 
;; + --arch*) + KOKKOS_ARCH="${key#*=}" + ;; + --cxxflags*) + KOKKOS_CXXFLAGS="${key#*=}" + KOKKOS_CXXFLAGS=${KOKKOS_CXXFLAGS//,/ } + ;; + --cxxstandard*) + KOKKOS_CXX_STANDARD="${key#*=}" + ;; + --ldflags*) + KOKKOS_LDFLAGS="${key#*=}" + ;; + --debug|-dbg) + KOKKOS_DEBUG=ON + ;; + --boundscheck) + KOKKOS_BOUNDS_CHECK=ON + ;; + --cmake-flags*) + PASSTHRU_CMAKE_FLAGS="${key#*=}" + ;; + --make-j*) + echo "Warning: ${key} is deprecated" + echo "Call make with appropriate -j flag" + ;; + --disable-tests) + KOKKOS_DO_TESTS=OFF + ;; + --no-examples) + KOKKOS_DO_EXAMPLES=OFF + ;; + --enable-examples) + KOKKOS_DO_EXAMPLES=ON + ;; + --compiler*) + COMPILER="${key#*=}" + CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep -c "no ${COMPILER}") + if [ ${CNUM} -gt 0 ]; then + echo "Invalid compiler by --compiler command: '${COMPILER}'" + exit + fi + if [[ ! -n ${COMPILER} ]]; then + echo "Empty compiler specified by --compiler command." + exit + fi + CNUM=$(command -v ${COMPILER} | grep -c ${COMPILER}) + if [ ${CNUM} -eq 0 ]; then + echo "Invalid compiler by --compiler command: '${COMPILER}'" + exit + fi + # ... 
valid compiler, ensure absolute path set + WCOMPATH=$(command -v $COMPILER) + COMPDIR=$(dirname $WCOMPATH) + COMPNAME=$(basename $WCOMPATH) + COMPILER=${COMPDIR}/${COMPNAME} + ;; + --with-options*) + KOKKOS_OPTIONS="${key#*=}" + ;; + --gcc-toolchain*) + KOKKOS_GCC_TOOLCHAIN="${key#*=}" + ;; + --help) + display_help_text + exit 0 + ;; + *) + echo "warning: ignoring unknown option $key" + ;; + esac + + shift +done + +if [ "$COMPILER" == "" ]; then + COMPILER_CMD= +else + COMPILER_CMD=-DCMAKE_CXX_COMPILER=$COMPILER +fi + +if [ "$KOKKOS_DEBUG" == "ON" ]; then + KOKKOS_DEBUG_CMD="-DCMAKE_BUILD_TYPE=DEBUG -DKokkos_ENABLE_DEBUG=ON" +else + KOKKOS_DEBUG_CMD=-DCMAKE_BUILD_TYPE=RELEASE +fi + +if [ "$KOKKOS_BOUNDS_CHECK" == "ON" ]; then + KOKKOS_BC_CMD=-DKokkos_ENABLE_DEBUG_BOUNDS_CHECK=ON +fi + +if [ "$KOKKOS_HWLOC" == "ON" ]; then + KOKKOS_HWLOC_CMD=-DKokkos_ENABLE_HWLOC=ON + if [ "$HWLOC_PATH" != "" ]; then + KOKKOS_HWLOC_PATH_CMD=-DHWLOC_ROOT=$HWLOC_PATH + fi +else + KOKKOS_HWLOC_CMD= +fi + +if [ "$KOKKOS_MEMKIND" == "ON" ]; then + KOKKOS_MEMKIND_CMD=-DKokkos_ENABLE_MEMKIND=ON + if [ "$MEMKIND_PATH" != "" ]; then + KOKKOS_MEMKIND_PATH_CMD=-DMEMKIND_ROOT=$MEMKIND_PATH + fi +else + KOKKOS_MEMKIND_CMD= +fi + +if [ ! -e ${KOKKOS_PATH}/CMakeLists.txt ]; then + if [ "${KOKKOS_PATH}" == "" ]; then + CM_SCRIPT=$0 + KOKKOS_PATH=`dirname $CM_SCRIPT` + if [ ! -e ${KOKKOS_PATH}/CMakeLists.txt ]; then + echo "${KOKKOS_PATH} repository appears to not be complete. please verify and try again" + exit 0 + fi + else + echo "KOKKOS_PATH does not appear to be set properly. 
please specify in location of CMakeLists.txt" + display_help_text + exit 0 + fi +fi + +get_kokkos_device_list +get_kokkos_option_list +get_kokkos_arch_list +get_kokkos_cuda_option_list +get_kokkos_hip_option_list +get_kokkos_ompt_option_list + +## if HPX is enabled, we need to enforce cxx standard = 14 +if [[ ${KOKKOS_DEVICE_CMD} == *Kokkos_ENABLE_HPX* ]]; then + if [ "${KOKKOS_CXX_STANDARD}" == "" ] || [ ${#KOKKOS_CXX_STANDARD} -lt 14 ]; then + echo CXX Standard must be 14 or higher for HPX to work. + KOKKOS_CXX_STANDARD=14 + fi +fi + +if [ "$KOKKOS_CXX_STANDARD" == "" ]; then + STANDARD_CMD= +else + STANDARD_CMD=-DCMAKE_CXX_STANDARD=${KOKKOS_CXX_STANDARD} +fi + +if [[ ${COMPILER} == *clang* ]]; then + gcc_path=$(which g++ | awk --field-separator='/bin/g++' '{printf $1}' ) + KOKKOS_CXXFLAGS="${KOKKOS_CXXFLAGS} --gcc-toolchain=${gcc_path}" + + if [ ! "${CUDA_PATH}" == "" ]; then + KOKKOS_CXXFLAGS="${KOKKOS_CXXFLAGS} --cuda-path=${CUDA_PATH}" + fi +fi + +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} 
${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} diff --git a/packages/kokkos/gnu_generate_makefile.bash b/packages/kokkos/gnu_generate_makefile.bash new file mode 100755 index 0000000000000000000000000000000000000000..ea509669f068d677a0354c83891d7caf298b1e34 --- /dev/null +++ b/packages/kokkos/gnu_generate_makefile.bash @@ -0,0 +1,418 @@ +#!/bin/bash + +KOKKOS_DEVICES="" + +while [[ $# > 0 ]] +do + key="$1" + + case $key in + --kokkos-path*) + KOKKOS_PATH="${key#*=}" + ;; + --hpx-path*) + HPX_PATH="${key#*=}" + ;; + --prefix*) + PREFIX="${key#*=}" + ;; + --with-cuda) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda" + CUDA_PATH_NVCC=$(command -v nvcc) + CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc} + ;; + # Catch this before '--with-cuda*' + --with-cuda-options*) + KOKKOS_CUDA_OPT="${key#*=}" + ;; + --with-cuda*) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda" + CUDA_PATH="${key#*=}" + ;; + --with-hip) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Hip" + HIP_PATH_HIPCC=$(command -v hipcc) + HIP_PATH=${HIP_PATH_HIPCC%/bin/hipcc} + ;; + # Catch this before '--with-hip*' + --with-hip-options*) + KOKKOS_HIP_OPT="${key#*=}" + ;; + --with-hip*) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Hip" + HIP_PATH="${key#*=}" + ;; + --with-openmp) + KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP" + ;; + --with-pthread) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread" + ;; + --with-serial) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial" + ;; + --with-hpx-options*) + KOKKOS_HPX_OPT="${key#*=}" + ;; + --with-hpx*) + KOKKOS_DEVICES="${KOKKOS_DEVICES},HPX" + if [ -z "$HPX_PATH" ]; then + HPX_PATH="${key#*=}" + fi + ;; + --with-devices*) + DEVICES="${key#*=}" + KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}" + ;; + --with-gtest*) + GTEST_PATH="${key#*=}" + ;; + --with-hwloc*) + HWLOC_PATH="${key#*=}" + ;; + --with-memkind*) + MEMKIND_PATH="${key#*=}" + ;; + --arch*) + KOKKOS_ARCH="${key#*=}" + ;; + --cxxflags*) + CXXFLAGS="${key#*=}" + ;; + --cxxstandard*) + 
KOKKOS_CXX_STANDARD="${key#*=}" + ;; + --ldflags*) + LDFLAGS="${key#*=}" + ;; + --debug|-dbg) + KOKKOS_DEBUG=yes + ;; + --make-j*) + echo "Warning: ${key} is deprecated" + echo "Call make with appropriate -j flag" + ;; + --compiler*) + COMPILER="${key#*=}" + CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep -c "no ${COMPILER}") + if [ ${CNUM} -gt 0 ]; then + echo "Invalid compiler by --compiler command: '${COMPILER}'" + exit + fi + if [[ ! -n ${COMPILER} ]]; then + echo "Empty compiler specified by --compiler command." + exit + fi + CNUM=$(command -v ${COMPILER} | grep -c ${COMPILER}) + if [ ${CNUM} -eq 0 ]; then + echo "Invalid compiler by --compiler command: '${COMPILER}'" + exit + fi + # ... valid compiler, ensure absolute path set + WCOMPATH=$(command -v $COMPILER) + COMPDIR=$(dirname $WCOMPATH) + COMPNAME=$(basename $WCOMPATH) + COMPILER=${COMPDIR}/${COMPNAME} + ;; + --with-options*) + KOKKOS_OPT="${key#*=}" + ;; + --gcc-toolchain*) + KOKKOS_GCC_TOOLCHAIN="${key#*=}" + ;; + --help) + echo "Kokkos configure options:" + echo "" + echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory." + echo "--prefix=/Install/Path: Path to install the Kokkos library." + echo "" + echo "--with-cuda[=/Path/To/Cuda]: Enable Cuda and set path to Cuda Toolkit." + echo "--with-openmp: Enable OpenMP backend." + echo "--with-pthread: Enable Pthreads backend." + echo "--with-serial: Enable Serial backend." + echo "--with-devices: Explicitly add a set of backends." + echo "" + echo "--arch=[OPT]: Set target architectures. 
Options are:" + echo " [AMD]" + echo " AMDAVX = AMD CPU" + echo " ZEN = AMD Zen-Core CPU" + echo " ZEN2 = AMD Zen2-Core CPU" + echo " [ARM]" + echo " ARMv80 = ARMv8.0 Compatible CPU" + echo " ARMv81 = ARMv8.1 Compatible CPU" + echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" + echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" + echo " [IBM]" + echo " BGQ = IBM Blue Gene Q" + echo " Power7 = IBM POWER7 and POWER7+ CPUs" + echo " Power8 = IBM POWER8 CPUs" + echo " Power9 = IBM POWER9 CPUs" + echo " [Intel]" + echo " WSM = Intel Westmere CPUs" + echo " SNB = Intel Sandy/Ivy Bridge CPUs" + echo " HSW = Intel Haswell CPUs" + echo " BDW = Intel Broadwell Xeon E-class CPUs" + echo " SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)" + echo " [Intel Xeon Phi]" + echo " KNC = Intel Knights Corner Xeon Phi" + echo " KNL = Intel Knights Landing Xeon Phi" + echo " [NVIDIA]" + echo " Kepler30 = NVIDIA Kepler generation CC 3.0" + echo " Kepler32 = NVIDIA Kepler generation CC 3.2" + echo " Kepler35 = NVIDIA Kepler generation CC 3.5" + echo " Kepler37 = NVIDIA Kepler generation CC 3.7" + echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" + echo " Maxwell52 = NVIDIA Maxwell generation CC 5.2" + echo " Maxwell53 = NVIDIA Maxwell generation CC 5.3" + echo " Pascal60 = NVIDIA Pascal generation CC 6.0" + echo " Pascal61 = NVIDIA Pascal generation CC 6.1" + echo " Volta70 = NVIDIA Volta generation CC 7.0" + echo " Volta72 = NVIDIA Volta generation CC 7.2" + echo "" + echo "--compiler=/Path/To/Compiler Set the compiler." + echo "--debug,-dbg: Enable Debugging." + echo "--cxxflags=[FLAGS] Overwrite CXXFLAGS for library build and test" + echo " build. This will still set certain required" + echo " flags via KOKKOS_CXXFLAGS (such as -fopenmp," + echo " -std=c++14, etc.)." 
+ echo "--cxxstandard=[FLAGS] Overwrite KOKKOS_CXX_STANDARD for library build and test" + echo " c++14 (default), c++17, c++1y, c++1z, c++2a" + echo "--ldflags=[FLAGS] Overwrite LDFLAGS for library build and test" + echo " build. This will still set certain required" + echo " flags via KOKKOS_LDFLAGS (such as -fopenmp," + echo " -lpthread, etc.)." + echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" + echo " tests.)" + echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." + echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." + echo "--with-options=[OPT]: Additional options to Kokkos:" + echo " compiler_warnings" + echo " aggressive_vectorization = add ivdep on loops" + echo " disable_profiling = do not compile with profiling hooks" + echo " " + echo "--with-cuda-options=[OPT]: Additional options to CUDA:" + echo " force_uvm, use_ldg, enable_lambda, rdc, enable_constexpr" + echo "--with-hpx-options=[OPT]: Additional options to HPX:" + echo " enable_async_dispatch" + echo "--gcc-toolchain=/Path/To/GccRoot: Set the gcc toolchain to use with clang (e.g. /usr)" + echo "--make-j=[NUM]: DEPRECATED: call make with appropriate" + echo " -j flag" + exit 0 + ;; + *) + echo "warning: ignoring unknown option $key" + ;; + esac + + shift +done + +# Remove leading ',' from KOKKOS_DEVICES. +KOKKOS_DEVICES=$(echo $KOKKOS_DEVICES | sed 's/^,//') + +# If KOKKOS_PATH undefined, assume parent dir of this script is the KOKKOS_PATH. 
+if [ -z "$KOKKOS_PATH" ]; then + KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) +else + # Ensure KOKKOS_PATH is abs path + KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) +fi + +if [ "${KOKKOS_PATH}" = "${PWD}" ] || [ "${KOKKOS_PATH}" = "${PWD}/" ]; then + echo "Running generate_makefile.bash in the Kokkos root directory is not allowed" + exit +fi + +KOKKOS_SRC_PATH=${KOKKOS_PATH} + +KOKKOS_SETTINGS="KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH}" + +# The double [[ ]] in the elif branch is not a typo +if [ ${#COMPILER} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}" +elif + [ ${#COMPILER} -eq 0 ] && [[ ${KOKKOS_DEVICES} =~ .*Cuda.* ]]; then + COMPILER="${KOKKOS_PATH}/bin/nvcc_wrapper" + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}" +elif + [ ${#COMPILER} -eq 0 ] && [[ ${KOKKOS_DEVICES} =~ .*Hip.* ]]; then + COMPILER=hipcc + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}" +fi + +if [ ${#KOKKOS_DEVICES} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEVICES=${KOKKOS_DEVICES}" +fi + +if [ ${#KOKKOS_ARCH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_ARCH=${KOKKOS_ARCH}" +fi + +if [ ${#KOKKOS_DEBUG} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEBUG=${KOKKOS_DEBUG}" +fi + +if [ ${#CUDA_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CUDA_PATH=${CUDA_PATH}" +fi + +if [ ${#HIP_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} HIP_PATH=${HIP_PATH}" +fi + +if [ ${#CXXFLAGS} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\"" +fi + +if [ ${#KOKKOS_CXX_STANDARD} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CXX_STANDARD=\"${KOKKOS_CXX_STANDARD}\"" +fi + +if [ ${#LDFLAGS} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\"" +fi + +if [ ${#GTEST_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}" +else + GTEST_PATH=${KOKKOS_PATH}/tpls/gtest + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} 
GTEST_PATH=${GTEST_PATH}" +fi + +if [ ${#HWLOC_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} HWLOC_PATH=${HWLOC_PATH}" + KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" +fi + +if [ ${#MEMKIND_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" + KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" +fi + +if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" +fi + +if [ ${#HPX_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} HPX_PATH=${HPX_PATH}" +fi + +if [ ${#KOKKOS_OPT} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_OPTIONS=${KOKKOS_OPT}" +fi + +if [ ${#KOKKOS_CUDA_OPT} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPT}" +fi + +if [ ${#KOKKOS_HPX_OPT} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_HPX_OPTIONS=${KOKKOS_HPX_OPT}" +fi + +if [ ${#KOKKOS_GCC_TOOLCHAIN} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_INTERNAL_GCC_TOOLCHAIN=${KOKKOS_GCC_TOOLCHAIN}" +fi + +KOKKOS_SETTINGS_NO_KOKKOS_PATH="${KOKKOS_SETTINGS}" + + +gen_makefile=Makefile.kokkos +mkdir -p core +mkdir -p core/unit_test +mkdir -p core/perf_test +mkdir -p containers +mkdir -p containers/unit_tests +mkdir -p containers/performance_tests +mkdir -p algorithms +mkdir -p algorithms/unit_tests +mkdir -p algorithms/performance_tests + +KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" + +# Generate subdirectory makefiles. 
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/unit_test/Makefile +echo "" >> core/unit_test/Makefile +echo "all:" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS}" >> core/unit_test/Makefile +echo "" >> core/unit_test/Makefile +echo "test: all" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} test" >> core/unit_test/Makefile +echo "" >> core/unit_test/Makefile +echo "clean:" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/unit_test/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/perf_test/Makefile +echo "" >> core/perf_test/Makefile +echo "all:" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS}" >> core/perf_test/Makefile +echo "" >> core/perf_test/Makefile +echo "test: all" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} test" >> core/perf_test/Makefile +echo "" >> core/perf_test/Makefile +echo "clean:" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/perf_test/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/unit_tests/Makefile +echo "" >> containers/unit_tests/Makefile +echo "all:" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/unit_tests/Makefile +echo "" >> containers/unit_tests/Makefile +echo "test: all" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/unit_tests/Makefile +echo "" >> containers/unit_tests/Makefile +echo "clean:" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> 
containers/unit_tests/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/performance_tests/Makefile +echo "" >> containers/performance_tests/Makefile +echo "all:" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/performance_tests/Makefile +echo "" >> containers/performance_tests/Makefile +echo "test: all" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/performance_tests/Makefile +echo "" >> containers/performance_tests/Makefile +echo "clean:" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/performance_tests/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > algorithms/unit_tests/Makefile +echo "" >> algorithms/unit_tests/Makefile +echo "all:" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> algorithms/unit_tests/Makefile +echo "" >> algorithms/unit_tests/Makefile +echo "test: all" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> algorithms/unit_tests/Makefile +echo "" >> algorithms/unit_tests/Makefile +echo "clean:" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> algorithms/unit_tests/Makefile + +# Generate top level directory makefile. 
+echo "Generating Makefiles with options " ${KOKKOS_SETTINGS} +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > Makefile +echo "" >> Makefile +echo "build-test:" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests" >> Makefile +echo "" >> Makefile +echo "test: build-test" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile +echo "" >> Makefile +echo "unit-tests-only:" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile +echo "" >> Makefile + +echo "clean:" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test clean" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test clean" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests clean" >> Makefile + diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a58f593d00e424b7d7dcbda226f5c4c6d7ccd3c --- /dev/null +++ b/packages/kokkos/master_history.txt @@ -0,0 +1,26 @@ +tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4 +tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a +tag: 2.01.10 date: 09:27:2016 master: e4119325 develop: e6cda11e +tag: 2.02.00 date: 10:30:2016 master: 6c90a581 develop: ca3dd56e +tag: 2.02.01 date: 
11:01:2016 master: 9c698c86 develop: b0072304 +tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966 +tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6 +tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641 +tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186 +tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a +tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1 +tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a +tag: 2.04.11 date: 10:28:2017 master: 54a1330a develop: ed36c017 +tag: 2.5.00 date: 12:15:2017 master: dfe685f4 develop: ec7ad6d8 +tag: 2.6.00 date: 03:07:2018 master: 62e760fa develop: d1ba7d71 +tag: 2.7.00 date: 05:24:2018 master: e01945d0 develop: 2d13f608 +tag: 2.7.24 date: 11:04:2018 master: d3a94192 develop: 7a06fc81 +tag: 2.8.00 date: 02:05:2019 master: 34931a36 develop: d1659d1d +tag: 2.9.00 date: 06:24:2019 master: 5d6e7fb3 develop: 4c6cb80a +tag: 3.0.00 date: 01:31:2020 master: 2983b80d release-candidate-3.0: fdc904a6 +tag: 3.1.00 date: 04:14:2020 master: cd1b1d0a develop: fd90af43 +tag: 3.1.01 date: 05:04:2020 master: 785d19f2 release: 2be028bc +tag: 3.2.00 date: 08:19:2020 master: 3b2fdc7e release: 5dc6d303 +tag: 3.3.00 date: 12:16:2020 master: 734f577a release: 1535ba5c +tag: 3.3.01 date: 01:06:2021 master: 6d65b5a3 release: 4d23839c +tag: 3.4.00 date: 04:26:2021 master: 1fb0c284 release: 5d7738d6 diff --git a/packages/kokkos/scripts/apply-clang-format b/packages/kokkos/scripts/apply-clang-format new file mode 100755 index 0000000000000000000000000000000000000000..d988ca7ae29990cc123f43b36d9c01c05cf5d904 --- /dev/null +++ b/packages/kokkos/scripts/apply-clang-format @@ -0,0 +1,43 @@ +#!/bin/bash + +# If CLANG_FORMAT_EXE exists in the environment, +# it is used instead of 'clang-format'. +CLANG_FORMAT_EXECUTABLE=${CLANG_FORMAT_EXE:-clang-format} + +if ! 
[ -x "$(command -v ${CLANG_FORMAT_EXECUTABLE})" ]; then + echo "*** ${CLANG_FORMAT_EXECUTABLE} could not be found." + exit 1 +fi + +CLANG_FORMAT_VERSION="$(${CLANG_FORMAT_EXECUTABLE} --version)" +CLANG_FORMAT_MAJOR_VERSION=$(echo "${CLANG_FORMAT_VERSION}" | sed 's/^[^0-9]*\([0-9]*\).*$/\1/g') +CLANG_FORMAT_MINOR_VERSION=$(echo "${CLANG_FORMAT_VERSION}" | sed 's/^[^0-9]*[0-9]*\.\([0-9]*\).*$/\1/g') + +if [ "${CLANG_FORMAT_MAJOR_VERSION}" -ne 8 ] || [ "${CLANG_FORMAT_MINOR_VERSION}" -ne 0 ]; then + echo "*** This indent script requires clang-format version 8.0," + echo "*** but version ${CLANG_FORMAT_MAJOR_VERSION}.${CLANG_FORMAT_MINOR_VERSION} was found instead." + exit 1 +fi + +BASE_DIR="$(git rev-parse --show-toplevel)" +cd $BASE_DIR +if [ ! -f "scripts/apply-clang-format" ]; then + echo "*** The indenting script must be executed from within the Kokkos clone!" + exit 1 +fi + +TRACKED_FILES="$(git ls-files)" + +find ${TRACKED_FILES} \ + -type f -name '*.cpp' -o -name '*.hpp' -o -name '*.cc' -o -name '*.h' | + xargs -n 1 -P 10 ${CLANG_FORMAT_EXECUTABLE} -i + +# Now also check for trailing whitspace. Mac OSX creates backup files +# that we need to delete manually. 
+TRACKED_FILES="$(git ls-tree HEAD --name-only)" +find ${TRACKED_FILES} \ + -type f \( -name "*.md" -o -name "*.cc" -o -name "*.h" -o -name "*.txt" -o -name "*.cmake" \) | + xargs -n 1 -P 10 -I {} bash -c "sed -i -e 's/\s\+$//g' {} && rm -f '{}-e'" + +# Check that we do not introduce any file with the old copyright +./scripts/update-copyright diff --git a/packages/kokkos/scripts/docker/Dockerfile.clang b/packages/kokkos/scripts/docker/Dockerfile.clang new file mode 100644 index 0000000000000000000000000000000000000000..6aaf75fae55ff975df5045bb73a0813236871d89 --- /dev/null +++ b/packages/kokkos/scripts/docker/Dockerfile.clang @@ -0,0 +1,44 @@ +FROM nvidia/cuda:9.2-devel + +RUN apt-get update && apt-get install -y \ + bc \ + git \ + wget \ + ccache \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ARG CMAKE_VERSION=3.16.8 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm cmake* +ENV PATH=${CMAKE_DIR}/bin:$PATH + +ENV LLVM_DIR=/opt/llvm +RUN LLVM_VERSION=8.0.0 && \ + LLVM_KEY=345AD05D && \ + LLVM_URL=http://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ + LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \ + SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ + wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \ + wget --quiet 
${LLVM_URL}.sig --output-document=${LLVM_ARCHIVE}.sig && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${LLVM_KEY} && \ + gpg --verify ${LLVM_ARCHIVE}.sig ${LLVM_ARCHIVE} && \ + mkdir -p ${LLVM_DIR} && \ + tar -xvf ${LLVM_ARCHIVE} -C ${LLVM_DIR} --strip-components=1 && \ + echo "${LLVM_DIR}/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig && \ + rm -rf /root/.gnupg && \ + rm -rf ${SCRATCH_DIR} +ENV PATH=${LLVM_DIR}/bin:$PATH diff --git a/packages/kokkos/scripts/docker/Dockerfile.gcc b/packages/kokkos/scripts/docker/Dockerfile.gcc new file mode 100644 index 0000000000000000000000000000000000000000..56972d3185d0f62e6b9effb64e8f2cedefe25c66 --- /dev/null +++ b/packages/kokkos/scripts/docker/Dockerfile.gcc @@ -0,0 +1,18 @@ +FROM gcc:5.3.0 + +ARG CMAKE_VERSION=3.16.8 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm cmake* +ENV PATH=${CMAKE_DIR}/bin:$PATH diff --git a/packages/kokkos/scripts/docker/Dockerfile.hipcc b/packages/kokkos/scripts/docker/Dockerfile.hipcc new file mode 100644 index 0000000000000000000000000000000000000000..d3b6b93a023396aa785703a5aeec0c4001af34e8 --- /dev/null +++ b/packages/kokkos/scripts/docker/Dockerfile.hipcc @@ -0,0 +1,31 @@ +ARG BASE=rocm/dev-ubuntu-20.04:3.8 +FROM $BASE + +RUN apt-get update && apt-get install -y \ + git \ + kmod \ + wget \ + ccache \ + file \ + && \ + apt-get clean && \ + 
rm -rf /var/lib/apt/lists/* + +ENV PATH=/opt/rocm/bin:$PATH + +ARG CMAKE_VERSION=3.16.8 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm cmake* +ENV PATH=${CMAKE_DIR}/bin:$PATH diff --git a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject new file mode 100644 index 0000000000000000000000000000000000000000..5d53a645e4bc7c551698719d3edb1c3768467ca7 --- /dev/null +++ b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject @@ -0,0 +1,53 @@ +FROM nvidia/cuda:10.1-devel + +RUN apt-get update && apt-get install -y \ + bc \ + git \ + wget \ + ccache \ + python3 \ + python3-distutils \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ARG CMAKE_VERSION=3.16.8 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum 
--check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm cmake* +ENV PATH=${CMAKE_DIR}/bin:$PATH + +ARG NPROC=8 + +# Clone Kokkos fork of the LLVM Project and build Clang +ENV LLVM_DIR=/opt/llvm +RUN LLVM_VERSION=55b3bcf643685c63fcc529d434bed112fdf03939 && \ + LLVM_URL=https://github.com/kokkos/llvm-project/archive/${LLVM_VERSION}.tar.gz &&\ + LLVM_ARCHIVE=llvm.tar.xz && \ + SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ + wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \ + mkdir llvm-project && \ + tar -xf ${LLVM_ARCHIVE} -C llvm-project --strip-components=1 && \ + cd llvm-project && \ + mkdir build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=$LLVM_DIR \ + -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;compiler-rt" \ + ../llvm && \ + make -j${NPROC} && \ + make install && \ + echo "${LLVM_DIR}/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig && \ + rm -rf ${SCRATCH_DIR} +ENV PATH=${LLVM_DIR}/bin:$PATH diff --git a/packages/kokkos/scripts/docker/Dockerfile.nvcc b/packages/kokkos/scripts/docker/Dockerfile.nvcc new file mode 100644 index 0000000000000000000000000000000000000000..e17accc0663980694821b8002b976277fcd9ca42 --- /dev/null +++ b/packages/kokkos/scripts/docker/Dockerfile.nvcc @@ -0,0 +1,31 @@ +ARG BASE=nvidia/cuda:9.2-devel +FROM $BASE + +ARG ADDITIONAL_PACKAGES + +RUN apt-get update && apt-get install -y \ + bc \ + git \ + wget \ + ccache \ + $ADDITIONAL_PACKAGES \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ARG CMAKE_VERSION=3.16.8 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet 
${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm cmake* +ENV PATH=${CMAKE_DIR}/bin:$PATH diff --git a/packages/kokkos/scripts/docker/Dockerfile.openmptarget b/packages/kokkos/scripts/docker/Dockerfile.openmptarget new file mode 100644 index 0000000000000000000000000000000000000000..b6efcb82cae1a8da1cf82e050bf4ad7b8a7870e4 --- /dev/null +++ b/packages/kokkos/scripts/docker/Dockerfile.openmptarget @@ -0,0 +1,67 @@ +ARG BASE=nvidia/cuda:11.1-devel-ubuntu20.04 +FROM $BASE + +RUN apt-get update && apt-get install -y \ + bc \ + git \ + wget \ + ccache \ + python3 \ + libelf-dev \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ARG NPROC=8 + +ARG CMAKE_VERSION=3.18.5 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver hkps.pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm ${CMAKE_SCRIPT} +ENV PATH=${CMAKE_DIR}/bin:$PATH + +ENV LLVM_DIR=/opt/llvm +RUN LLVM_VERSION=887c7660bdf3f300bd1997dcfd7ace91787c0584 && \ + LLVM_URL=https://github.com/llvm/llvm-project/archive &&\ + LLVM_ARCHIVE=${LLVM_VERSION}.tar.gz &&\ + SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ + wget --quiet 
${LLVM_URL}/${LLVM_ARCHIVE} && \ + mkdir llvm-project && \ + tar -xf ${LLVM_ARCHIVE} -C llvm-project --strip-components=1 && \ + cd llvm-project && \ + mkdir build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=$LLVM_DIR \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ + -DLLVM_ENABLE_PROJECTS="clang;libcxx;libcxxabi;openmp" \ + -DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_70 \ + -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=70 \ + ../llvm && \ + make -j${NPROC} && \ + make install && \ + rm -rf ../build/* && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=$LLVM_DIR \ + -DCMAKE_C_COMPILER=$LLVM_DIR/bin/clang \ + -DCMAKE_CXX_COMPILER=$LLVM_DIR/bin/clang++ \ + -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=70 \ + ../openmp && \ + make -j${NPROC} && \ + make install && \ + echo "${LLVM_DIR}/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig && \ + rm -rf ${SCRATCH_DIR} +ENV PATH=${LLVM_DIR}/bin:$PATH diff --git a/packages/kokkos/scripts/docker/Dockerfile.sycl b/packages/kokkos/scripts/docker/Dockerfile.sycl new file mode 100644 index 0000000000000000000000000000000000000000..fdcd6d01fb8e3158000aa1507bb5bfcf7e0d9b4e --- /dev/null +++ b/packages/kokkos/scripts/docker/Dockerfile.sycl @@ -0,0 +1,47 @@ +ARG BASE=nvidia/cuda:10.2-devel +FROM $BASE + +RUN apt-get update && apt-get install -y \ + bc \ + git \ + wget \ + ccache \ + ninja-build \ + python3 \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ARG CMAKE_VERSION=3.18.5 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify 
${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm cmake* +ENV PATH=${CMAKE_DIR}/bin:$PATH + +ENV SYCL_DIR=/opt/sycl +RUN SYCL_VERSION=20210311 && \ + SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \ + SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \ + SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ + wget --quiet ${SYCL_URL}/${SYCL_ARCHIVE} && \ + mkdir llvm && \ + tar -xf ${SYCL_ARCHIVE} -C llvm --strip-components=1 && \ + cd llvm && \ + python3 buildbot/configure.py --cuda && \ + python3 buildbot/compile.py && \ + mkdir -p ${SYCL_DIR} && \ + mv ${SCRATCH_DIR}/llvm/build/install/* ${SYCL_DIR} && \ + echo "${SYCL_DIR}/lib" > /etc/ld.so.conf.d/sycl.conf && ldconfig && \ + rm -rf ${SCRATCH_DIR} +ENV PATH=${SYCL_DIR}/bin:$PATH diff --git a/packages/kokkos/scripts/docker/check_format_cpp.sh b/packages/kokkos/scripts/docker/check_format_cpp.sh new file mode 100755 index 0000000000000000000000000000000000000000..c054c67a335f404613b0446a297e16d04cf510d0 --- /dev/null +++ b/packages/kokkos/scripts/docker/check_format_cpp.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +./scripts/apply-clang-format || exit $? +git diff --exit-code diff --git a/packages/kokkos/scripts/snapshot.py b/packages/kokkos/scripts/snapshot.py new file mode 100755 index 0000000000000000000000000000000000000000..b964e2b676a560fc3e17b5b9395143ec0ea312b0 --- /dev/null +++ b/packages/kokkos/scripts/snapshot.py @@ -0,0 +1,291 @@ +#! /usr/bin/env python + +""" +Snapshot a project into another project and perform the necessary repo actions +to provide a commit message that can be used to trace back to the exact point +in the source repository. +""" + +#todo: +# Support svn +# Allow renaming of the source dir in the destination path +# Check if a new snapshot is necessary? 
+# + +import sys + +#check the version number so that there is a good error message when argparse is not available. +#This checks for exactly 2.7 which is bad, but it is a python 2 script and argparse was introduced +#in 2.7 which is also the last version of python 2. If this script is updated for python 3 this +#will need to change, but for now it is not safe to allow 3.x to run this. +if sys.version_info[:2] != (2, 7): + print "Error snapshot requires python 2.7 detected version is %d.%d." % (sys.version_info[0], sys.version_info[1]) + sys.exit(1) + +import subprocess, argparse, re, doctest, os, datetime, traceback + +def parse_cmdline(description): + parser = argparse.ArgumentParser(usage="snapshot.py [options] source destination", description=description) + + parser.add_argument("-n", "--no-commit", action="store_false", dest="create_commit", default=True, + help="Do not perform a commit or create a commit message.") + parser.add_argument("-v", "--verbose", action="store_true", dest="verbose_mode", default=False, + help="Enable verbose mode.") + parser.add_argument("-d", "--debug", action="store_true", dest="debug_mode", default=False, + help="Enable debugging output.") + parser.add_argument("--no-validate-repo", action="store_true", dest="no_validate_repo", default=False, + help="Reduce the validation that the source and destination repos are clean to a warning.") + parser.add_argument("--source-repo", choices=["git","none"], default="", + help="Type of repository of the source, use none to skip all repository operations.") + parser.add_argument("--dest-repo", choices=["git","none"], default="", + help="Type of repository of the destination, use none to skip all repository operations.") + parser.add_argument("--small", action="store_true", dest="small_mode", + help="Don't include tests and other extra files when copying.") + + parser.add_argument("source", help="Source project to snapshot from.") + parser.add_argument("destination", help="Destination to 
snapshot too.") + + options = parser.parse_args() + options = validate_options(options) + return options +#end parseCmdline + +def validate_options(options): + apparent_source_repo_type="none" + apparent_dest_repo_type="none" + + #prevent user from accidentally giving us a path that rsync will treat differently than expected. + options.source = options.source.rstrip(os.sep) + options.destination = options.destination.rstrip(os.sep) + + options.source = os.path.abspath(options.source) + options.destination = os.path.abspath(options.destination) + + if os.path.exists(options.source): + apparent_source_repo_type, source_root = determine_repo_type(options.source) + else: + raise RuntimeError("Could not find source directory of %s." % options.source) + options.source_root = source_root + + if not os.path.exists(options.destination): + print "Could not find destination directory of %s so it will be created." % options.destination + os.makedirs(options.destination) + + apparent_dest_repo_type, dest_root = determine_repo_type(options.destination) + options.dest_root = dest_root + + #error on svn repo types for now + if apparent_source_repo_type == "svn" or apparent_dest_repo_type == "svn": + raise RuntimeError("SVN repositories are not supported at this time.") + + if options.source_repo == "": + #source repo type is not specified to just using the apparent type. + options.source_repo = apparent_source_repo_type + else: + if options.source_repo != "none" and options.source_repo != apparent_source_repo_type: + raise RuntimeError("Specified source repository type of %s conflicts with determined type of %s" % \ + (options.source_repo, apparent_source_repo_type)) + + if options.dest_repo == "": + #destination repo type is not specified to just using the apparent type. 
+ options.dest_repo = apparent_dest_repo_type + else: + if options.dest_repo != "none" and options.dest_repo != apparent_dest_repo_type: + raise RuntimeError("Specified destination repository type of %s conflicts with determined type of %s" % \ + (options.dest_repo, apparent_dest_repo_type)) + + return options +#end validate_options + +def run_cmd(cmd, options, working_dir="."): + cmd_str = " ".join(cmd) + if options.verbose_mode: + print "Running command '%s' in dir %s." % (cmd_str, working_dir) + + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=working_dir) + proc_stdout, proc_stderr = proc.communicate() + ret_val = proc.wait() + + if options.debug_mode: + print "==== %s stdout start ====" % cmd_str + print proc_stdout + print "==== %s stdout end ====" % cmd_str + print "==== %s stderr ====" % cmd_str + print proc_stderr + print "==== %s stderr ====" % cmd_str + + if ret_val != 0: + raise RuntimeError("Command '%s' failed with error code %d. Error message:%s%s%sstdout:%s" % \ + (cmd_str, ret_val, os.linesep, proc_stderr, os.linesep, proc_stdout)) + + return proc_stdout, proc_stderr +#end run_cmd + +def determine_repo_type(location): + apparent_repo_type = "none" + + while location != "": + if os.path.exists(os.path.join(location, ".git")): + apparent_repo_type = "git" + break + elif os.path.exists(os.path.join(location, ".svn")): + apparent_repo_type = "svn" + break + else: + location = location[:location.rfind(os.sep)] + + return apparent_repo_type, location +#end determine_repo_type + +def rsync(source, dest, options): + rsync_cmd = ["rsync", "-ar", "--delete"] + if options.debug_mode: + rsync_cmd.append("-v") + + if options.small_mode or options.source_repo == "git": + rsync_cmd.append("--delete-excluded") + + if options.small_mode: + rsync_cmd.append("--include=config/master_history.txt") + rsync_cmd.append("--include=cmake/tpls") + rsync_cmd.append("--exclude=benchmarks/") + rsync_cmd.append("--exclude=config/*") + 
rsync_cmd.append("--exclude=doc/") + rsync_cmd.append("--exclude=example/") + rsync_cmd.append("--exclude=tpls/") + rsync_cmd.append("--exclude=HOW_TO_SNAPSHOT") + rsync_cmd.append("--exclude=unit_test") + rsync_cmd.append("--exclude=unit_tests") + rsync_cmd.append("--exclude=perf_test") + rsync_cmd.append("--exclude=performance_tests") + + if options.source_repo == "git": + rsync_cmd.append("--exclude=.git*") + + rsync_cmd.append(options.source) + rsync_cmd.append(options.destination) + run_cmd(rsync_cmd, options) +#end rsync + +def create_commit_message(commit_id, commit_log, project_name, project_location): + eol = os.linesep + message = "Snapshot of %s from commit %s" % (project_name, commit_id) + message += eol * 2 + message += "From repository at %s" % project_location + message += eol * 2 + message += "At commit:" + eol + message += commit_log + return message +#end create_commit_message + +def find_git_commit_information(options): + r""" + >>> class fake_options: + ... source="." + ... verbose_mode=False + ... debug_mode=False + >>> myoptions = fake_options() + >>> find_git_commit_information(myoptions)[2:] + ('sems', 'software.sandia.gov:/git/sems') + """ + git_log_cmd = ["git", "log", "-1"] + + output, error = run_cmd(git_log_cmd, options, options.source) + + commit_match = re.match("commit ([0-9a-fA-F]+)", output) + commit_id = commit_match.group(1) + commit_log = output + + git_remote_cmd = ["git", "remote", "-v"] + output, error = run_cmd(git_remote_cmd, options, options.source) + + remote_match = re.search("origin\s([^ ]*/([^ ]+))", output, re.MULTILINE) + if not remote_match: + raise RuntimeError("Could not find origin of repo at %s. Consider using none for source repo type." 
% (options.source)) + + source_location = remote_match.group(1) + source_name = remote_match.group(2).strip() + + if source_name[-1] == "/": + source_name = source_name[:-1] + + return commit_id, commit_log, source_name, source_location +#end find_git_commit_information + +def do_git_commit(message, options): + if options.verbose_mode: + print "Committing to destination repository." + + git_add_cmd = ["git", "add", "-A"] + run_cmd(git_add_cmd, options, options.destination) + + git_commit_cmd = ["git", "commit", "-m%s" % message] + run_cmd(git_commit_cmd, options, options.destination) + + git_log_cmd = ["git", "log", "--format=%h", "-1"] + commit_sha1, error = run_cmd(git_log_cmd, options, options.destination) + + print "Commit %s was made to %s." % (commit_sha1.strip(), options.dest_root) +#end do_git_commit + +def verify_git_repo_clean(location, options): + git_status_cmd = ["git", "status", "--porcelain"] + output, error = run_cmd(git_status_cmd, options, location) + + if output != "": + if options.no_validate_repo == False: + raise RuntimeError("%s is not clean.%sPlease commit or stash all changes before running snapshot." + % (location, os.linesep)) + else: + print "WARNING: %s is not clean. Proceeding anyway." % location + print "WARNING: This could lead to differences in the source and destination." + print "WARNING: It could also lead to extra files being included in the snapshot commit." +#end verify_git_repo_clean + +def main(options): + if options.verbose_mode: + print "Snapshotting %s to %s." 
% (options.source, options.destination) + + if options.source_repo == "git": + verify_git_repo_clean(options.source, options) + commit_id, commit_log, repo_name, repo_location = find_git_commit_information(options) + elif options.source_repo == "none": + commit_id = "N/A" + commit_log = "Unknown commit from %s snapshotted at: %s" % (options.source, datetime.datetime.now()) + repo_name = options.source + repo_location = options.source + + commit_message = create_commit_message(commit_id, commit_log, repo_name, repo_location) + os.linesep*2 + + if options.dest_repo == "git": + verify_git_repo_clean(options.destination, options) + + rsync(options.source, options.destination, options) + + if options.dest_repo == "git": + do_git_commit(commit_message, options) + elif options.dest_repo == "none": + file_name = "snapshot_message.txt" + message_file = open(file_name, "w") + message_file.write(commit_message) + message_file.close() + cwd = os.getcwd() + print "No commit done by request. Please use file at:" + print "%s%sif you wish to commit this to a repo later." 
% (cwd+"/"+file_name, os.linesep) +#end main + +if (__name__ == "__main__"): + if ("--test" in sys.argv): + doctest.testmod() + sys.exit(0) + + try: + options = parse_cmdline(__doc__) + main(options) + except RuntimeError, e: + print "Error occurred:", e + if "--debug" in sys.argv: + traceback.print_exc() + sys.exit(1) + else: + sys.exit(0) diff --git a/packages/kokkos/scripts/spack_test/CMakeLists.txt b/packages/kokkos/scripts/spack_test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c28bd0b8eccff2487ae1388960bbbc6b8504a34 --- /dev/null +++ b/packages/kokkos/scripts/spack_test/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.16) +project(SpackTestGen) +set(TEST_LIST_DEF ${CMAKE_CURRENT_SOURCE_DIR}/test_list.def) +file(STRINGS ${TEST_LIST_DEF} TEST_FILES) + +#Copy test source to Spack test directory +foreach (TEST_FILE ${TEST_FILES}) + set(TEST_FILE_LOCATION ${SPACK_PACKAGE_SOURCE_DIR}/${TEST_FILE}) + file(COPY ${TEST_FILE_LOCATION} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/out) +endforeach() + +#Clean up names +foreach(TEST_FILE ${TEST_FILES} ) + string( REGEX REPLACE ".+\/" "" TEST_FILE ${TEST_FILE} ) + list(APPEND SRC_NAME_LIST ${TEST_FILE}) + string( REPLACE ".cpp" "" TEST_FILE ${TEST_FILE} ) + list(APPEND BIN_NAME_LIST ${TEST_FILE}) +endforeach() + +#Configure test cmake script and run script +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt.in ${CMAKE_CURRENT_SOURCE_DIR}/out/CMakeLists.txt @ONLY) diff --git a/packages/kokkos/scripts/spack_test/CMakeLists.txt.in b/packages/kokkos/scripts/spack_test/CMakeLists.txt.in new file mode 100644 index 0000000000000000000000000000000000000000..4a216df4aab7b326efc94866b0f943af7c42d29f --- /dev/null +++ b/packages/kokkos/scripts/spack_test/CMakeLists.txt.in @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 3.16) +project(kokkos_spack_test CXX) +find_package(Kokkos REQUIRED) + +set(SRC_NAME_LIST "@SRC_NAME_LIST@") +set(BIN_NAME_LIST "@BIN_NAME_LIST@") + 
+enable_testing() +list(LENGTH SRC_NAME_LIST LEN) +math(EXPR LEN "${LEN}-1") + +set(CMAKE_CXX_COMPILER ${Kokkos_CXX_COMPILER}) + +foreach (it RANGE ${LEN}) + list(GET SRC_NAME_LIST ${it} src) + list(GET BIN_NAME_LIST ${it} bin) + add_executable(${bin} ${src}) + target_link_libraries(${bin} Kokkos::kokkos) + add_test(NAME ${bin} COMMAND ${bin}) + set_tests_properties(${bin} PROPERTIES + LABELS "Kokkos" + PROCESSORS 1 + TIMEOUT 60) +endforeach() diff --git a/packages/kokkos/scripts/spack_test/test_list.def b/packages/kokkos/scripts/spack_test/test_list.def new file mode 100644 index 0000000000000000000000000000000000000000..8703ccb9854140245f5ff684b85eb32c6881b207 --- /dev/null +++ b/packages/kokkos/scripts/spack_test/test_list.def @@ -0,0 +1,4 @@ +example/tutorial/01_hello_world/hello_world.cpp +example/tutorial/02_simple_reduce/simple_reduce.cpp +example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp +example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp diff --git a/packages/kokkos/scripts/testing_scripts/README b/packages/kokkos/scripts/testing_scripts/README new file mode 100644 index 0000000000000000000000000000000000000000..455afffd840514e98686dadcd2c46a774590456c --- /dev/null +++ b/packages/kokkos/scripts/testing_scripts/README @@ -0,0 +1,5 @@ +jenkins_test_driver is designed to be run through Jenkins as a +multiconfiguration job. It relies on a number of environment variables that will +only be set when run in that context. It is possible to override these if you +know the Jenkins job setup. It is not recommended that a non-expert try to run +this script directly. 
diff --git a/packages/kokkos/scripts/testing_scripts/TestEXEC_TEST.cpp b/packages/kokkos/scripts/testing_scripts/TestEXEC_TEST.cpp new file mode 100644 index 0000000000000000000000000000000000000000..883e88b51b7dd6c3f116ea8731934db5b7dde72a --- /dev/null +++ b/packages/kokkos/scripts/testing_scripts/TestEXEC_TEST.cpp @@ -0,0 +1,47 @@ + +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <TestEXEC_Category.hpp> +#include <TestTEST.hpp> diff --git a/packages/kokkos/scripts/testing_scripts/cpy_test b/packages/kokkos/scripts/testing_scripts/cpy_test new file mode 100755 index 0000000000000000000000000000000000000000..e9a74a243417c063945ffc6f1744c1074e65b0be --- /dev/null +++ b/packages/kokkos/scripts/testing_scripts/cpy_test @@ -0,0 +1,9 @@ +exec=$1 +EXEC=$2 +TEST=$3 + +cp ../../scripts/testing_scripts/TestEXEC_TEST.cpp ${exec}/Test${EXEC}_${TEST}.cpp +sed -i 's|exec|'${exec}'|g' ${exec}/Test${EXEC}_${TEST}.cpp +sed -i 's|EXEC|'${EXEC}'|g' ${exec}/Test${EXEC}_${TEST}.cpp +sed -i 's|TEST|'${TEST}'|g' ${exec}/Test${EXEC}_${TEST}.cpp + diff --git a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash new file mode 100755 index 0000000000000000000000000000000000000000..f21124ed6e716844e876cf209ee2af5cb9a7dbbd --- /dev/null +++ b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash @@ -0,0 +1,488 @@ +#!/bin/bash + +KOKKOS_DEVICES="" + +KOKKOS_DO_EXAMPLES="1" + +while [[ $# > 0 ]] +do + key="$1" + + case $key in + --kokkos-path*) + KOKKOS_PATH="${key#*=}" + ;; + --hpx-path*) + HPX_PATH="${key#*=}" + ;; + --prefix*) + PREFIX="${key#*=}" + ;; + --with-cuda) + 
KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda" + CUDA_PATH_NVCC=$(command -v nvcc) + CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc} + ;; + # Catch this before '--with-cuda*' + --with-cuda-options*) + KOKKOS_CUDA_OPT="${key#*=}" + ;; + --with-cuda*) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda" + CUDA_PATH="${key#*=}" + ;; + --with-openmp) + KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP" + ;; + --with-pthread) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread" + ;; + --with-serial) + KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial" + ;; + --with-hpx-options*) + KOKKOS_HPX_OPT="${key#*=}" + ;; + --with-hpx*) + KOKKOS_DEVICES="${KOKKOS_DEVICES},HPX" + if [ -z "$HPX_PATH" ]; then + HPX_PATH="${key#*=}" + fi + ;; + --with-devices*) + DEVICES="${key#*=}" + KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}" + ;; + --with-gtest*) + GTEST_PATH="${key#*=}" + ;; + --with-hwloc*) + HWLOC_PATH="${key#*=}" + ;; + --with-memkind*) + MEMKIND_PATH="${key#*=}" + ;; + --arch*) + KOKKOS_ARCH="${key#*=}" + ;; + --cxxflags*) + CXXFLAGS="${key#*=}" + ;; + --cxxstandard*) + KOKKOS_CXX_STANDARD="${key#*=}" + ;; + --ldflags*) + LDFLAGS="${key#*=}" + ;; + --debug|-dbg) + KOKKOS_DEBUG=yes + ;; + --make-j*) + echo "Warning: ${key} is deprecated" + echo "Call make with appropriate -j flag" + ;; + --no-examples) + KOKKOS_DO_EXAMPLES="0" + ;; + --compiler*) + COMPILER="${key#*=}" + CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep -c "no ${COMPILER}") + if [ ${CNUM} -gt 0 ]; then + echo "Invalid compiler by --compiler command: '${COMPILER}'" + exit + fi + if [[ ! -n ${COMPILER} ]]; then + echo "Empty compiler specified by --compiler command." + exit + fi + CNUM=$(command -v ${COMPILER} | grep -c ${COMPILER}) + if [ ${CNUM} -eq 0 ]; then + echo "Invalid compiler by --compiler command: '${COMPILER}'" + exit + fi + # ... 
valid compiler, ensure absolute path set + WCOMPATH=$(command -v $COMPILER) + COMPDIR=$(dirname $WCOMPATH) + COMPNAME=$(basename $WCOMPATH) + COMPILER=${COMPDIR}/${COMPNAME} + ;; + --with-options*) + KOKKOS_OPT="${key#*=}" + ;; + --gcc-toolchain*) + KOKKOS_GCC_TOOLCHAIN="${key#*=}" + ;; + --help) + echo "Kokkos configure options:" + echo "" + echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory." + echo "--prefix=/Install/Path: Path to install the Kokkos library." + echo "" + echo "--with-cuda[=/Path/To/Cuda]: Enable Cuda and set path to Cuda Toolkit." + echo "--with-openmp: Enable OpenMP backend." + echo "--with-pthread: Enable Pthreads backend." + echo "--with-serial: Enable Serial backend." + echo "--with-devices: Explicitly add a set of backends." + echo "" + echo "--arch=[OPT]: Set target architectures. Options are:" + echo " [AMD]" + echo " AMDAVX = AMD CPU" + echo " ZEN = AMD Zen-Core CPU" + echo " ZEN2 = AMD Zen2-Core CPU" + echo " [ARM]" + echo " ARMv80 = ARMv8.0 Compatible CPU" + echo " ARMv81 = ARMv8.1 Compatible CPU" + echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" + echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" + echo " [IBM]" + echo " BGQ = IBM Blue Gene Q" + echo " Power7 = IBM POWER7 and POWER7+ CPUs" + echo " Power8 = IBM POWER8 CPUs" + echo " Power9 = IBM POWER9 CPUs" + echo " [Intel]" + echo " WSM = Intel Westmere CPUs" + echo " SNB = Intel Sandy/Ivy Bridge CPUs" + echo " HSW = Intel Haswell CPUs" + echo " BDW = Intel Broadwell Xeon E-class CPUs" + echo " SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)" + echo " [Intel Xeon Phi]" + echo " KNC = Intel Knights Corner Xeon Phi" + echo " KNL = Intel Knights Landing Xeon Phi" + echo " [NVIDIA]" + echo " Kepler30 = NVIDIA Kepler generation CC 3.0" + echo " Kepler32 = NVIDIA Kepler generation CC 3.2" + echo " Kepler35 = NVIDIA Kepler generation CC 3.5" + echo " Kepler37 = NVIDIA Kepler generation CC 3.7" + echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" + echo " Maxwell52 
= NVIDIA Maxwell generation CC 5.2" + echo " Maxwell53 = NVIDIA Maxwell generation CC 5.3" + echo " Pascal60 = NVIDIA Pascal generation CC 6.0" + echo " Pascal61 = NVIDIA Pascal generation CC 6.1" + echo " Volta70 = NVIDIA Volta generation CC 7.0" + echo " Volta72 = NVIDIA Volta generation CC 7.2" + echo "" + echo "--compiler=/Path/To/Compiler Set the compiler." + echo "--debug,-dbg: Enable Debugging." + echo "--cxxflags=[FLAGS] Overwrite CXXFLAGS for library build and test" + echo " build. This will still set certain required" + echo " flags via KOKKOS_CXXFLAGS (such as -fopenmp," + echo " -std=c++14, etc.)." + echo "--cxxstandard=[FLAGS] Overwrite KOKKOS_CXX_STANDARD for library build and test" + echo " c++14 (default), c++17, c++1y, c++1z, c++2a" + echo "--ldflags=[FLAGS] Overwrite LDFLAGS for library build and test" + echo " build. This will still set certain required" + echo " flags via KOKKOS_LDFLAGS (such as -fopenmp," + echo " -lpthread, etc.)." + echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" + echo " tests.)" + echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." + echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." + echo "--with-options=[OPT]: Additional options to Kokkos:" + echo " compiler_warnings" + echo " aggressive_vectorization = add ivdep on loops" + echo " disable_profiling = do not compile with profiling hooks" + echo " " + echo "--with-cuda-options=[OPT]: Additional options to CUDA:" + echo " force_uvm, use_ldg, enable_lambda, rdc" + echo "--with-hpx-options=[OPT]: Additional options to HPX:" + echo " enable_async_dispatch" + echo "--gcc-toolchain=/Path/To/GccRoot: Set the gcc toolchain to use with clang (e.g. /usr)" + echo "--make-j=[NUM]: DEPRECATED: call make with appropriate" + echo " -j flag" + exit 0 + ;; + *) + echo "warning: ignoring unknown option $key" + ;; + esac + + shift +done + +# Remove leading ',' from KOKKOS_DEVICES. 
+KOKKOS_DEVICES=$(echo $KOKKOS_DEVICES | sed 's/^,//') + +# If KOKKOS_PATH undefined, assume parent dir of this script is the KOKKOS_PATH. +if [ -z "$KOKKOS_PATH" ]; then + KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd ../../ && pwd ) +else + # Ensure KOKKOS_PATH is abs path + KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) +fi + +if [ "${KOKKOS_PATH}" = "${PWD}" ] || [ "${KOKKOS_PATH}" = "${PWD}/" ]; then + echo "Running generate_makefile.bash in the Kokkos root directory is not allowed" + exit +fi + +KOKKOS_SRC_PATH=${KOKKOS_PATH} + +KOKKOS_SETTINGS="KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH}" + +# The double [[ ]] in the elif branch is not a typo +if [ ${#COMPILER} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}" +elif + [ ${#COMPILER} -eq 0 ] && [[ ${KOKKOS_DEVICES} =~ .*Cuda.* ]]; then + COMPILER="${KOKKOS_PATH}/bin/nvcc_wrapper" + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}" +fi + +if [ ${#KOKKOS_DEVICES} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEVICES=${KOKKOS_DEVICES}" +fi + +if [ ${#KOKKOS_ARCH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_ARCH=${KOKKOS_ARCH}" +fi + +if [ ${#KOKKOS_DEBUG} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEBUG=${KOKKOS_DEBUG}" +fi + +if [ ${#CUDA_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CUDA_PATH=${CUDA_PATH}" +fi + +if [ ${#CXXFLAGS} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\"" +fi + +if [ ${#KOKKOS_CXX_STANDARD} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CXX_STANDARD=\"${KOKKOS_CXX_STANDARD}\"" +fi + +if [ ${#LDFLAGS} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\"" +fi + +if [ ${#GTEST_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}" +else + GTEST_PATH=${KOKKOS_PATH}/tpls/gtest + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}" +fi + +if [ ${#HWLOC_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} 
HWLOC_PATH=${HWLOC_PATH}" + KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" +fi + +if [ ${#MEMKIND_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" + KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" +fi + +if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" +fi + +if [ ${#HPX_PATH} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} HPX_PATH=${HPX_PATH}" +fi + +if [ ${#KOKKOS_OPT} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_OPTIONS=${KOKKOS_OPT}" +fi + +if [ ${#KOKKOS_CUDA_OPT} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPT}" +fi + +if [ ${#KOKKOS_HPX_OPT} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_HPX_OPTIONS=${KOKKOS_HPX_OPT}" +fi + +if [ ${#KOKKOS_GCC_TOOLCHAIN} -gt 0 ]; then + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_INTERNAL_GCC_TOOLCHAIN=${KOKKOS_GCC_TOOLCHAIN}" +fi + +KOKKOS_SETTINGS_NO_KOKKOS_PATH="${KOKKOS_SETTINGS}" + + +gen_makefile=Makefile.kokkos +echo "#Makefile to satisfy existence of target kokkos-clean before installing the library" > install/${gen_makefile} +echo "kokkos-clean:" >> install/${gen_makefile} +echo "" >> install/${gen_makefile} +mkdir -p core +mkdir -p core/unit_test +mkdir -p core/perf_test +mkdir -p containers +mkdir -p containers/unit_tests +mkdir -p containers/performance_tests +mkdir -p algorithms +mkdir -p algorithms/unit_tests +mkdir -p algorithms/performance_tests +mkdir -p example +mkdir -p example/fixture +mkdir -p example/feint +mkdir -p example/fenl +mkdir -p example/make_buildlink +mkdir -p example/tutorial + +KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" + +# Generate subdirectory makefiles. 
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/unit_test/Makefile +echo "" >> core/unit_test/Makefile +echo "all:" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS}" >> core/unit_test/Makefile +echo "" >> core/unit_test/Makefile +echo "test: all" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} test" >> core/unit_test/Makefile +echo "" >> core/unit_test/Makefile +echo "clean:" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/unit_test/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/perf_test/Makefile +echo "" >> core/perf_test/Makefile +echo "all:" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS}" >> core/perf_test/Makefile +echo "" >> core/perf_test/Makefile +echo "test: all" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} test" >> core/perf_test/Makefile +echo "" >> core/perf_test/Makefile +echo "clean:" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/perf_test/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/unit_tests/Makefile +echo "" >> containers/unit_tests/Makefile +echo "all:" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/unit_tests/Makefile +echo "" >> containers/unit_tests/Makefile +echo "test: all" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/unit_tests/Makefile +echo "" >> containers/unit_tests/Makefile +echo "clean:" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> 
containers/unit_tests/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/performance_tests/Makefile +echo "" >> containers/performance_tests/Makefile +echo "all:" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/performance_tests/Makefile +echo "" >> containers/performance_tests/Makefile +echo "test: all" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/performance_tests/Makefile +echo "" >> containers/performance_tests/Makefile +echo "clean:" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/performance_tests/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > algorithms/unit_tests/Makefile +echo "" >> algorithms/unit_tests/Makefile +echo "all:" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> algorithms/unit_tests/Makefile +echo "" >> algorithms/unit_tests/Makefile +echo "test: all" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> algorithms/unit_tests/Makefile +echo "" >> algorithms/unit_tests/Makefile +echo "clean:" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> algorithms/unit_tests/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fixture/Makefile +echo "" >> example/fixture/Makefile +echo "all:" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS}" >> example/fixture/Makefile +echo "" >> example/fixture/Makefile +echo "test: all" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f 
${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} test" >> example/fixture/Makefile +echo "" >> example/fixture/Makefile +echo "clean:" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} clean" >> example/fixture/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/feint/Makefile +echo "" >> example/feint/Makefile +echo "all:" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS}" >> example/feint/Makefile +echo "" >> example/feint/Makefile +echo "test: all" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} test" >> example/feint/Makefile +echo "" >> example/feint/Makefile +echo "clean:" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} clean" >> example/feint/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fenl/Makefile +echo "" >> example/fenl/Makefile +echo "all:" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS}" >> example/fenl/Makefile +echo "" >> example/fenl/Makefile +echo "test: all" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} test" >> example/fenl/Makefile +echo "" >> example/fenl/Makefile +echo "clean:" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} clean" >> example/fenl/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/make_buildlink/Makefile +echo "" >> example/make_buildlink/Makefile +echo "build:" >> example/make_buildlink/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/make_buildlink/Makefile ${KOKKOS_SETTINGS} build" >> example/make_buildlink/Makefile +echo "" >> example/make_buildlink/Makefile +echo "test: build" >> example/make_buildlink/Makefile +echo -e "\t\$(MAKE) -f 
${KOKKOS_PATH}/example/make_buildlink/Makefile ${KOKKOS_SETTINGS} test" >> example/make_buildlink/Makefile +echo "" >> example/make_buildlink/Makefile +echo "clean:" >> example/make_buildlink/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/make_buildlink/Makefile ${KOKKOS_SETTINGS} clean" >> example/make_buildlink/Makefile + +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/tutorial/Makefile +echo "" >> example/tutorial/Makefile +echo "build:" >> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} build">> example/tutorial/Makefile +echo "" >> example/tutorial/Makefile +echo "test: build" >> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} test" >> example/tutorial/Makefile +echo "" >> example/tutorial/Makefile +echo "clean:" >> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} clean" >> example/tutorial/Makefile + +# Generate top level directory makefile. 
+echo "Generating Makefiles with options " ${KOKKOS_SETTINGS} +echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > Makefile +echo "" >> Makefile +echo "build-test:" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests" >> Makefile +if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then +$() +echo -e "\t\$(MAKE) -C example/fixture" >> Makefile +echo -e "\t\$(MAKE) -C example/feint" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl" >> Makefile +echo -e "\t\$(MAKE) -C example/make_buildlink build" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial build" >> Makefile +fi +echo "" >> Makefile +echo "test: build-test" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile +if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then +echo -e "\t\$(MAKE) -C example/fixture test" >> Makefile +echo -e "\t\$(MAKE) -C example/feint test" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl test" >> Makefile +echo -e "\t\$(MAKE) -C example/make_buildlink test" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial test" >> Makefile +fi +echo "" >> Makefile +echo "unit-tests-only:" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile +echo "" >> Makefile + +echo "clean:" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test clean" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test clean" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C 
containers/performance_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests clean" >> Makefile +if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then +echo -e "\t\$(MAKE) -C example/fixture clean" >> Makefile +echo -e "\t\$(MAKE) -C example/feint clean" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl clean" >> Makefile +echo -e "\t\$(MAKE) -C example/make_buildlink clean" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial clean" >> Makefile +fi + diff --git a/packages/kokkos/scripts/testing_scripts/gnu_test_all_sandia b/packages/kokkos/scripts/testing_scripts/gnu_test_all_sandia new file mode 100755 index 0000000000000000000000000000000000000000..b2a0677e12fc14bc1b9ecc834e961abee5543efe --- /dev/null +++ b/packages/kokkos/scripts/testing_scripts/gnu_test_all_sandia @@ -0,0 +1,887 @@ +#!/bin/bash -e + +# +# Global config +# + +set -o pipefail + +# Determine current machine. + +MACHINE="" +HOSTNAME=$(hostname) +PROCESSOR=`uname -p` + +if [[ "$HOSTNAME" =~ (white|ride).* ]]; then + MACHINE=white + module load git +fi + +if [[ "$HOSTNAME" =~ .*bowman.* ]]; then + MACHINE=bowman + module load git +fi + +if [[ "$HOSTNAME" == *blake* ]]; then # Warning: very generic name + MACHINE=blake + module load git +fi + +if [[ "$HOSTNAME" == apollo\.* ]]; then + MACHINE=apollo +fi + +if [[ "$HOSTNAME" == kokkos-dev-2* ]]; then + MACHINE=kokkos-dev-2 +fi + +if [[ "$HOSTNAME" == may* ]]; then + MACHINE=mayer +# module load git +fi + +if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name + MACHINE=mayer +fi + +if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then + MACHINE=kokkos-dev +fi + +if [ ! 
-z "$SEMS_MODULEFILES_ROOT" ]; then + if [[ "$MACHINE" = "" ]]; then + MACHINE=sems + module load sems-git + fi +fi + +if [[ "$MACHINE" = "" ]]; then + echo "Unrecognized machine" >&2 + exit 1 +fi + +echo "Running on machine: $MACHINE" + +GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" +IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" +ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" +INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" +CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" +CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" +CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" + +GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" +IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" +CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +#CUDA_WARNING_FLAGS="-Wunused-parameter,-Wall,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" +PGI_WARNING_FLAGS="" + +# Default. Machine specific can override. +DEBUG=False +ARGS="" +CUSTOM_BUILD_LIST="" +DRYRUN=False +BUILD_ONLY=False +declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1 +TEST_SCRIPT=False +SKIP_HWLOC=False +SPOT_CHECK=False + +PRINT_HELP=False +OPT_FLAG="" +CXX_FLAGS_EXTRA="" +LD_FLAGS_EXTRA="" +KOKKOS_OPTIONS="" + +CXX_STANDARD="c++14" + +# +# Handle arguments. 
+# + +while [[ $# > 0 ]] +do + key="$1" + + case $key in + --kokkos-path*) + KOKKOS_PATH="${key#*=}" + ;; + --build-list*) + CUSTOM_BUILD_LIST="${key#*=}" + ;; + --debug*) + DEBUG=True + ;; + --build-only*) + BUILD_ONLY=True + ;; + --test-script*) + TEST_SCRIPT=True + ;; + --skip-hwloc*) + SKIP_HWLOC=True + ;; + --num*) + NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" + ;; + --dry-run*) + DRYRUN=True + ;; + --spot-check*) + SPOT_CHECK=True + ;; + --arch*) + ARCH_FLAG="--arch=${key#*=}" + ;; + --opt-flag*) + OPT_FLAG="${key#*=}" + ;; + --with-cuda-options*) + KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" + ;; + --with-options*) + KOKKOS_OPTIONS="--with-options=${key#*=}" + ;; + --cxxflags-extra*) + CXX_FLAGS_EXTRA="${key#*=}" + ;; + --cxxstandard*) + CXX_STANDARD="${key#*=}" + ;; + --ldflags-extra*) + LD_FLAGS_EXTRA="${key#*=}" + ;; + --help*) + PRINT_HELP=True + ;; + *) + # args, just append + ARGS="$ARGS $1" + ;; + esac + + shift +done + +SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd ../.. && pwd ) + +# Set kokkos path. +if [ -z "$KOKKOS_PATH" ]; then + KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT +else + # Ensure KOKKOS_PATH is abs path. + KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) +fi + +UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null` +if ! [ -z "$UNCOMMITTED" ]; then + echo "WARNING!! THE FOLLOWING CHANGES ARE UNCOMMITTED!! :" + echo "$UNCOMMITTED" + echo "" +fi + +GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline` +echo "Repository Status: " ${GITSTATUS} +echo "" +echo "" + +# +# Machine specific config. 
+# + +if [ "$MACHINE" = "sems" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + + # On unnamed sems machines, assume more restricted rhel7 environment + # On rhel7 sems machines gcc/7.3.0, clang/4.0.1, and intel/16.0.3 are missing + # Remove kokkos-env module use + + BASE_MODULE_LIST="sems-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + CUDA9_MODULE_LIST="sems-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0" + SKIP_HWLOC=True + # No sems hwloc module + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="" + fi + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.9.0 $BASE_MODULE_LIST
$CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi +elif [ "$MACHINE" = "kokkos-dev" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + + BASE_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="" + fi + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" + "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "clang/4.0.1 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/4.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + ) + fi +elif [ "$MACHINE" = "white" ]; then + source /etc/profile.d/modules.sh + 
SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" + IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0" + CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.0" + CUDA10_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.0" + + # Don't do pthread on white. + GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=Power8,Kepler37" + fi + +elif [ "$MACHINE" = "bowman" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" + + OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.2.174 $BASE_MODULE_LIST 
$INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.2.199 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ) + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=KNL" + fi + +elif [ "$MACHINE" = "mayer" ]; then + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=96 + + BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" +# ARM_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gnu7/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "arm/19.2 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS") + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=ARMv8-TX2" + fi + +elif [ "$MACHINE" = "blake" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" + BASE_MODULE_LIST_INTEL="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" + + if [ "$SPOT_CHECK" = "True" ]; then + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" + ) + else + COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.5.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" + ) + + fi + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=SKX" + fi + +elif [ "$MACHINE" = "apollo" ]; then + source 
/projects/sems/modulefiles/utils/sems-modules-init.sh + module use /home/projects/modulefiles/local/x86-64 + module load kokkos-env + + module load sems-git + module load sems-tex + module load sems-cmake/3.5.2 + module load sems-gdb + module load binutils + + SKIP_HWLOC=True + + GCC_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + NONGCC_MODULE_LIST="sems-env,kokkos-env,sems-gcc/5.3.0,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base" + CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" + CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" + CUDA10_MODULE_LIST="sems-env,kokkos-env,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" + + CLANG_MODULE_LIST="sems-env,kokkos-env,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.0.69" + CLANG7_MODULE_LIST="sems-env,kokkos-env,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.1" + NVCC_MODULE_LIST="sems-env,kokkos-env,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0" + HPX_MODULE_LIST="sems-env,kokkos-env,hpx/1.2.1,sems-gcc/6.1.0,binutils" + + BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" + BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" + BUILD_LIST_CLANG="Serial,Pthread,OpenMP" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/4.8.4 $GCC_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $GCC_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/16.0.1 $NONGCC_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "clang/3.9.0 $NONGCC_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" + "cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "hpx/1.2.1 $HPX_MODULE_LIST "HPX" g++ 
$PGI_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.0 $CUDA10_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" + "clang/7.0 $CLANG7_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" + "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + "gcc/4.8.4 $GCC_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.3 $GCC_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $GCC_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $GCC_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/15.0.2 $NONGCC_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.1 $NONGCC_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.1 $NONGCC_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/3.5.2 $NONGCC_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/3.6.1 $NONGCC_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=SNB,Volta70" + fi + +elif [ "$MACHINE" = "kokkos-dev-2" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + module use /home/projects/x86-64/modulefiles/local + module purge + module load sems-env + module load kokkos-env + + module load sems-git + module load sems-tex + module load sems-cmake/3.12.2 + module load sems-gdb + + SKIP_HWLOC=True + + BASE_MODULE_LIST="sems-env,kokkos-env,sems-cmake/3.12.2,kokkos-hwloc/1.10.1/base,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + GCC91_MODULE_LIST="sems-env,kokkos-env,sems-cmake/3.12.2,kokkos-hwloc/1.10.1/base,<COMPILER_NAME>/<COMPILER_VERSION>" + 
NVCC_MODULE_LIST="sems-env,kokkos-env,sems-cmake/3.12.2,kokkos-hwloc/1.10.1/base,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0" + + CLANG_MODULE_LIST="sems-env,kokkos-env,sems-cmake/3.12.2,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/6.1.0" + CLANG8_MODULE_LIST="sems-env,kokkos-env,sems-cmake/3.12.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/10.0" + PGI_MODULE_LIST="sems-env,kokkos-env,sems-cmake/3.12.2,sems-gcc/7.3.0,<COMPILER_NAME>/<COMPILER_VERSION>" + + BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread" + BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP" + BUILD_LIST_CLANG="Serial,Pthread,OpenMP" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" + "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "cuda/10.1 $NVCC_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("cuda/10.0 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ 
$GCC_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.1 $GCC91_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS" + "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/7.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "pgi/19.4 $PGI_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=SNB,Volta70" + fi + +else + echo "Unhandled machine $MACHINE" >&2 + exit 1 +fi + +export OMP_NUM_THREADS=8 +export OMP_PROC_BIND=spread +export OMP_PLACES=cores + +declare -i NUM_RESULTS_TO_KEEP=7 + +RESULT_ROOT_PREFIX=TestAll + +if [ "$PRINT_HELP" = "True" ]; then + echo "test_all_sandia <ARGS> <OPTIONS>:" + echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" + echo " Defaults to root repo containing this script" + echo "--debug: Run tests in debug. 
Defaults to False" + echo "--test-script: Test this script, not Kokkos" + echo "--skip-hwloc: Do not do hwloc tests" + echo "--num=N: Number of jobs to run in parallel" + echo "--spot-check: Minimal test set to issue pull request" + echo "--dry-run: Just print what would be executed" + echo "--build-only: Just do builds, don't run anything" + echo "--opt-flag=FLAG: Optimization flag (default: -O3)" + echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS" + echo "--cxxstandard=OPT: c++14 (default), c++17, c++1y, c++1z, c++2a" + echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS" + echo "--arch=ARCHITECTURE: overwrite architecture flags" + echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" + echo "--build-list=BUILD,BUILD,BUILD..." + echo " Provide a comma-separated list of builds instead of running all builds" + echo " Valid items:" + echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" + echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" + echo "" + + echo "ARGS: list of expressions matching compilers to test" + echo " supported compilers sems" + for COMPILER_DATA in "${COMPILERS[@]}"; do + ARR=($COMPILER_DATA) + COMPILER=${ARR[0]} + echo " $COMPILER" + done + echo "" + + echo "Examples:" + echo " Run all tests" + echo " % test_all_sandia" + echo "" + echo " Run all gcc tests" + echo " % test_all_sandia gcc" + echo "" + echo " Run all gcc/4.8.4 and all intel tests" + echo " % test_all_sandia gcc/4.8.4 intel" + echo "" + echo " Run all tests in debug" + echo " % test_all_sandia --debug" + echo "" + echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" + echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" + echo "" + echo "If you want to kill the tests, do:" + echo " hit ctrl-z" + echo " % kill -9 %1" + echo + exit 0 +fi + +# Set build type. +if [ "$DEBUG" = "True" ]; then + BUILD_TYPE=debug +else + BUILD_TYPE=release +fi + +# If no args provided, do all compilers. +if [ -z "$ARGS" ]; then + ARGS='?' 
+fi + +# Process args to figure out which compilers to test. +COMPILERS_TO_TEST="" + +for ARG in $ARGS; do + for COMPILER_DATA in "${COMPILERS[@]}"; do + ARR=($COMPILER_DATA) + COMPILER=${ARR[0]} + + if [[ "$COMPILER" = $ARG* ]]; then + if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then + COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER" + else + echo "Tried to add $COMPILER twice" + fi + fi + done +done + +# +# Functions. +# + +# get_compiler_name <COMPILER> +get_compiler_name() { + echo $1 | cut -d/ -f1 +} + +# get_compiler_version <COMPILER> +get_compiler_version() { + echo $1 | cut -d/ -f2 +} + +# Do not call directly. +get_compiler_data() { + local compiler=$1 + local item=$2 + local compiler_name=$(get_compiler_name $compiler) + local compiler_vers=$(get_compiler_version $compiler) + + local compiler_data + for compiler_data in "${COMPILERS[@]}" ; do + local arr=($compiler_data) + + if [ "$compiler" = "${arr[0]}" ]; then + echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g" + return 0 + fi + done + + # Not found. + echo "Unreconized compiler $compiler" >&2 + exit 1 +} + +# +# For all getters, usage: <GETTER> <COMPILER> +# + +get_compiler_modules() { + get_compiler_data $1 1 +} + +get_compiler_build_list() { + get_compiler_data $1 2 +} + +get_compiler_exe_name() { + get_compiler_data $1 3 +} + +get_compiler_warning_flags() { + get_compiler_data $1 4 +} + +run_cmd() { + echo "RUNNING: $*" + if [ "$DRYRUN" != "True" ]; then + eval "$* 2>&1" + fi +} + +# report_and_log_test_results <SUCCESS> <DESC> <COMMENT> +report_and_log_test_result() { + # Use sane var names. + local success=$1; local desc=$2; local comment=$3; + + if [ "$success" = "0" ]; then + echo " PASSED $desc" + echo $comment > $PASSED_DIR/$desc + else + # For failures, comment should be the name of the phase that failed. 
+ echo " FAILED $desc" >&2 + echo $comment > $FAILED_DIR/$desc + cat ${desc}.${comment}.log + fi +} + +setup_env() { + local compiler=$1 + local compiler_modules=$(get_compiler_modules $compiler) + + module purge + + local mod + for mod in $compiler_modules; do + echo "Loading module $mod" + module load $mod 2>&1 + # It is ridiculously hard to check for the success of a loaded + # module. Module does not return error codes and piping to grep + # causes module to run in a subshell. + module list 2>&1 | grep "$mod" >& /dev/null || return 1 + done + + return 0 +} + +# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE> +single_build_and_test() { + # Use sane var names. + local compiler=$1; local build=$2; local build_type=$3; + + # Set up env. + mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" + cd $ROOT_DIR/$compiler/"${build}-$build_type" + local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g') + setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + + # Set up flags. 
+ local compiler_warning_flags=$(get_compiler_warning_flags $compiler) + local compiler_exe=$(get_compiler_exe_name $compiler) + + if [[ "$build_type" = hwloc* ]]; then + local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info))) + fi + + if [[ "$OPT_FLAG" = "" ]]; then + OPT_FLAG="-O3" + fi + + if [[ "$build_type" = *debug* ]]; then + local extra_args="$extra_args --debug" + local cxxflags="-g $compiler_warning_flags" + local ldflags="-g" + else + local cxxflags="$OPT_FLAG $compiler_warning_flags" + local ldflags="${OPT_FLAG}" + fi + + local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" + local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" + + local cxx_standard="${CXX_STANDARD}" + + if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then + local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" + fi + if [[ "$KOKKOS_OPTIONS" != "" ]]; then + local extra_args="$extra_args $KOKKOS_OPTIONS" + else + local extra_args="$extra_args --with-options=enable_large_mem_tests" + fi + + echo " Starting job $desc" + + local comment="no_comment" + + if [ "$TEST_SCRIPT" = "True" ]; then + local rand=$[ 1 + $[ RANDOM % 10 ]] + sleep $rand + + if [ $rand -gt 5 ]; then + run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } + fi + else + run_cmd ${KOKKOS_PATH}/scripts/testing_scripts/gnu_generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + local -i build_start_time=$(date +%s) + run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } + local -i build_end_time=$(date +%s) + comment="build_time=$(($build_end_time-$build_start_time))" + + if [[ "$BUILD_ONLY" == False ]]; then + run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && 
return 0; }
      local -i run_end_time=$(date +%s)
      comment="$comment run_time=$(($run_end_time-$build_end_time))"
    fi
  fi

  report_and_log_test_result 0 $desc "$comment"

  return 0
}

# wait_for_jobs <NUM-JOBS>
# Block until fewer than NUM-JOBS background jobs are active.
wait_for_jobs() {
  local -i max_jobs=$1
  local -i num_active_jobs=$(jobs | wc -l)
  while [ $num_active_jobs -ge $max_jobs ]
  do
    sleep 1
    num_active_jobs=$(jobs | wc -l)
    # Running `jobs` reaps finished jobs so the count stays accurate.
    jobs >& /dev/null
  done
}

# run_in_background <COMPILER> <BUILD> <BUILD_TYPE>
# Launch one build+test as a background job, first waiting for a free
# parallel-job slot.  cuda* and clang builds are serialized (one at a
# time) because they are the most resource-hungry.
run_in_background() {
  local compiler=$1

  local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
  # Don't override command line input.
  # if [[ "$BUILD_ONLY" == True ]]; then
  #   num_jobs=8
  # else
    if [[ "$compiler" == cuda* ]]; then
      num_jobs=1
    fi
    if [[ "$compiler" == clang ]]; then
      num_jobs=1
    fi
  # fi
  wait_for_jobs $num_jobs

  # Forward the arguments with "$@" (not unquoted $*) so each argument
  # is passed through as a separate, unsplit word.
  single_build_and_test "$@" &
}

# build_and_test_all <COMPILER>
# Queue every configured build for COMPILER (or the user-supplied
# CUSTOM_BUILD_LIST), plus a hwloc variant for non-cuda compilers.
build_and_test_all() {
  # Get compiler data.
  local compiler=$1
  if [ -z "$CUSTOM_BUILD_LIST" ]; then
    local compiler_build_list=$(get_compiler_build_list $compiler)
  else
    local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
  fi

  # Do builds.
  local build
  for build in $compiler_build_list
  do
    run_in_background $compiler $build $BUILD_TYPE

    # If not cuda, do a hwloc test too.
    if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
      run_in_background $compiler $build "hwloc-$BUILD_TYPE"
    fi
  done

  return 0
}

# Print the (new, timestamped) results directory for this run, pruning
# the oldest existing result directories so that at most
# NUM_RESULTS_TO_KEEP remain.
get_test_root_dir() {
  local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
  local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
  local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP}

  if [ $num_to_delete -gt 0 ]; then
    /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete)
  fi

  echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S")
}

# Wait for all outstanding jobs, print pass/fail summaries, and exit
# with the number of failed tests as the status code (0 == all passed).
wait_summarize_and_exit() {
  wait_for_jobs 1

  echo "#######################################################"
  echo "PASSED TESTS"
  echo "#######################################################"

  local passed_test
  for passed_test in $(\ls -1 $PASSED_DIR | sort)
  do
    echo $passed_test $(cat $PASSED_DIR/$passed_test)
  done

  local -i rv=0
  if [ "$(ls -A $FAILED_DIR)" ]; then
    echo "#######################################################"
    echo "FAILED TESTS"
    echo "#######################################################"

    local failed_test
    for failed_test in $(\ls -1 $FAILED_DIR | sort)
    do
      echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
      rv=$rv+1
    done
  fi

  exit $rv
}

#
# Main.
+# + +ROOT_DIR=$(get_test_root_dir) +mkdir -p $ROOT_DIR +cd $ROOT_DIR + +PASSED_DIR=$ROOT_DIR/results/passed +FAILED_DIR=$ROOT_DIR/results/failed +mkdir -p $PASSED_DIR +mkdir -p $FAILED_DIR + +echo "Going to test compilers: " $COMPILERS_TO_TEST +for COMPILER in $COMPILERS_TO_TEST; do + echo "Testing compiler $COMPILER" + build_and_test_all $COMPILER +done + +wait_summarize_and_exit diff --git a/packages/kokkos/scripts/testing_scripts/jenkins_test_driver b/packages/kokkos/scripts/testing_scripts/jenkins_test_driver new file mode 100755 index 0000000000000000000000000000000000000000..f393940304ee8e679440871414376283f8eef9a7 --- /dev/null +++ b/packages/kokkos/scripts/testing_scripts/jenkins_test_driver @@ -0,0 +1,83 @@ +#!/bin/bash -x + +echo "Building for BUILD_TYPE = ${BUILD_TYPE}" +echo "Building with HOST_COMPILER = ${HOST_COMPILER}" +echo "Building in ${WORKSPACE}" + +module use /home/projects/modulefiles + +BUILD_TYPE=`echo $BUILD_TYPE | tr "~" " "` +build_options="" +for item in ${BUILD_TYPE}; do + build_options="$build_options --with-$item" +done + +kokkos_path=${WORKSPACE}/kokkos +gtest_path=${WORKSPACE}/kokkos/tpls/gtest + +echo ${WORKSPACE} +pwd + +#extract information from the provided parameters. +host_compiler_brand=`echo $HOST_COMPILER | grep -o "^[a-zA-Z]*"` +cuda_compiler=`echo $BUILD_TYPE | grep -o "cuda_[^ ]*"` + +host_compiler_module=`echo $HOST_COMPILER | tr "_" "/"` +cuda_compiler_module=`echo $cuda_compiler | tr "_" "/"` +build_path=`echo $BUILD_TYPE | tr " " "_"` + +module load $host_compiler_module +module load $cuda_compiler_module + +case $host_compiler_brand in + gcc) + module load nvcc-wrapper/gnu + compiler=g++ + ;; + intel) + module load nvcc-wrapper/intel + compiler=icpc + ;; + *) + echo "Unrecognized compiler brand." + exit 1 + ;; +esac + +#if cuda is on we need to set the host compiler for the +#nvcc wrapper and make the wrapper the compiler. 
+if [ $cuda_compiler != "" ]; then + export NVCC_WRAPPER_DEFAULT_COMPILER=$compiler + compiler=$kokkos_path/bin/nvcc_wrapper +fi + +if [ $host_compiler_brand == "intel" -a $cuda_compiler != "" ]; then + echo "Intel compilers are not supported with cuda at this time." + exit 0 +fi + +rm -rf test-$build_path +mkdir test-$build_path +cd test-$build_path + +/bin/bash $kokkos_path/generate_makefile.bash $build_options --kokkos-path="$kokkos_path" --with-gtest="$gtest_path" --compiler=$compiler 2>&1 |tee configure.out + +if [ ${PIPESTATUS[0]} != 0 ]; then + echo "Configure failed." + exit 1 +fi + +make build-test 2>&1 | tee build.log + +if [ ${PIPESTATUS[0]} != 0 ]; then + echo "Build failed." + exit 1 +fi + +make test 2>&1 | tee test.log + +grep "FAIL" test.log +if [ $? == 0 ]; then + echo "Tests failed." + exit 1 +fi diff --git a/packages/kokkos/scripts/testing_scripts/obj_size_opt_check b/packages/kokkos/scripts/testing_scripts/obj_size_opt_check new file mode 100755 index 0000000000000000000000000000000000000000..47c84d1a92a8a288115ecf0d416d57b349fb69b4 --- /dev/null +++ b/packages/kokkos/scripts/testing_scripts/obj_size_opt_check @@ -0,0 +1,287 @@ +#! /usr/bin/env python + +""" +Compute the size at which the current compiler will start to +significantly scale back optimization. + +The CPP file being modified will need the following tags. +// JGF_DUPLICATE_BEGIN - Put before start of function to duplicate +// JGF_DUPLICATE_END - Put after end of function to duplcate +// JGF_DUPE function_name(args); - Put anywhere where it's legal to +put a function call but not in your timing section. 

The program will need to output the string:
FOM: <number>
This will represent the program's performance
"""

import argparse, sys, os, doctest, subprocess, re, time

VERBOSE = False

###############################################################################
def parse_command_line(args, description):
###############################################################################
    # Parse an argv-style list and return the tuple
    # (cppfile, buildcmd, execmd, start, end, repeat, template, csv).
    # Side effect: sets the module-global VERBOSE when -v/--verbose is given.
    parser = argparse.ArgumentParser(
        usage="""\n%s <cppfile> <build-command> <run-command> [--verbose]
OR
%s --help
OR
%s --test

\033[1mEXAMPLES:\033[0m
    > %s foo.cpp 'make -j4' foo
""" % ((os.path.basename(args[0]), ) * 4),

description=description,

formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

    parser.add_argument("cppfile", help="Name of file to modify.")

    parser.add_argument("buildcmd", help="Build command")

    parser.add_argument("execmd", help="Run command")

    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print extra information")

    parser.add_argument("-s", "--start", type=int, default=1,
                        help="Starting number of dupes")

    parser.add_argument("-e", "--end", type=int, default=1000,
                        help="Ending number of dupes")

    parser.add_argument("-n", "--repeat", type=int, default=10,
                        help="Number of times to repeat an individial execution. Best value will be taken.")

    parser.add_argument("-t", "--template", action="store_true",
                        help="Use templating instead of source copying to increase object size")

    parser.add_argument("-c", "--csv", action="store_true",
                        help="Print results as CSV")

    args = parser.parse_args(args[1:])

    if (args.verbose):
        global VERBOSE
        VERBOSE = True

    return args.cppfile, args.buildcmd, args.execmd, args.start, args.end, args.repeat, args.template, args.csv

###############################################################################
def verbose_print(msg, override=None):
###############################################################################
    # Print msg when verbose mode is on.  `override` forces printing when
    # True and suppresses it when False, regardless of the global setting.
    if ( (VERBOSE and not override is False) or override):
        print msg

###############################################################################
def error_print(msg):
###############################################################################
    # Print msg to stderr.
    print >> sys.stderr, msg

###############################################################################
def expect(condition, error_msg):
###############################################################################
    """
    Similar to assert except doesn't generate an ugly stacktrace. Useful for
    checking user error, not programming error.
    """
    if (not condition):
        raise SystemExit("FAIL: %s" % error_msg)

###############################################################################
def run_cmd(cmd, ok_to_fail=False, input_str=None, from_dir=None, verbose=None,
            arg_stdout=subprocess.PIPE, arg_stderr=subprocess.PIPE):
###############################################################################
    # Run a shell command.  Returns its stripped stdout, or the tuple
    # (status, stdout, stderr) when ok_to_fail is True.  When ok_to_fail is
    # False a nonzero exit status aborts the program via expect().
    verbose_print("RUN: %s" % cmd, verbose)

    if (input_str is not None):
        stdin = subprocess.PIPE
    else:
        stdin = None

    proc = subprocess.Popen(cmd,
                            shell=True,
                            stdout=arg_stdout,
                            stderr=arg_stderr,
                            stdin=stdin,
                            cwd=from_dir)
    output, errput = proc.communicate(input_str)
    output = output.strip() if output is not None else output
    stat = proc.wait()

    if (ok_to_fail):
        return stat, output, errput
    else:
        if (arg_stderr is not None):
            # arg_stderr may be a file object; recover its contents by name.
            errput = errput if errput is not None else open(arg_stderr.name, "r").read()
            expect(stat == 0, "Command: '%s' failed with error '%s'" % (cmd, errput))
        else:
            expect(stat == 0, "Command: '%s' failed. See terminal output" % cmd)
        return output

###############################################################################
def build_and_run(source, cppfile, buildcmd, execmd, repeat):
###############################################################################
    # Write `source` lines to cppfile, build once, then run `repeat` times
    # (each time waiting for an idle machine) and return the best (largest)
    # figure of merit reported by the program on a "FOM: <number>" line.
    open(cppfile, 'w').writelines(source)

    run_cmd(buildcmd)

    best = None
    for i in xrange(repeat):
        wait_for_quiet_machine()
        output = run_cmd(execmd)

        current = None
        fom_regex = re.compile(r'^FOM: ([0-9.]+)$')
        for line in output.splitlines():
            m = fom_regex.match(line)
            if (m is not None):
                current = float(m.groups()[0])
                break

        expect(current is not None, "No lines in output matched FOM regex")

        if (best is None or best < current):
            best = current

    return best

###############################################################################
def wait_for_quiet_machine():
###############################################################################
    # Poll `top` until the machine is at least 95% idle, so the timing runs
    # are not perturbed by other load.
    while(True):
        time.sleep(2)

        # The first iteration of top gives garbage results
        idle_pct_raw = run_cmd("top -bn2 | grep 'Cpu(s)' | tr ',' ' ' | tail -n 1 | awk '{print $5}'")

        idle_pct_re = re.compile(r'^([0-9.]+)%id$')
        m = idle_pct_re.match(idle_pct_raw)

        expect(m is not None, "top not returning output in expected form")

        idle_pct = float(m.groups()[0])
        if (idle_pct < 95):
            error_print("Machine is too busy, waiting for it to become free")
        else:
            break

###############################################################################
def add_n_dupes(curr_lines, num_dupes, template):
###############################################################################
    # Modify curr_lines in place: duplicate the JGF-tagged function
    # num_dupes times (or, with `template`, instantiate it num_dupes times)
    # and insert a randomized dispatch over the copies at the JGF_DUPE site
    # so the compiler cannot discard them.
    function_name = None
    function_invocation = None
    function_lines = []

    function_re = re.compile(r'^.* (\w+) *[(]')
    function_inv_re = re.compile(r'^.*JGF_DUPE: +(.+)$')

    # Get function lines
    record = False
    definition_insertion_point = None
    invocation_insertion_point = None
    for idx, line in enumerate(curr_lines):
        if ("JGF_DUPLICATE_BEGIN" in line):
            record = True
            # The function signature is expected on the line right after the tag.
            m = function_re.match(curr_lines[idx+1])
            expect(m is not None, "Could not find function in line '%s'" % curr_lines[idx+1])
            function_name = m.groups()[0]

        elif ("JGF_DUPLICATE_END" in line):
            record = False
            definition_insertion_point = idx + 1

        elif (record):
            function_lines.append(line)

        elif ("JGF_DUPE" in line):
            m = function_inv_re.match(line)
            expect(m is not None, "Could not find function invocation example in line '%s'" % line)
            function_invocation = m.groups()[0]
            invocation_insertion_point = idx + 1

    expect(function_name is not None, "Could not find name of dupe function")
    expect(function_invocation is not None, "Could not find function invocation point")

    expect(definition_insertion_point < invocation_insertion_point, "fix me")

    dupe_func_defs = []
    # Dispatch on a runtime random value so no copy can be proven dead.
    dupe_invocations = ["int jgf_rand = std::rand();\n", "if (false) {}\n"]

    for i in xrange(num_dupes):
        if (not template):
            dupe_func = list(function_lines)
            dupe_func[0] = dupe_func[0].replace(function_name, "%s%d" % (function_name, i))
            dupe_func_defs.extend(dupe_func)

        dupe_invocations.append("else if (jgf_rand == %d) " % i)
        if (template):
            dupe_call = function_invocation.replace(function_name, "%s<%d>" % (function_name, i)) + "\n"
        else:
            dupe_call = function_invocation.replace(function_name, "%s%d" % (function_name, i)) + "\n"
        dupe_invocations.append(dupe_call)

    # Insert invocations first so the earlier definition index stays valid.
    curr_lines[invocation_insertion_point:invocation_insertion_point] = dupe_invocations
    curr_lines[definition_insertion_point:definition_insertion_point] = dupe_func_defs

###############################################################################
def report(num_dupes, curr_lines, object_file, orig_fom, curr_fom, csv=False, is_first_report=False):
###############################################################################
    # Print one result row, either as CSV (with a header row on the first
    # call) or in a human-readable banner format.
    fom_change = (curr_fom - orig_fom) / orig_fom

    if (csv):
        if (is_first_report):
            print "num_dupes, obj_byte_size, loc, fom, pct_diff"

        print "%s, %s, %s, %s, %s" % (num_dupes, os.path.getsize(object_file), len(curr_lines), curr_fom, fom_change*100)
    else:
        print "========================================================"
        print "For number of dupes:", num_dupes
        print "Object file size (bytes):", os.path.getsize(object_file)
        print "Lines of code:", len(curr_lines)
        print "Field of merit:", curr_fom
        print "Change pct:", fom_change*100

###############################################################################
def obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv=False):
###############################################################################
    # Driver: measure the baseline FOM, then re-measure with geometrically
    # increasing numbers of duplicated functions.  The original cppfile is
    # backed up and restored when done.
    orig_source_lines = open(cppfile, 'r').readlines()

    backup_file = "%s.orig" % cppfile
    object_file = "%s.o" % os.path.splitext(cppfile)[0]
    os.rename(cppfile, backup_file)

    orig_fom = build_and_run(orig_source_lines, cppfile, buildcmd, execmd, repeat)
    report(0, orig_source_lines, object_file, orig_fom, orig_fom, csv=csv, is_first_report=True)

    i = start
    while (i < end):
        curr_lines = list(orig_source_lines)
        add_n_dupes(curr_lines, i, template)

        curr_fom = build_and_run(curr_lines, cppfile, buildcmd, execmd, repeat)

        report(i, curr_lines, object_file, orig_fom, curr_fom, csv=csv)

        i *= 2 # make growth function configurable?

    os.remove(cppfile)
    os.rename(backup_file, cppfile)

###############################################################################
def _main_func(description):
###############################################################################
    # Entry point: with --test run the module doctests, otherwise parse the
    # command line and run the size/optimization sweep.
    if ("--test" in sys.argv):
        test_results = doctest.testmod(verbose=True)
        sys.exit(1 if test_results.failed > 0 else 0)

    cppfile, buildcmd, execmd, start, end, repeat, template, csv = parse_command_line(sys.argv, description)

    obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv)

###############################################################################
if (__name__ == "__main__"):
    _main_func(__doc__)
diff --git a/packages/kokkos/scripts/testing_scripts/test_all_sandia b/packages/kokkos/scripts/testing_scripts/test_all_sandia
new file mode 100755
index 0000000000000000000000000000000000000000..877b35b73e1aef7c64cdb2d7e5f00f7bc235781c
--- /dev/null
+++ b/packages/kokkos/scripts/testing_scripts/test_all_sandia
@@ -0,0 +1,1065 @@
#!/bin/bash -e

#
# Global config
#

set -o pipefail

# Determine current machine.

# Print usage/help text, including the compiler list for the machine
# detected below (COMPILERS is populated by the machine-specific config).
print_help() {
  echo "test_all_sandia <ARGS> <OPTIONS>:"
  echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
  echo "    Defaults to root repo containing this script"
  echo "--debug: Run tests in debug. Defaults to False"
  echo "--boundscheck: Enable Kokkos_ENABLE_DEBUG_BOUNDS_CHECK to check View accesses within bounds."
  echo "--test-script: Test this script, not Kokkos"
  echo "--skip-hwloc: Do not do hwloc tests"
  echo "--num=N: Number of jobs to run in parallel"
  echo "--spot-check: Minimal test set to issue pull request"
  echo "--timeout: Max time before ctest timeout (in seconds)"
  echo "--dry-run: Just print what would be executed"
  echo "--build-only: Just do builds, don't run anything"
  echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
  echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS"
  echo "--cxxstandard=OPT: c++14 (default), c++17, c++1y, c++1z, c++2a"
  echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS"
  echo "--arch=ARCHITECTURE: overwrite architecture flags"
  echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
  echo "--with-options=OPT: set KOKKOS_OPTIONS"
  echo "--build-list=BUILD,BUILD,BUILD..."
  echo "    Provide a comma-separated list of builds instead of running all builds"
  echo "    Valid items:"
  echo "      OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
  echo "      Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
  echo ""

  echo "ARGS: list of expressions matching compilers to test"
  echo "  supported compilers sems"
  for COMPILER_DATA in "${COMPILERS[@]}"; do
    ARR=($COMPILER_DATA)
    COMPILER=${ARR[0]}
    echo "    $COMPILER"
  done
  echo ""

  echo "Examples:"
  echo "  Run all tests"
  echo "  % test_all_sandia"
  echo ""
  echo "  Run all gcc tests"
  echo "  % test_all_sandia gcc"
  echo ""
  echo "  Run all gcc/4.8.4 and all intel tests"
  echo "  % test_all_sandia gcc/4.8.4 intel"
  echo ""
  echo "  Run all tests in debug"
  echo "  % test_all_sandia --debug"
  echo ""
  echo "  Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds"
  echo "  % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial"
  echo ""
  echo "If you want to kill the tests, do:"
  echo "  hit ctrl-z"
  echo "  % kill -9 %1"
  echo
}

# Detect the machine this script is running on from the hostname; the
# machine name selects module lists and compiler sets further below.
MACHINE=""
HOSTNAME=$(hostname)
PROCESSOR=`uname -p`
CUDA_ENABLE_CMD=

if [[ "$HOSTNAME" =~
(white|ride).* ]]; then + MACHINE=white + module load git +fi + +if [[ "$HOSTNAME" =~ weaver.* ]]; then + MACHINE=weaver + module load git +fi + +if [[ "$HOSTNAME" =~ .*voltrino.* ]]; then + MACHINE=voltrino + module load git +fi + +if [[ "$HOSTNAME" == *blake* ]]; then # Warning: very generic name + MACHINE=blake + module load git +fi + +if [[ "$HOSTNAME" == apollo\.* ]]; then + MACHINE=apollo +fi + +if [[ "$HOSTNAME" == kokkos-dev-2* ]]; then + MACHINE=kokkos-dev-2 +fi + +if [[ "$HOSTNAME" == may* ]]; then + MACHINE=mayer +# module load git +fi + +if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name + MACHINE=mayer +fi + +if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then + MACHINE=kokkos-dev +fi + +if [[ "$HOSTNAME" == sogpu01* ]]; then + MACHINE=sogpu +fi + +if [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then + if [[ "$MACHINE" = "" ]]; then + MACHINE=sems + module load sems-git + fi +fi + +if [[ "$MACHINE" = "" ]]; then + echo "Unrecognized machine" >&2 + exit 1 +fi + +echo "Running on machine: $MACHINE" + +GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" +IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" +ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" +INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" +CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" +CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" +CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" + +GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" +IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" +CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" 
+INTEL15_WARNING_FLAGS="-Wall,-Wno-unused-variable,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +#CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" +PGI_WARNING_FLAGS="" + +# Default. Machine specific can override. +DEBUG=False +ARGS="" +CUSTOM_BUILD_LIST="" +DRYRUN=False +BUILD_ONLY=False +declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1 +TEST_SCRIPT=False +SKIP_HWLOC=False +SPOT_CHECK=False + +PRINT_HELP=False +OPT_FLAG="" +CXX_FLAGS_EXTRA="" +LD_FLAGS_EXTRA="" +KOKKOS_OPTIONS="" + +CXX_STANDARD="14" + +CTESTTIMEOUT=2000 + +# +# Handle arguments. +# + +while [[ $# > 0 ]] +do + key="$1" + + case $key in + --kokkos-path*) + KOKKOS_PATH="${key#*=}" + ;; + --build-list*) + CUSTOM_BUILD_LIST="${key#*=}" + ;; + --debug*) + DEBUG=True + ;; + --boundscheck*) + KOKKOS_BOUNDS_CHECK="--boundscheck" + ;; + --build-only*) + BUILD_ONLY=True + ;; + --test-script*) + TEST_SCRIPT=True + ;; + --skip-hwloc*) + SKIP_HWLOC=True + ;; + --num*) + NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" + ;; + --dry-run*) + DRYRUN=True + ;; + --spot-check*) + SPOT_CHECK=True + ;; + --timeout*) + CTESTTIMEOUT="${key#*=}" + ;; + --arch*) + ARCH_FLAG="--arch=${key#*=}" + ;; + --opt-flag*) + OPT_FLAG="${key#*=}" + ;; + --with-cuda-options*) + KOKKOS_CUDA_OPTIONS="${key#*=}" + export KOKKOS_CUDA_OPTIONS + ;; + --with-options*) + KOKKOS_OPTIONS="${key#*=}" + export KOKKOS_OPTIONS + ;; + --cxxflags-extra*) + CXX_FLAGS_EXTRA="${key#*=}" + ;; + --cxxstandard*) + FULL_CXX_STANDARD="${key#*=}" + if [[ ${FULL_CXX_STANDARD} == *++* ]]; then + CXX_STANDARD="${FULL_CXX_STANDARD#*++}" + else + CXX_STANDARD="${FULL_CXX_STANDARD}" + fi + ;; + --ldflags-extra*) + LD_FLAGS_EXTRA="${key#*=}" + ;; + --help*) + PRINT_HELP=True + ;; + *) + # args, just append + ARGS="$ARGS $1" + ;; + esac + + shift +done + +SCRIPT_KOKKOS_ROOT=$( cd 
"$( dirname "$0" )" && cd ../.. && pwd ) + +# Set kokkos path. +if [ -z "$KOKKOS_PATH" ]; then + KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT +else + # Ensure KOKKOS_PATH is abs path. + KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) +fi + + + +# +# Machine specific config. +# + +if [ "$MACHINE" = "sems" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + + # On unnamed sems machines, assume more restricted rhel7 environment + # On rhel7 sems machines gcc/7.3.0, clang/4.0.1, and intel/16.0.3 are missing + # Remove kokkkos-env module use + + module load sems-cmake/3.17.1 + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0" + SKIP_HWLOC=True + # No sems hwloc module + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="" + fi + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/7.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST 
clang++ $CLANG_WARNING_FLAGS" + "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi +elif [ "$MACHINE" = "sogpu" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + + module load sems-cmake/3.17.1 sems-git + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + CUDA_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0" + CUDA11_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/8.3.0" + SKIP_HWLOC=True + # No sems hwloc module + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=Volta70" + fi + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/7.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/10.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + 
"intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "cuda/10.1 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.1 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) +elif [ "$MACHINE" = "kokkos-dev" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + + module load sems-cmake/3.17.1 + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/6.1.0" + CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.2.0" + CUDA11_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/9.2.0" + CLANG7_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-cuda/9.2" + SKIP_HWLOC=True + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=Kepler35" + fi + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" + "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST "Pthread_Serial" icpc $INTEL_WARNING_FLAGS" + "clang/5.0.1 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS" + "cuda/9.2 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 
$BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/7.0.1 $CLANG7_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/10.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "cuda/10.1 $CUDA10_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.1 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/9.2 $CUDA9_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi +elif [ "$MACHINE" = "white" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>" + IBM_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.1" + + # Don't do pthread with Power + GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.1 $IBM_MODULE_LIST "Serial" xlC 
$IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=Power8,Pascal60" + fi + +elif [ "$MACHINE" = "weaver" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + + BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>" + IBM_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.1" + + # Don't do pthread with Power + GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.1 $IBM_MODULE_LIST "Serial" xlC 
$IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1.243 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" + "cuda/9.2.88 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1.243 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.2.089 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=Power9,Volta70" + fi + +elif [ "$MACHINE" = "voltrino" ]; then + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/20.11.4a,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/9.3.0" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/17.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ) + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=KNL" + fi + +elif [ "$MACHINE" = "mayer" ]; then + SKIP_HWLOC=True + export 
SLURM_TASKS_PER_NODE=96 + + BASE_MODULE_LIST="cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>" + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gnu7/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gnu9/9.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "arm/20.1 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS") + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=ARMV8_THUNDERX2" + fi + +elif [ "$MACHINE" = "blake" ]; then + source /etc/profile.d/modules.sh + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + module load cmake/3.19.3 + + BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>" + BASE_MODULE_LIST_INTEL="cmake/3.19.3,<COMPILER_NAME>/compilers/<COMPILER_VERSION>" + BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,<COMPILER_NAME>/oneAPI/base-toolkit/<COMPILER_VERSION>" + ONEAPI_WARNING_FLAGS="" + + if [ "$SPOT_CHECK" = "True" ]; then + + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + ) + else + COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" + "gcc/5.5.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 
$BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + ) + + fi + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=SKX" + fi + +elif [ "$MACHINE" = "apollo" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + module use /home/projects/modulefiles/local/x86-64 + + module load sems-git + module load sems-tex + module load sems-cmake/3.17.1 + module load sems-gdb + module load binutils + + SKIP_HWLOC=True + + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + CLANG_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>" + CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0" + CUDA10X_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0" + + HPX3_MODULE_LIST="sems-env,sems-cmake/3.17.1,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils" + + BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" + BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" + BUILD_LIST_CLANG="Serial,Pthread,OpenMP" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP,Pthread,Serial" g++ $GCC_WARNING_FLAGS" + "hpx/1.3.0 $HPX3_MODULE_LIST "HPX" g++ $PGI_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("cuda/10.0 $CUDA10_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1 $CUDA10X_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.2 $CUDA10X_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" 
+ "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + "clang/7.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + "clang/8.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + "hpx/1.3.0 $HPX3_MODULE_LIST "HPX" g++ $PGI_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=SNB,Volta70" + fi + +elif [ "$MACHINE" = "kokkos-dev-2" ]; then + source /projects/sems/modulefiles/utils/sems-modules-init.sh + module use /home/projects/x86-64/modulefiles/local + module purge + module load sems-env + + module load sems-git + module load sems-tex + module load sems-cmake/3.17.1 + module load sems-gdb + + SKIP_HWLOC=True + + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>" + GCC91_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>" + NVCC9_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0" + NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0" + NVCC11_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/9.2.0" + + CLANG8_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/10.0" + PGI_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-gcc/7.3.0,<COMPILER_NAME>/<COMPILER_VERSION>" + + BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread" + BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP" + BUILD_LIST_CLANG="Serial,Pthread,OpenMP" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + 
"gcc/8.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" + "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "cuda/10.1 $NVCC_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("cuda/10.0 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.1 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.0 $NVCC11_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/9.2 $NVCC9_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.1 $GCC91_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/7.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + "clang/10.0.0 
$BASE_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" + "pgi/19.4 $PGI_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=SNB,Volta70" + fi + +else + echo "Unhandled machine $MACHINE" >&2 + exit 1 +fi + +export OMP_NUM_THREADS=8 +export OMP_PROC_BIND=spread +export OMP_PLACES=cores + +declare -i NUM_RESULTS_TO_KEEP=7 + +RESULT_ROOT_PREFIX=TestAll + +if [ "$PRINT_HELP" = "True" ]; then + print_help + exit 0 +fi + +UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null` +if ! [ -z "$UNCOMMITTED" ]; then + echo "WARNING!! THE FOLLOWING CHANGES ARE UNCOMMITTED!! :" + echo "$UNCOMMITTED" + echo "" +fi + +GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline` +echo "Repository Status: " ${GITSTATUS} +echo "" +echo "" + +# Set build type. +if [ "$DEBUG" = "True" ]; then + BUILD_TYPE=debug +else + BUILD_TYPE=release +fi + +# If no args provided, do all compilers. +if [ -z "$ARGS" ]; then + ARGS='?' +fi + +# Process args to figure out which compilers to test. +COMPILERS_TO_TEST="" + +for ARG in $ARGS; do + for COMPILER_DATA in "${COMPILERS[@]}"; do + ARR=($COMPILER_DATA) + COMPILER=${ARR[0]} + + if [[ "$COMPILER" = $ARG* ]]; then + if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then + COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER" + else + echo "Tried to add $COMPILER twice" + fi + fi + done +done + +if [ "$COMPILERS_TO_TEST" == "" ]; then + echo "-----------------------------------------------" + echo " !!!! Invalid Compiler provided '$ARGS' !!!!" + echo "-----------------------------------------------" + print_help + exit 0 +fi + +# +# Functions. +# + +# get_compiler_name <COMPILER> +get_compiler_name() { + echo $1 | cut -d/ -f1 +} + +# get_compiler_version <COMPILER> +get_compiler_version() { + echo $1 | cut -d/ -f2 +} + +# Do not call directly. 
+get_compiler_data() { + local compiler=$1 + local item=$2 + local compiler_name=$(get_compiler_name $compiler) + local compiler_vers=$(get_compiler_version $compiler) + + local compiler_data + for compiler_data in "${COMPILERS[@]}" ; do + local arr=($compiler_data) + + if [ "$compiler" = "${arr[0]}" ]; then + echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g" + return 0 + fi + done + + # Not found. + echo "Unreconized compiler $compiler" >&2 + exit 1 +} + +# +# For all getters, usage: <GETTER> <COMPILER> +# + +get_compiler_modules() { + get_compiler_data $1 1 +} + +get_compiler_build_list() { + get_compiler_data $1 2 +} + +get_compiler_exe_name() { + get_compiler_data $1 3 +} + +get_compiler_warning_flags() { + get_compiler_data $1 4 +} + +run_cmd() { + echo "RUNNING: $*" + if [ "$DRYRUN" != "True" ]; then + eval "$* 2>&1" + fi +} + +# report_and_log_test_results <SUCCESS> <DESC> <COMMENT> +report_and_log_test_result() { + # Use sane var names. + local success=$1; local desc=$2; local comment=$3; + + if [ "$success" = "0" ]; then + echo " PASSED $desc" + echo $comment > $PASSED_DIR/$desc + else + # For failures, comment should be the name of the phase that failed. + echo " FAILED $desc" >&2 + echo $comment > $FAILED_DIR/$desc + cat ${desc}.${comment}.log + fi +} + +setup_env() { + local compiler=$1 + local compiler_modules=$(get_compiler_modules $compiler) + + module purge + + local mod + for mod in $compiler_modules; do + echo "Loading module $mod" + module load $mod 2>&1 + # It is ridiculously hard to check for the success of a loaded + # module. Module does not return error codes and piping to grep + # causes module to run in a subshell. 
+ module list 2>&1 | grep "$mod" >& /dev/null || return 1 + done + + if [ -e ${CM_ALL_SCRIPT_PATH}/update_lib.sh ]; then + echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler" + source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler + fi + return 0 +} + +# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE> +single_build_and_test() { + # Use sane var names. + local compiler=$1; local build=$2; local build_type=$3; + + # Set up env. + mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" + cd $ROOT_DIR/$compiler/"${build}-$build_type" + + local compiler_modules_list=$(get_compiler_modules $compiler) + echo " # Load modules:" &> reload_modules.sh + echo " module load $compiler_modules_list" &>> reload_modules.sh + echo "" &>> reload_modules.sh + chmod +x reload_modules.sh + + local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g') + setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + + # Set up flags. 
+ local compiler_warning_flags=$(get_compiler_warning_flags $compiler) + local compiler_exe=$(get_compiler_exe_name $compiler) + + if [[ "$KOKKOS_OPTIONS" == "" ]]; then + local extra_args="--with-options=enable_large_mem_tests" + else + local extra_args= + fi + + if [[ "$build_type" = hwloc* ]]; then + local extra_args="$extra_args --with-hwloc=$(dirname $(dirname $(which hwloc-info)))" + fi + + if [[ "$OPT_FLAG" = "" ]]; then + OPT_FLAG="-O3" + fi + + if [[ "$build_type" = *debug* ]]; then + local extra_args="$extra_args --debug" + local cxxflags="-g $compiler_warning_flags" + else + local cxxflags="$OPT_FLAG $compiler_warning_flags" + fi + + local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" + local ldflags="${LD_FLAGS_EXTRA}" + + local cxx_standard="${CXX_STANDARD}" + + + echo " Starting job $desc" + + local comment="no_comment" + + if [ "$TEST_SCRIPT" = "True" ]; then + local rand=$[ 1 + $[ RANDOM % 10 ]] + sleep $rand + + if [ $rand -gt 5 ]; then + run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } + fi + else + LOCAL_KOKKOS_DEVICES=${build//_/,} + if [[ "$LOCAL_KOKKOS_DEVICES" = *Cuda* ]]; then + CUDA_ENABLE_CMD="--with-cuda=$CUDA_ROOT" + fi + echo "kokkos options: ${KOKKOS_OPTIONS}" + echo "kokkos devices: ${LOCAL_KOKKOS_DEVICES}" + echo "kokkos cxx: ${cxxflags}" + + # KOKKOS_OPTIONS and KOKKOS_CUDA_OPTIONS are exported and detected by kokkos' generate_makefile.sh during install of kokkos; we pass them to the reproducer script instructions + echo " # Use generate_makefile line below to call cmake which generates makefile for this build:" &> call_generate_makefile.sh + echo " ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} 
--no-examples ${KOKKOS_BOUNDS_CHECK} $extra_args" &>> call_generate_makefile.sh + + # store script command with generic path for faster copy/paste of reproducer into issues + echo " \$KOKKOS_PATH/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh + + run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} ${KOKKOS_BOUNDS_CHECK} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + local make_par_lvl=12 + if [[ "$MACHINE" = white* ]]; then + make_par_lvl=48 + fi + local -i build_start_time=$(date +%s) + run_cmd make -j $make_par_lvl all >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } + local -i build_end_time=$(date +%s) + comment="build_time=$(($build_end_time-$build_start_time))" + + if [[ "$BUILD_ONLY" == False ]]; then + run_cmd ctest --timeout ${CTESTTIMEOUT} -V --output-on-failure >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } + local -i run_end_time=$(date +%s) + comment="$comment run_time=$(($run_end_time-$build_end_time))" + fi + fi + + report_and_log_test_result 0 $desc "$comment" + + return 0 +} + +# wait_for_jobs <NUM-JOBS> +wait_for_jobs() { + local -i max_jobs=$1 + local -i num_active_jobs=$(jobs | wc -l) + while [ $num_active_jobs -ge $max_jobs ] + do + sleep 1 + num_active_jobs=$(jobs | wc -l) + jobs >& /dev/null + done +} + +# run_in_background <COMPILER> <BUILD> <BUILD_TYPE> 
+run_in_background() { + local compiler=$1 + + local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL + # Don't override command line input. + # if [[ "$BUILD_ONLY" == True ]]; then + # num_jobs=8 + # else + if [[ "$compiler" == cuda* ]]; then + num_jobs=1 + fi + if [[ "$compiler" == clang ]]; then + num_jobs=1 + fi + # fi + wait_for_jobs $num_jobs + + single_build_and_test $* & +} + +# build_and_test_all <COMPILER> +build_and_test_all() { + # Get compiler data. + local compiler=$1 + if [ -z "$CUSTOM_BUILD_LIST" ]; then + local compiler_build_list=$(get_compiler_build_list $compiler) + else + local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ') + fi + + # Do builds. + local build + for build in $compiler_build_list + do + run_in_background $compiler $build $BUILD_TYPE + + # If not cuda, do a hwloc test too. + if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then + run_in_background $compiler $build "hwloc-$BUILD_TYPE" + fi + done + + return 0 +} + +get_test_root_dir() { + local existing_results=$(find . 
-maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort) + local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l) + local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP} + + if [ $num_to_delete -gt 0 ]; then + /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete) + fi + + echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S") +} + +wait_summarize_and_exit() { + wait_for_jobs 1 + + echo "#######################################################" + echo "PASSED TESTS" + echo "#######################################################" + + local passed_test + for passed_test in $(\ls -1 $PASSED_DIR | sort) + do + echo $passed_test $(cat $PASSED_DIR/$passed_test) + done + + local -i rv=0 + if [ "$(ls -A $FAILED_DIR)" ]; then + echo "#######################################################" + echo "FAILED TESTS" + echo "#######################################################" + + local failed_test + for failed_test in $(\ls -1 $FAILED_DIR | sort) + do + echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" + rv=$rv+1 + + local str=$failed_test + local comp=$(echo "$str" | cut -d- -f1) + local vers=$(echo "$str" | cut -d- -f2) + local lbuild=$(echo "$str" | cut -d- -f3-) + # Generate reproducer instructions + #local filename=reproducer_instructions-$comp-$vers-$lbuild + local faildir=$ROOT_DIR/$comp/$vers/$lbuild + # Output reproducer instructions + echo "#######################################################" + echo " # Reproducer instructions:" + cat $faildir/reload_modules.sh + cat $faildir/call_generate_makefile_genericpath.sh + echo "" + echo " # To reload modules, reconfigure, rebuild, and retest directly from this failing build do the following:" + echo " # Move to the build directory" + echo " cd $faildir" + echo " # To reload modules" + echo " source ./reload_modules.sh" + echo " # To reconfigure" + echo " ./call_generate_makefile.sh" + echo " # To rebuild" + echo " make -j" + echo " # To retest" 
+ echo " ctest -V" + echo "#######################################################" + done + fi + + exit $rv +} + +# +# Main. +# + +CM_ALL_SCRIPT=$0 +CM_ALL_SCRIPT_PATH=$(cd `dirname $CM_ALL_SCRIPT` && pwd) + +ROOT_DIR=$(get_test_root_dir) +mkdir -p $ROOT_DIR +cd $ROOT_DIR + +PASSED_DIR=$ROOT_DIR/results/passed +FAILED_DIR=$ROOT_DIR/results/failed +mkdir -p $PASSED_DIR +mkdir -p $FAILED_DIR + +echo "Going to test compilers: " $COMPILERS_TO_TEST +for COMPILER in $COMPILERS_TO_TEST; do + echo "Testing compiler $COMPILER" + build_and_test_all $COMPILER +done + +wait_summarize_and_exit diff --git a/packages/kokkos/scripts/testing_scripts/test_kokkos_master_develop_promotion.sh b/packages/kokkos/scripts/testing_scripts/test_kokkos_master_develop_promotion.sh new file mode 100755 index 0000000000000000000000000000000000000000..048f48194ce7bfe6f773c7f8ee289f76f8f16cb6 --- /dev/null +++ b/packages/kokkos/scripts/testing_scripts/test_kokkos_master_develop_promotion.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +. 
/etc/profile.d/modules.sh + +echo "build-dir $1" +echo "backend $2" +echo "module $3" +echo "compiler $4" +echo "cxxflags $5" +echo "architecrure $6" +echo "debug $7" +echo "kokkos-options $8" +echo "kokkos-cuda-options $9" +echo "hwloc $9" + +NOW=`date "+%Y%m%d%H%M%S"` +BASEDIR="$1-$NOW" + +mkdir $BASEDIR +cd $BASEDIR + +module load $2 + +if [ $9 == "yes" ]; then +if [ $7 == "debug" ]; then + ../generate_makefile.sh --with-devices=$2 \ + --compiler=$4 \ + --cxxflags=$5 \ + --arch=$6 \ + --debug \ + --with-options=$8 \ + --with-cuda-options=$9 + --with-hwloc=${HWLOC_ROOT} +else + ../generate_makefile.sh --with-devices=$2 \ + --compiler=$4 \ + --cxxflags=$5 \ + --arch=$6 \ + --debug \ + --with-options=$8 \ + --with-cuda-options=$9 + --with-hwloc=${HWLOC_ROOT} +fi +else +if [ $7 == "debug" ]; then + ../generate_makefile.sh --with-devices=$2 \ + --compiler=$4 \ + --cxxflags=$5 \ + --arch=$6 \ + --debug \ + --with-options=$8 \ + --with-cuda-options=$9 +else + ../generate_makefile.sh --with-devices=$2 \ + --compiler=$4 \ + --cxxflags=$5 \ + --arch=$6 \ + --debug \ + --with-options=$8 \ + --with-cuda-options=$9 +fi +fi + + +make test +return $? 
diff --git a/packages/kokkos/scripts/testing_scripts/update_lib.sh b/packages/kokkos/scripts/testing_scripts/update_lib.sh new file mode 100755 index 0000000000000000000000000000000000000000..34ab5dd3c9a0afae4b10b70d99772308f35b3f9f --- /dev/null +++ b/packages/kokkos/scripts/testing_scripts/update_lib.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +local machine_input="$1" +local compiler_input="$2" + +check_sems_intel() { + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* ]]; then + module swap sems-gcc/4.9.3 sems-gcc/6.4.0 + module list + fi + if [[ "${ICPCVER}" = 19.* ]]; then + # Newer gcc needed for c++ standard beyond c++14 + module swap sems-gcc/6.1.0 sems-gcc/7.2.0 + module list + fi +} + +check_sems_clang() { + CLANGVER=$(clang --version | grep "clang version" | cut -d " " -f 3) + if [[ "${CLANGVER}" = 9.* ]] || [[ "${CLANGVER}" = 10.* ]]; then + # Newer gcc needed for c++ standard beyond c++14 + module swap sems-gcc/5.3.0 sems-gcc/6.4.0 + module list + fi +} + +check_compiler_modules() { + if [[ "$compiler_input" = clang/* ]]; then + echo " clang compiler - check supporting modules" + check_sems_clang + elif [[ "$compiler_input" = intel/* ]]; then + echo " intel compiler - check supporting modules" + check_sems_intel + fi +} + +if [ "$machine_input" = blake ]; then + ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" + if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then + module swap gcc/4.9.3 gcc/6.4.0 + module list + fi +fi +if [ "$machine_input" = kokkos-dev ]; then + check_compiler_modules +fi +if [ "$machine_input" = kokkos-dev-2 ]; then + check_compiler_modules +fi +if [ "$machine_input" = sems ] || [ "$machine_input" = sogpu ]; then + check_compiler_modules +fi diff --git a/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh new file mode 100755 index 
0000000000000000000000000000000000000000..04f7fb56e7f2ed9e67cd764e2602abcadf2dd5a4 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff-dbg.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +export TRILINOS_DIR=${PWD}/../.. + +# Load modules +module purge +source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-dbg + +# Packages +PACKAGE1=Tpetra +PACKAGE2=Sacado +PACKAGE3=Stokhos +PACKAGE4=MueLu +PACKAGE5=Intrepid2 +PACKAGE6=Ifpack2 +PACKAGE7=Panzer +PACKAGE8=Phalanx +PACKAGE9=Stratimikos +PACKAGE10=Belos + +# Configure +cmake \ + -GNinja \ + -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ + -DTrilinos_ENABLE_TESTS=ON \ + -DTrilinos_ENABLE_${PACKAGE1}=ON \ + -DTrilinos_ENABLE_${PACKAGE2}=ON \ + -DTrilinos_ENABLE_${PACKAGE3}=ON \ + -DTrilinos_ENABLE_${PACKAGE4}=ON \ + -DTrilinos_ENABLE_${PACKAGE5}=ON \ + -DTrilinos_ENABLE_${PACKAGE6}=ON \ + -DTrilinos_ENABLE_${PACKAGE7}=ON \ + -DTrilinos_ENABLE_${PACKAGE8}=ON \ + -DTrilinos_ENABLE_${PACKAGE9}=ON \ + -DTrilinos_ENABLE_${PACKAGE10}=ON \ + -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ + -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ + -DTpetra_ENABLE_DEBUG=ON \ +$TRILINOS_DIR + +# Notes: +# Compile using ninja +# make NP=32 + +# Allocate node: +# bsub -J TestCompare-DepOffdbg -W 06:00 -Is -n 16 -q rhel7W bash + +# Run tests +# ctest -j8 + +# Submit tests as job +# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh new file mode 100755 index 0000000000000000000000000000000000000000..2c01b861bd47c46318718c07f62c4ef74d8d2060 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depoff.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +echo "SOURCE this script!!" + +export TRILINOS_DIR=${PWD}/../.. 
+ +# Load modules +module purge +source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-opt + +# Packages +PACKAGE1=Tpetra +PACKAGE2=Sacado +PACKAGE3=Stokhos +PACKAGE4=MueLu +PACKAGE5=Intrepid2 +PACKAGE6=Ifpack2 +PACKAGE7=Panzer +PACKAGE8=Phalanx +PACKAGE9=Stratimikos +PACKAGE10=Belos + + +rm -rf CMake* + +# Configure +cmake \ + -GNinja \ + -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ + -DTrilinos_ENABLE_TESTS=ON \ + -DTrilinos_ENABLE_${PACKAGE1}=ON \ + -DTrilinos_ENABLE_${PACKAGE2}=ON \ + -DTrilinos_ENABLE_${PACKAGE3}=ON \ + -DTrilinos_ENABLE_${PACKAGE4}=ON \ + -DTrilinos_ENABLE_${PACKAGE5}=ON \ + -DTrilinos_ENABLE_${PACKAGE6}=ON \ + -DTrilinos_ENABLE_${PACKAGE7}=ON \ + -DTrilinos_ENABLE_${PACKAGE8}=ON \ + -DTrilinos_ENABLE_${PACKAGE9}=ON \ + -DTrilinos_ENABLE_${PACKAGE10}=ON \ + -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ + -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ +$TRILINOS_DIR + + +# Notes: +# Compile using ninja +# make NP=32 + +# Allocate node: +# bsub -J TestCompare-DepCodeOFF -W 06:00 -Is -n 16 -q rhel7W bash + +# Run tests +# ctest -j8 + +# Or submit tests as job +# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh new file mode 100755 index 0000000000000000000000000000000000000000..c6af962034b70deaf573bde2489807055b094856 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon-dbg.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +export TRILINOS_DIR=${PWD}/../.. 
+ +# Load modules +module purge +source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-dbg + +# Packages +PACKAGE1=Tpetra +PACKAGE2=Sacado +PACKAGE3=Stokhos +PACKAGE4=MueLu +PACKAGE5=Intrepid2 +PACKAGE6=Ifpack2 +PACKAGE7=Panzer +PACKAGE8=Phalanx +PACKAGE9=Stratimikos +PACKAGE10=Belos + +# Configure +cmake \ + -GNinja \ + -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ + -DTrilinos_ENABLE_TESTS=ON \ + -DTrilinos_ENABLE_${PACKAGE1}=ON \ + -DTrilinos_ENABLE_${PACKAGE2}=ON \ + -DTrilinos_ENABLE_${PACKAGE3}=ON \ + -DTrilinos_ENABLE_${PACKAGE4}=ON \ + -DTrilinos_ENABLE_${PACKAGE5}=ON \ + -DTrilinos_ENABLE_${PACKAGE6}=ON \ + -DTrilinos_ENABLE_${PACKAGE7}=ON \ + -DTrilinos_ENABLE_${PACKAGE8}=ON \ + -DTrilinos_ENABLE_${PACKAGE9}=ON \ + -DTrilinos_ENABLE_${PACKAGE10}=ON \ + -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ + -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ + -DTpetra_ENABLE_DEBUG=ON \ +$TRILINOS_DIR + + +# Notes: +# Compile using ninja +# make NP=32 + +# Allocate node: +# bsub -J TestCompare-DepOndbg -W 06:00 -Is -n 16 -q rhel7W bash + +# Run tests +# ctest -j8 + +# Submit tests as job +# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh new file mode 100755 index 0000000000000000000000000000000000000000..9403741586eb5b74bf061a8e3a771f760e515421 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-depon.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +echo "SOURCE this script!!" + +export TRILINOS_DIR=${PWD}/../.. 
+ +# Load modules +module purge +source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-opt + +# Packages +PACKAGE1=Tpetra +PACKAGE2=Sacado +PACKAGE3=Stokhos +PACKAGE4=MueLu +PACKAGE5=Intrepid2 +PACKAGE6=Ifpack2 +PACKAGE7=Panzer +PACKAGE8=Phalanx +PACKAGE9=Stratimikos +PACKAGE10=Belos + + +rm -rf CMake* + +# Configure +cmake \ + -GNinja \ + -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ + -DTrilinos_ENABLE_TESTS=ON \ + -DTrilinos_ENABLE_${PACKAGE1}=ON \ + -DTrilinos_ENABLE_${PACKAGE2}=ON \ + -DTrilinos_ENABLE_${PACKAGE3}=ON \ + -DTrilinos_ENABLE_${PACKAGE4}=ON \ + -DTrilinos_ENABLE_${PACKAGE5}=ON \ + -DTrilinos_ENABLE_${PACKAGE6}=ON \ + -DTrilinos_ENABLE_${PACKAGE7}=ON \ + -DTrilinos_ENABLE_${PACKAGE8}=ON \ + -DTrilinos_ENABLE_${PACKAGE9}=ON \ + -DTrilinos_ENABLE_${PACKAGE10}=ON \ + -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ + -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ +$TRILINOS_DIR + +# Notes: +# Compile using ninja +# make NP=32 + +# Allocate node: +# bsub -J TestKokkos-DepCodeOn -W 07:00 -Is -n 16 -q rhel7W bash + +# Run tests +# ctest -j8 + +# Submit tests as job +# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh new file mode 100755 index 0000000000000000000000000000000000000000..d508d4c77ac8894147f220be4ac0f890b38a1737 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depoff.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +echo "SOURCE this script!!" + +export TRILINOS_DIR=${PWD}/../.. 
+ +# Load modules +module purge +source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-rdc-release-debug-pt + +rm -rf CMake* + +# Configure +cmake \ + -GNinja \ + -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ + -DTrilinos_ENABLE_TESTS=ON \ + -DTrilinos_ENABLE_ALL_PACKAGES=ON \ + -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ + -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ +$TRILINOS_DIR + +# Notes: +# Compile using ninja +# make NP=32 + +# Allocate node: +# bsub -J TestKokkos-DepCodeOn-rdcpt -W 07:00 -Is -n 16 -q rhel7W bash + +# Run tests +# ctest -j8 + +# Submit tests as job +# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh new file mode 100755 index 0000000000000000000000000000000000000000..d508d4c77ac8894147f220be4ac0f890b38a1737 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-cuda-ride-rdc-depon.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +echo "SOURCE this script!!" + +export TRILINOS_DIR=${PWD}/../.. 
+ +# Load modules +module purge +source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh cuda-9.2-rdc-release-debug-pt + +rm -rf CMake* + +# Configure +cmake \ + -GNinja \ + -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ + -DTrilinos_ENABLE_TESTS=ON \ + -DTrilinos_ENABLE_ALL_PACKAGES=ON \ + -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ + -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ +$TRILINOS_DIR + +# Notes: +# Compile using ninja +# make NP=32 + +# Allocate node: +# bsub -J TestKokkos-DepCodeOn-rdcpt -W 07:00 -Is -n 16 -q rhel7W bash + +# Run tests +# ctest -j8 + +# Submit tests as job +# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh new file mode 100755 index 0000000000000000000000000000000000000000..7be71edc1cf3418ab22ad1a1b4d7eada04d29d4f --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/ATDM_configurations/configure-atdm-env.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +echo "SOURCE this script!!" + +export TRILINOS_DIR=${PWD}/../.. 
+ +# Load modules +module purge +source ${TRILINOS_DIR}/cmake/std/atdm/load-env.sh Trilinos-atdm-waterman-cuda-9.2-rdc-release-debug-pt + +# Packages +PACKAGE1=Tpetra +PACKAGE2=Sacado +PACKAGE3=Stokhos +PACKAGE4=MueLu +PACKAGE5=Intrepid2 +PACKAGE6=Ifpack2 +PACKAGE7=Panzer +PACKAGE8=Phalanx +PACKAGE9=Stratimikos +PACKAGE10=Belos + + +rm -rf CMake* + +# Configure +cmake \ + -GNinja \ + -DTrilinos_CONFIGURE_OPTIONS_FILE:STRING=cmake/std/atdm/ATDMDevEnv.cmake \ + -DTrilinos_ENABLE_TESTS=ON \ + -DTrilinos_ENABLE_${PACKAGE1}=ON \ + -DTrilinos_ENABLE_${PACKAGE2}=ON \ + -DTrilinos_ENABLE_${PACKAGE3}=ON \ + -DTrilinos_ENABLE_${PACKAGE4}=ON \ + -DTrilinos_ENABLE_${PACKAGE5}=ON \ + -DTrilinos_ENABLE_${PACKAGE6}=ON \ + -DTrilinos_ENABLE_${PACKAGE7}=ON \ + -DTrilinos_ENABLE_${PACKAGE8}=ON \ + -DTrilinos_ENABLE_${PACKAGE9}=ON \ + -DTrilinos_ENABLE_${PACKAGE10}=ON \ + -DKokkos_SOURCE_DIR_OVERRIDE:STRING=kokkos \ + -DKokkosKernels_SOURCE_DIR_OVERRIDE:STRING=kokkos-kernels \ +$TRILINOS_DIR + + +# Notes: +# Compile using ninja +# make NP=32 + +# Allocate node: +# bsub -J TestKokkos-DepCodeOn -W 07:00 -Is -n 16 -q rhel7W bash + +# Run tests +# ctest -j8 + +# Submit tests as job +# bsub -x -Is -q rhel7W -n 16 ctest -j8 diff --git a/packages/kokkos/scripts/trilinos-integration/README.md b/packages/kokkos/scripts/trilinos-integration/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96650ca6687aa540ec5e87e93928e53f6c641311 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/README.md @@ -0,0 +1,59 @@ + + +# Kokkos: Promotion Test Debugging + +This explains the use (and basic implementation details) of promotion testing +new Kokkos and Kokkos Kernels branches. We first introduce a test for first +validating an existing Trilinos branch that all tests should be passing on (usually the develop, master, or kokkos-promotion branch of Trilinos). 
+After validating a clean promotion branch, we show how to test the same Trilinos branch +with an updated Kokkoks and Kokkos Kernels. + +# Clean Promotion Test +There is a script called `clean_promotion_test` in `scripts/trilinos-integration` that tests a self-contained +Trilinos branch with its default included Kokkos and Kokkos Kernels packages. +The script takes two arguments: +```` +./clean_promotion_test <ENV_FILE> <TRILINOS_ROOT> +```` +For running CUDA tests on the platform called White, e.g. +```` +./clean_promotion_test white_cuda_env.sh $TRILINOS_ROOT +```` +Based on the Trilinos path and ENV file, a unique hash `X` is generated. +A CMake configuration is then run in the folder `clean-test-X`. +Output of the configuration will appear in a `config.out`. +If the configuration succeeds, you can `cd` into the folder and run: +```` +> source ../<ENV_FILE> +> make -j +```` +to validate the build. The configure and build steps are deliberately separated +to allow incremental debugging of each step. + +# New Branch Promotion Test +If the clean test is passing, you can now test your updates to Kokkos. +To start the tests, there is a script that now takes four arguments: +```` +./config_promotion_test <ENV_FILE> <TRILINOS_ROOT> <KOKKOS_ROOT> <KERNELS_ROOT> +```` +where the two additional arguments are the locations of Kokkos and Kokkos Kernels +branches containing updated. Again, a unique hash `X` is generated. +A CMake configuration is then run in the folder `promotion-test-X`. +If the configuration succeeds, you can `cd` into the folder and run: +```` +> source ../<ENV_FILE> +> make -j +```` +The script uses the the source override feature of the Trilinos build system. +It creates symlinks in the Trilinos folder to your updated Kokkos +and Kokkos Kernels branches. Trilinos is then redirected to build with the updated +Kokkos via a CMake option `-DKokkos_SOURCE_DIR_OVERRIDE=kokkos`. 
+ + +##### [LICENSE](https://github.com/kokkos/kokkos/blob/master/LICENSE) + +[](https://opensource.org/licenses/BSD-3-Clause) + +Under the terms of Contract DE-NA0003525 with NTESS, +the U.S. Government retains certain rights in this software. + diff --git a/packages/kokkos/scripts/trilinos-integration/blake_jenkins_run_script_pthread_intel b/packages/kokkos/scripts/trilinos-integration/blake_jenkins_run_script_pthread_intel new file mode 100755 index 0000000000000000000000000000000000000000..f5aeacdf896c683fd2833ab4b7964698a48a1155 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/blake_jenkins_run_script_pthread_intel @@ -0,0 +1,63 @@ +#!/bin/bash -el +ulimit -c 0 +module load devpack/20171203/openmpi/2.1.2/intel/18.1.163 +# Trilinos now requires cmake version >= 3.10.0 +module swap cmake/3.9.0 cmake/3.10.2 + +KOKKOS_BRANCH=$1 +TRILINOS_UPDATE_BRANCH=$2 +TRILINOS_PRISTINE_BRANCH=$3 + +if [ -z $KOKKOS_BRANCH ] +then + KOKKOS_BRANCH=develop +fi + +if [ -z $TRILINOS_UPDATE_BRANCH ] +then + TRILINOS_UPDATE_BRANCH=develop +fi + +if [ -z $TRILINOS_PRISTINE_BRANCH ] +then + TRILINOS_PRISTINE_BRANCH=develop +fi + +export OMP_NUM_THREADS=8 +export JENKINS_DO_CUDA=OFF +export JENKINS_DO_OPENMP=OFF +export JENKINS_DO_PTHREAD=ON +export JENKINS_DO_SERIAL=OFF +export JENKINS_DO_COMPLEX=OFF + +export JENKINS_ARCH=SKX +export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX512 -mkl" +export JENKINS_ARCH_C_FLAG="-xCORE-AVX512 -mkl" +export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a" +export LAPACK_LIBRARIES=${BLAS_LIBRARIES} + +export JENKINS_DO_TESTS=ON +export JENKINS_DO_EXAMPLES=ON +export JENKINS_DO_SHARED=ON + +export QUEUE=blake + + +module load python/2.7.13 + + +export KOKKOS_PATH=${PWD}/kokkos + +#Already done: +if [ ! 
-d "${KOKKOS_PATH}" ]; then + git clone https://github.com/kokkos/kokkos ${KOKKOS_PATH} +fi + +cd ${KOKKOS_PATH} +git checkout $KOKKOS_BRANCH +git pull +cd .. + +source ${KOKKOS_PATH}/scripts/trilinos-integration/prepare_trilinos_repos.sh $TRILINOS_UPDATE_BRANCH $TRILINOS_PRISTINE_BRANCH + +${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/run_repo_comparison_slurm ${TRILINOS_UPDATED_PATH} ${TRILINOS_PRISTINE_PATH} ${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins-all TestCompare ${QUEUE} diff --git a/packages/kokkos/scripts/trilinos-integration/blake_jenkins_run_script_serial_intel b/packages/kokkos/scripts/trilinos-integration/blake_jenkins_run_script_serial_intel new file mode 100755 index 0000000000000000000000000000000000000000..a1555f9afb65675c66f65675dfe51332a938a377 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/blake_jenkins_run_script_serial_intel @@ -0,0 +1,63 @@ +#!/bin/bash -el +ulimit -c 0 +module load devpack/20171203/openmpi/2.1.2/intel/18.1.163 +# Trilinos now requires cmake version >= 3.10.0 +module swap cmake/3.9.0 cmake/3.10.2 + +KOKKOS_BRANCH=$1 +TRILINOS_UPDATE_BRANCH=$2 +TRILINOS_PRISTINE_BRANCH=$3 + +if [ -z $KOKKOS_BRANCH ] +then + KOKKOS_BRANCH=develop +fi + +if [ -z $TRILINOS_UPDATE_BRANCH ] +then + TRILINOS_UPDATE_BRANCH=develop +fi + +if [ -z $TRILINOS_PRISTINE_BRANCH ] +then + TRILINOS_PRISTINE_BRANCH=develop +fi + +export OMP_NUM_THREADS=8 +export JENKINS_DO_CUDA=OFF +export JENKINS_DO_OPENMP=OFF +export JENKINS_DO_PTHREAD=OFF +export JENKINS_DO_SERIAL=ON +export JENKINS_DO_COMPLEX=ON + +export JENKINS_ARCH=SKX +export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX512 -mkl" +export JENKINS_ARCH_C_FLAG="-xCORE-AVX512 -mkl" +export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a" +export LAPACK_LIBRARIES=${BLAS_LIBRARIES} + +export JENKINS_DO_TESTS=ON +export JENKINS_DO_EXAMPLES=ON +export 
JENKINS_DO_SHARED=ON + +export QUEUE=blake + + +module load python/2.7.13 + + +export KOKKOS_PATH=${PWD}/kokkos + +#Already done: +if [ ! -d "${KOKKOS_PATH}" ]; then + git clone https://github.com/kokkos/kokkos ${KOKKOS_PATH} +fi + +cd ${KOKKOS_PATH} +git checkout $KOKKOS_BRANCH +git pull +cd .. + +source ${KOKKOS_PATH}/scripts/trilinos-integration/prepare_trilinos_repos.sh $TRILINOS_UPDATE_BRANCH $TRILINOS_PRISTINE_BRANCH + +${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/run_repo_comparison_slurm ${TRILINOS_UPDATED_PATH} ${TRILINOS_PRISTINE_PATH} ${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins-all TestCompare ${QUEUE} diff --git a/packages/kokkos/scripts/trilinos-integration/blake_pthread_env.sh b/packages/kokkos/scripts/trilinos-integration/blake_pthread_env.sh new file mode 100755 index 0000000000000000000000000000000000000000..af76f2f9f97c6f1f0d3e9ef4609607bb6da1dd00 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/blake_pthread_env.sh @@ -0,0 +1,27 @@ +module purge +module load devpack/20171203/openmpi/2.1.2/intel/18.1.163 +# Trilinos now requires cmake version >= 3.10.0 +module swap cmake/3.9.0 cmake/3.10.2 + +export OMP_NUM_THREADS=8 +export JENKINS_DO_CUDA=OFF +export JENKINS_DO_OPENMP=OFF +export JENKINS_DO_PTHREAD=ON +export JENKINS_DO_SERIAL=OFF +export JENKINS_DO_COMPLEX=OFF + +export JENKINS_ARCH=SKX +export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX512 -mkl" +export JENKINS_ARCH_C_FLAG="-xCORE-AVX512 -mkl" +export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a" +export LAPACK_LIBRARIES=${BLAS_LIBRARIES} + +export JENKINS_DO_TESTS=ON +export JENKINS_DO_EXAMPLES=ON +export JENKINS_DO_SHARED=ON + +export QUEUE=blake + + +module load python + diff --git a/packages/kokkos/scripts/trilinos-integration/checkin-test b/packages/kokkos/scripts/trilinos-integration/checkin-test new file mode 100644 index 
0000000000000000000000000000000000000000..ffb565fcbbbb85f881053828d34208bd8e4b9e7e --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/checkin-test @@ -0,0 +1,4 @@ +module purge +module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu + +#Run Trilinos CheckinTest diff --git a/packages/kokkos/scripts/trilinos-integration/clean_promotion_test b/packages/kokkos/scripts/trilinos-integration/clean_promotion_test new file mode 100755 index 0000000000000000000000000000000000000000..78fb7649645107329de20307eecf9e99901c5629 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/clean_promotion_test @@ -0,0 +1,41 @@ +#! /usr/bin/env bash + +# $1 is the name of the environment file to source +# $2 is the path to the Trilinos repo + +ENV_FILE=$1 +TRILINOS_PATH=$2 + +export ENV_FILE=${ENV_FILE} +export TRILINOS_PATH=${TRILINOS_PATH} + + +if [ -z ${ENV_FILE} ] || [ ! -f ${ENV_FILE} ]; then + >&2 echo "Must give valid environment file as first argument" + exit 1 +fi + +if [ -z ${TRILINOS_PATH} ] || [ ! 
-d ${TRILINOS_PATH} ]; then + >&2 echo "Must give valid Trilinos path as second argument" + exit 1 +fi + + +source ${ENV_FILE} + +stringToHash="$1 $2" +hash=`cksum <<< "${stringToHash}" | cut -f 1 -d ' '` + +testFolder="clean-test-$hash" +echo "Running test in folder $testFolder" + +rm -rf $testFolder +mkdir $testFolder + +cd $testFolder + +${TRILINOS_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins \ + ${KOKKOS_EXTRA_FLAGS} >& config.out + + + diff --git a/packages/kokkos/scripts/trilinos-integration/config_promotion_test b/packages/kokkos/scripts/trilinos-integration/config_promotion_test new file mode 100755 index 0000000000000000000000000000000000000000..6a2a3c6c72aa0cc2b2bd32385cbf66671062c3d4 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/config_promotion_test @@ -0,0 +1,61 @@ +#! /usr/bin/env bash + +# $1 is the name of the environment file to source +# $2 is the path to the Trilinos repo +# $3 is the path to the Kokkos repo +# $4 is the path to the Kokkos Kernels repo + +ENV_FILE=$1 +TRILINOS_PATH=$2 +KOKKOS_PATH=$3 +KOKKOSKERNELS_PATH=$4 + +export ENV_FILE=${ENV_FILE} +export TRILINOS_PATH=${TRILINOS_PATH} +export KOKKOS_PATH=${KOKKOS_PATH} +export KOKKOSKERNELS_PATH=${KOKKOSKERNELS_PATH} + + +if [ -z ${ENV_FILE} ] || [ ! -f ${ENV_FILE} ]; then + >&2 echo "Must give valid environment file as first argument" + exit 1 +fi + +if [ -z ${TRILINOS_PATH} ] || [ ! -d ${TRILINOS_PATH} ]; then + >&2 echo "Must give valid Trilinos path as second argument" + exit 1 +fi + +if [ -z ${KOKKOS_PATH} ] || [ ! -d ${KOKKOS_PATH} ]; then + >&2 echo "Must give valid Kokkos path as third argument" + exit 1 +fi + +if [ -z ${KOKKOSKERNELS_PATH} ] || [ ! 
-d ${KOKKOSKERNELS_PATH} ]; then + >&2 echo "Must give valid KokkosKernels path as fourth argument" + exit 1 +fi + +source ${ENV_FILE} + +ln -s ${KOKKOS_PATH} ${TRILINOS_PATH}/kokkos +ln -s ${KOKKOSKERNELS_PATH} ${TRILINOS_PATH}/kokkos-kernels + +stringToHash="$1 $2 $3 $4" +hash=`cksum <<< "${stringToHash}" | cut -f 1 -d ' '` + +testFolder="promotion-test-$hash" +echo "Running test in folder $testFolder" + +rm -rf $testFolder +mkdir $testFolder + +cd $testFolder + +${TRILINOS_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins \ + -DKokkos_SOURCE_DIR_OVERRIDE=kokkos \ + -DKokkosKernels_SOURCE_DIR_OVERRIDE=kokkos-kernels \ + ${KOKKOS_EXTRA_FLAGS} >& config.out + + + diff --git a/packages/kokkos/scripts/trilinos-integration/prepare_trilinos_repos.sh b/packages/kokkos/scripts/trilinos-integration/prepare_trilinos_repos.sh new file mode 100755 index 0000000000000000000000000000000000000000..31b2ad21bdc1b8daea4d67ab7b203f47c614dc42 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/prepare_trilinos_repos.sh @@ -0,0 +1,59 @@ +#!/bin/bash -le + +TRILINOS_UPDATE_BRANCH=$1 +TRILINOS_PRISTINE_BRANCH=$2 + +if [ -z $TRILINOS_UPDATE_BRANCH ] +then + TRILINOS_UPDATE_BRANCH=develop +fi + +if [ -z $TRILINOS_PRISTINE_BRANCH ] +then + TRILINOS_PRISTINE_BRANCH=develop +fi + +export TRILINOS_UPDATED_PATH=${PWD}/trilinos-update +export TRILINOS_PRISTINE_PATH=${PWD}/trilinos-pristine + +#rm -rf ${KOKKOS_PATH} +#rm -rf ${TRILINOS_UPDATED_PATH} +#rm -rf ${TRILINOS_PRISTINE_PATH} + +#Already done: +if [ ! -d "${TRILINOS_UPDATED_PATH}" ]; then + git clone https://github.com/trilinos/trilinos ${TRILINOS_UPDATED_PATH} +fi +if [ ! -d "${TRILINOS_PRISTINE_PATH}" ]; then + git clone https://github.com/trilinos/trilinos ${TRILINOS_PRISTINE_PATH} +fi + +cd ${TRILINOS_UPDATED_PATH} +git checkout $TRILINOS_UPDATE_BRANCH +git reset --hard origin/$TRILINOS_UPDATE_BRANCH +git pull +cd .. 
+ +python kokkos/scripts/snapshot.py ${KOKKOS_PATH} ${TRILINOS_UPDATED_PATH}/packages + +cd ${TRILINOS_UPDATED_PATH} +echo "" +echo "" +echo "Trilinos State:" +git log --pretty=oneline --since=7.days +cd .. + +cd ${TRILINOS_PRISTINE_PATH} +git status +echo "Checkout $TRILINOS_PRISTINE_BRANCH" +git checkout $TRILINOS_PRISTINE_BRANCH +echo "Pull" +git pull +cd .. + +cd ${TRILINOS_PRISTINE_PATH} +echo "" +echo "" +echo "Trilinos Pristine State:" +git log --pretty=oneline --since=7.days +cd .. diff --git a/packages/kokkos/scripts/trilinos-integration/waterman_cuda_env.sh b/packages/kokkos/scripts/trilinos-integration/waterman_cuda_env.sh new file mode 100755 index 0000000000000000000000000000000000000000..445b4f9697d458dd2fb28a709e9caa6593bc332b --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/waterman_cuda_env.sh @@ -0,0 +1,38 @@ +#!/bin/bash -el +ulimit -c 0 + +module purge + +module load git +module load devpack/20180517/openmpi/2.1.2/gcc/7.2.0/cuda/9.2.88 +module swap openblas/0.2.20/gcc/7.2.0 netlib/3.8.0/gcc/7.2.0 +# Trilinos now requires cmake version >= 3.10.0 +module swap cmake/3.6.2 cmake/3.12.3 +export OMP_NUM_THREADS=8 +export JENKINS_DO_CUDA=ON +export JENKINS_DO_OPENMP=OFF +export JENKINS_DO_PTHREAD=OFF +export JENKINS_DO_SERIAL=ON +export JENKINS_DO_COMPLEX=OFF + +export JENKINS_ARCH="Power9,Volta70" +export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a;gfortran;gomp" +export LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/liblapack.a;gfortran;gomp" + +export JENKINS_DO_TESTS=ON +export JENKINS_DO_EXAMPLES=ON + +export QUEUE=rhel7F + +module load python + +export CUDA_LAUNCH_BLOCKING=1 +export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 + + +export KOKKOS_EXTRA_FLAGS="-DKokkos_ENABLE_CUDA_LAMBDA=ON" +scriptdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +echo "DIR=$scriptdir" +NVCC_WRAPPER=`realpath $scriptdir/../../bin/nvcc_wrapper` +export OMPI_CXX=$NVCC_WRAPPER + diff --git a/packages/kokkos/scripts/trilinos-integration/white_cuda_env.sh 
b/packages/kokkos/scripts/trilinos-integration/white_cuda_env.sh new file mode 100755 index 0000000000000000000000000000000000000000..f3745ede8c658fd872a06488fe99b9f7a719ac51 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/white_cuda_env.sh @@ -0,0 +1,38 @@ +#!/bin/bash -el +ulimit -c 0 + +module purge + +module load devpack/20180521/openmpi/2.1.2/gcc/7.2.0/cuda/9.2.88 +module swap openblas/0.2.20/gcc/7.2.0 netlib/3.8.0/gcc/7.2.0 +# Trilinos now requires cmake version >= 3.10.0 +module swap cmake/3.9.6 cmake/3.12.3 +export OMP_NUM_THREADS=8 +export JENKINS_DO_CUDA=ON +export JENKINS_DO_OPENMP=OFF +export JENKINS_DO_PTHREAD=OFF +export JENKINS_DO_SERIAL=ON +export JENKINS_DO_COMPLEX=OFF + +export JENKINS_ARCH="Power8,Kepler37" +export JENKINS_ARCH_CXX_FLAG="-mcpu=power8 -arch=sm_37" +export JENKINS_ARCH_C_FLAG="-mcpu=power8" +export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a;gfortran;gomp" +export LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/liblapack.a;gfortran;gomp" + +export JENKINS_DO_TESTS=ON +export JENKINS_DO_EXAMPLES=ON + +export QUEUE=rhel7F + +module load python + +export CUDA_LAUNCH_BLOCKING=1 +export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 + + +export KOKKOS_EXTRA_FLAGS="-DKokkos_ENABLE_CUDA_LAMBDA=ON" +scriptdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +NVCC_WRAPPER=`realpath $scriptdir/../../bin/nvcc_wrapper` +export OMPI_CXX=$NVCC_WRAPPER + diff --git a/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_cuda b/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_cuda new file mode 100755 index 0000000000000000000000000000000000000000..bf8bedef7ce0f3d374ca5100f10a7980742e3c4f --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_cuda @@ -0,0 +1,67 @@ +#!/bin/bash -el +ulimit -c 0 + +KOKKOS_BRANCH=$1 +TRILINOS_UPDATE_BRANCH=$2 +TRILINOS_PRISTINE_BRANCH=$3 + +if [ -z $KOKKOS_BRANCH ] +then + KOKKOS_BRANCH=develop +fi + +if [ -z $TRILINOS_UPDATE_BRANCH ] +then + 
TRILINOS_UPDATE_BRANCH=develop +fi + +if [ -z $TRILINOS_PRISTINE_BRANCH ] +then + TRILINOS_PRISTINE_BRANCH=develop +fi + +module load devpack/20180521/openmpi/2.1.2/gcc/7.2.0/cuda/9.2.88 +module swap openblas/0.2.20/gcc/7.2.0 netlib/3.8.0/gcc/7.2.0 +# Trilinos now requires cmake version >= 3.10.0 +module swap cmake/3.9.6 cmake/3.12.3 +export OMP_NUM_THREADS=8 +export JENKINS_DO_CUDA=ON +export JENKINS_DO_OPENMP=OFF +export JENKINS_DO_PTHREAD=OFF +export JENKINS_DO_SERIAL=ON +export JENKINS_DO_COMPLEX=OFF + +export JENKINS_ARCH="Power8,Kepler37" +export JENKINS_ARCH_CXX_FLAG="-mcpu=power8 -arch=sm_37" +export JENKINS_ARCH_C_FLAG="-mcpu=power8" +export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a;gfortran;gomp" +export LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/liblapack.a;gfortran;gomp" + +export JENKINS_DO_TESTS=ON +export JENKINS_DO_EXAMPLES=ON + +export QUEUE=rhel7F + +module load python/2.7.12 + +export KOKKOS_PATH=${PWD}/kokkos + +#Already done: +if [ ! -d "${KOKKOS_PATH}" ]; then + git clone https://github.com/kokkos/kokkos ${KOKKOS_PATH} +fi + +export OMPI_CXX=${KOKKOS_PATH}/bin/nvcc_wrapper + +cd ${KOKKOS_PATH} +git checkout $KOKKOS_BRANCH +git pull +cd .. 
+ +export CUDA_LAUNCH_BLOCKING=1 +export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 + +source ${KOKKOS_PATH}/scripts/trilinos-integration/prepare_trilinos_repos.sh $TRILINOS_UPDATE_BRANCH $TRILINOS_PRISTINE_BRANCH + +${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/run_repo_comparison_lsf ${TRILINOS_UPDATED_PATH} ${TRILINOS_PRISTINE_PATH} ${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins-all TestCompare ${QUEUE} + diff --git a/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_omp b/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_omp new file mode 100755 index 0000000000000000000000000000000000000000..56933f7bfca88db15e872ce53a4a846d609ebc76 --- /dev/null +++ b/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_omp @@ -0,0 +1,62 @@ +#!/bin/bash -el +ulimit -c 0 + +KOKKOS_BRANCH=$1 +TRILINOS_UPDATE_BRANCH=$2 +TRILINOS_PRISTINE_BRANCH=$3 + +if [ -z $KOKKOS_BRANCH ] +then + KOKKOS_BRANCH=develop +fi + +if [ -z $TRILINOS_UPDATE_BRANCH ] +then + TRILINOS_UPDATE_BRANCH=develop +fi + +if [ -z $TRILINOS_PRISTINE_BRANCH ] +then + TRILINOS_PRISTINE_BRANCH=develop +fi + +module load devpack/20180521/openmpi/2.1.2/gcc/7.2.0/cuda/9.2.88 +module swap openblas/0.2.20/gcc/7.2.0 netlib/3.8.0/gcc/7.2.0 +# Trilinos now requires cmake version >= 3.10.0 +module swap cmake/3.9.6 cmake/3.12.3 +export OMP_NUM_THREADS=8 +export JENKINS_DO_CUDA=OFF +export JENKINS_DO_OPENMP=ON +export JENKINS_DO_PTHREAD=OFF +export JENKINS_DO_SERIAL=OFF +export JENKINS_DO_COMPLEX=OFF + +export JENKINS_ARCH="Power8" +export JENKINS_ARCH_CXX_FLAG="-mcpu=power8" +export JENKINS_ARCH_C_FLAG="-mcpu=power8" +export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a;gfortran;gomp" +export LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/liblapack.a;gfortran;gomp" + +export JENKINS_DO_TESTS=ON +export JENKINS_DO_EXAMPLES=ON + +export QUEUE=rhel7T + +module load python/2.7.12 + +export KOKKOS_PATH=${PWD}/kokkos + +#Already done: +if [ ! 
-d "${KOKKOS_PATH}" ]; then + git clone https://github.com/kokkos/kokkos ${KOKKOS_PATH} +fi + +cd ${KOKKOS_PATH} +git checkout $KOKKOS_BRANCH +git pull +cd .. + +source ${KOKKOS_PATH}/scripts/trilinos-integration/prepare_trilinos_repos.sh $TRILINOS_UPDATE_BRANCH $TRILINOS_PRISTINE_BRANCH + +${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/run_repo_comparison_lsf ${TRILINOS_UPDATED_PATH} ${TRILINOS_PRISTINE_PATH} ${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins-all TestCompare ${QUEUE} + diff --git a/packages/kokkos/scripts/update-copyright b/packages/kokkos/scripts/update-copyright new file mode 100755 index 0000000000000000000000000000000000000000..04224f9ec937c2077b501a6e56eccaac67358787 --- /dev/null +++ b/packages/kokkos/scripts/update-copyright @@ -0,0 +1,6 @@ +files=`find . -name '*.cpp' -o -name '*.pc.in' -o -name '*.hpp' -o -name 'LICENSE' -o -name 'Copyright.txt' -o -name '*.cc'` +for file in $files; do +awk '{if($2=="Copyright" && $3=="(2014)" && $4=="Sandia" && $5=="Corporation") {print "// Copyright (2020) National Technology & Engineering"; print "// Solutions of Sandia, LLC (NTESS).";} else {print $0}}' $file | sed 's/DE-AC04-94AL85000 with Sandia Corporation/DE-NA0003525 with NTESS/g' | sed 's|Kokkos v. 2.0|Kokkos v. 
3.0|g' &> tmp +sed -i 's|SANDIA CORPORATION|NTESS|g' tmp +cp tmp $file +done diff --git a/packages/kokkos/scripts/update-version b/packages/kokkos/scripts/update-version new file mode 100755 index 0000000000000000000000000000000000000000..3cc10b33112fc90aaa33865b8e4759851e1bde70 --- /dev/null +++ b/packages/kokkos/scripts/update-version @@ -0,0 +1,35 @@ + +NEW_MAJOR=$1 +NEW_MINOR=$2 +NEW_PATCH=$3 + +GNU_MAJOR=`grep "KOKKOS_VERSION_MAJOR =" Makefile.kokkos | awk '{print $3}'` +CMAKE_MAJOR=`grep "Kokkos_VERSION_MAJOR " CMakeLists.txt | awk '{print $2}' | sed 's|)||'` + +GNU_MINOR=`grep "KOKKOS_VERSION_MINOR =" Makefile.kokkos | awk '{print $3}'` +CMAKE_MINOR=`grep "Kokkos_VERSION_MINOR " CMakeLists.txt | awk '{print $2}' | sed 's|)||'` + +GNU_PATCH=`grep "KOKKOS_VERSION_PATCH =" Makefile.kokkos | awk '{print $3}'` +CMAKE_PATCH=`grep "Kokkos_VERSION_PATCH " CMakeLists.txt | awk '{print $2}' | sed 's|)||'` + +if [ $GNU_MAJOR -ne $CMAKE_MAJOR ] +then + echo 'MAJOR Versions do not agree GNU' $GNU_MAJOR 'vs CMAKE' $CMAKE_MAJOR +fi +if [ $GNU_MINOR -ne $CMAKE_MINOR ] +then + echo 'MINOR Versions do not agree GNU' $GNU_MINOR 'vs CMAKE' $CMAKE_MINOR +fi +if [ $GNU_PATCH -ne $CMAKE_PATCH ] +then + echo 'PATCH Versions do not agree GNU' $GNU_PATCH 'vs CMAKE' $CMAKE_PATCH +fi + +sed -i.bak 's|KOKKOS_VERSION_MAJOR = '$GNU_MAJOR'|KOKKOS_VERSION_MAJOR = '$NEW_MAJOR'|g' Makefile.kokkos +sed -i.bak 's|KOKKOS_VERSION_MINOR = '$GNU_MINOR'|KOKKOS_VERSION_MINOR = '$NEW_MINOR'|g' Makefile.kokkos +sed -i.bak 's|KOKKOS_VERSION_PATCH = '$GNU_PATCH'|KOKKOS_VERSION_PATCH = '$NEW_PATCH'|g' Makefile.kokkos +rm Makefile.kokkos.bak +sed -i.bak 's|Kokkos_VERSION_MAJOR '$GNU_MAJOR')|Kokkos_VERSION_MAJOR '$NEW_MAJOR')|g' CMakeLists.txt +sed -i.bak 's|Kokkos_VERSION_MINOR '$GNU_MINOR')|Kokkos_VERSION_MINOR '$NEW_MINOR')|g' CMakeLists.txt +sed -i.bak 's|Kokkos_VERSION_PATCH '$GNU_PATCH')|Kokkos_VERSION_PATCH '$NEW_PATCH')|g' CMakeLists.txt +rm CMakeLists.txt.bak diff --git 
a/packages/kokkos/tpls/.clang-format b/packages/kokkos/tpls/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..743216e523eae0566aea6018925309936bf787d3 --- /dev/null +++ b/packages/kokkos/tpls/.clang-format @@ -0,0 +1,3 @@ +#Official Tool: clang-format version 8.0.0 +DisableFormat: true +SortIncludes: false diff --git a/packages/kokkos/tpls/gtest/gtest/LICENSE b/packages/kokkos/tpls/gtest/gtest/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..1941a11f8ce94389160b458927a29ba217542818 --- /dev/null +++ b/packages/kokkos/tpls/gtest/gtest/LICENSE @@ -0,0 +1,28 @@ +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/packages/kokkos/tpls/gtest/gtest/README b/packages/kokkos/tpls/gtest/gtest/README new file mode 100644 index 0000000000000000000000000000000000000000..82964ecc329b474002c66cf534999519e8fc39a3 --- /dev/null +++ b/packages/kokkos/tpls/gtest/gtest/README @@ -0,0 +1,13 @@ +This is a fused source version of gtest 1.7.0. All that should be necessary to +start using gtest in your package is to declare the dependency and include +gtest/gtest.h. + +However, because some of the packages that are developed in Sierra do not use a +fused source version of gtest we need to make it possible for them to build with +this version as well as with their native build. To facilitate this we have +created symlinks for the other gtest headers that they use to the fused source +gtest.h. This will make it possible for them find the headers while still using +the fuse source version. This should not have any ill effects since the header is +protected and allows for only using the non-gtest.h headers in their files. + + diff --git a/packages/kokkos/tpls/gtest/gtest/gtest-all.cc b/packages/kokkos/tpls/gtest/gtest/gtest-all.cc new file mode 100644 index 0000000000000000000000000000000000000000..7c544a382f2a23364e521104a267cfbbf2da33fb --- /dev/null +++ b/packages/kokkos/tpls/gtest/gtest/gtest-all.cc @@ -0,0 +1,9594 @@ +// Copyright 2008, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: mheule@google.com (Markus Heule) +// +// Google C++ Testing Framework (Google Test) +// +// Sometimes it's desirable to build Google Test by compiling a single file. +// This file serves this purpose. + +// This line ensures that gtest.h can be compiled on its own, even +// when it's fused. +#include "gtest/gtest.h" + +// The following lines pull in the real gtest *.cc files. +// Copyright 2005, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) + +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// Utilities for testing Google Test itself and code that uses Google Test +// (e.g. frameworks built on top of Google Test). + +#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ + + +namespace testing { + +// This helper class can be used to mock out Google Test failure reporting +// so that we can test Google Test or code that builds on Google Test. +// +// An object of this class appends a TestPartResult object to the +// TestPartResultArray object given in the constructor whenever a Google Test +// failure is reported. It can either intercept only failures that are +// generated in the same thread that created this object or it can intercept +// all generated failures. 
The scope of this mock object can be controlled with +// the second argument to the two arguments constructor. +class GTEST_API_ ScopedFakeTestPartResultReporter + : public TestPartResultReporterInterface { + public: + // The two possible mocking modes of this object. + enum InterceptMode { + INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. + INTERCEPT_ALL_THREADS // Intercepts all failures. + }; + + // The c'tor sets this object as the test part result reporter used + // by Google Test. The 'result' parameter specifies where to report the + // results. This reporter will only catch failures generated in the current + // thread. DEPRECATED + explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); + + // Same as above, but you can choose the interception scope of this object. + ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, + TestPartResultArray* result); + + // The d'tor restores the previous test part result reporter. + virtual ~ScopedFakeTestPartResultReporter(); + + // Appends the TestPartResult object to the TestPartResultArray + // received in the constructor. + // + // This method is from the TestPartResultReporterInterface + // interface. + virtual void ReportTestPartResult(const TestPartResult& result); + private: + void Init(); + + const InterceptMode intercept_mode_; + TestPartResultReporterInterface* old_reporter_; + TestPartResultArray* const result_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); +}; + +namespace internal { + +// A helper class for implementing EXPECT_FATAL_FAILURE() and +// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +class GTEST_API_ SingleFailureChecker { + public: + // The constructor remembers the arguments. 
+ SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, + const string& substr); + ~SingleFailureChecker(); + private: + const TestPartResultArray* const results_; + const TestPartResult::Type type_; + const string substr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); +}; + +} // namespace internal + +} // namespace testing + +// A set of macros for testing Google Test assertions or code that's expected +// to generate Google Test fatal failures. It verifies that the given +// statement will cause exactly one fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. EXPECT_FATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - 'statement' cannot reference local non-static variables or +// non-static members of the current object. +// - 'statement' cannot return a value. +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. The AcceptsMacroThatExpandsToUnprotectedComma test in +// gtest_unittest.cc will fail to compile if we do that. 
+#define EXPECT_FATAL_FAILURE(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper {\ + public:\ + static void Execute() { statement; }\ + };\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ + GTestExpectFatalFailureHelper::Execute();\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper {\ + public:\ + static void Execute() { statement; }\ + };\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ALL_THREADS, >est_failures);\ + GTestExpectFatalFailureHelper::Execute();\ + }\ + } while (::testing::internal::AlwaysFalse()) + +// A macro for testing Google Test assertions or code that's expected to +// generate Google Test non-fatal failures. It asserts that the given +// statement will cause exactly one non-fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// 'statement' is allowed to reference local variables and members of +// the current object. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. 
+// +// Known restrictions: +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. If we do that, the code won't compile when the user gives +// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that +// expands to code containing an unprotected comma. The +// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc +// catches that. +// +// For the same reason, we have to write +// if (::testing::internal::AlwaysTrue()) { statement; } +// instead of +// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) +// to avoid an MSVC warning on unreachable code. +#define EXPECT_NONFATAL_FAILURE(statement, substr) \ + do {\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ + if (::testing::internal::AlwaysTrue()) { statement; }\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do {\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ + >est_failures);\ + if (::testing::internal::AlwaysTrue()) { statement; }\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ + +#include <ctype.h> +#include <math.h> +#include <stdarg.h> +#include <stdio.h> 
+#include <stdlib.h> +#include <time.h> +#include <wchar.h> +#include <wctype.h> + +#include <algorithm> +#include <iomanip> +#include <limits> +#include <ostream> // NOLINT +#include <sstream> +#include <vector> + +#if GTEST_OS_LINUX + +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +# define GTEST_HAS_GETTIMEOFDAY_ 1 + +# include <fcntl.h> // NOLINT +# include <limits.h> // NOLINT +# include <sched.h> // NOLINT +// Declares vsnprintf(). This header is not available on Windows. +# include <strings.h> // NOLINT +# include <sys/mman.h> // NOLINT +# include <sys/time.h> // NOLINT +# include <unistd.h> // NOLINT +# include <string> + +#elif GTEST_OS_SYMBIAN +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include <sys/time.h> // NOLINT + +#elif GTEST_OS_ZOS +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include <sys/time.h> // NOLINT + +// On z/OS we additionally need strings.h for strcasecmp. +# include <strings.h> // NOLINT + +#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. + +# include <windows.h> // NOLINT + +#elif GTEST_OS_WINDOWS // We are on Windows proper. + +# include <io.h> // NOLINT +# include <sys/timeb.h> // NOLINT +# include <sys/types.h> // NOLINT +# include <sys/stat.h> // NOLINT + +# if GTEST_OS_WINDOWS_MINGW +// MinGW has gettimeofday() but not _ftime64(). +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +// TODO(kenton@google.com): There are other ways to get the time on +// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW +// supports these. consider using them instead. +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include <sys/time.h> // NOLINT +# endif // GTEST_OS_WINDOWS_MINGW + +// cpplint thinks that the header is already included, so we want to +// silence it. +# include <windows.h> // NOLINT + +#else + +// Assume other platforms have gettimeofday(). +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). 
+# define GTEST_HAS_GETTIMEOFDAY_ 1 + +// cpplint thinks that the header is already included, so we want to +// silence it. +# include <sys/time.h> // NOLINT +# include <unistd.h> // NOLINT + +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +# include <stdexcept> +#endif + +#if GTEST_CAN_STREAM_RESULTS_ +# include <arpa/inet.h> // NOLINT +# include <netdb.h> // NOLINT +#endif + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Utility functions and classes used by the Google C++ testing framework. +// +// Author: wan@google.com (Zhanyong Wan) +// +// This file contains purely Google Test's internal implementation. Please +// DO NOT #INCLUDE IT IN A USER PROGRAM. + +#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ +#define GTEST_SRC_GTEST_INTERNAL_INL_H_ + +// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is +// part of Google Test's implementation; otherwise it's undefined. +#if !GTEST_IMPLEMENTATION_ +// A user is trying to include this from his code - just say no. +# error "gtest-internal-inl.h is part of Google Test's internal implementation." +# error "It must not be included except by Google Test itself." +#endif // GTEST_IMPLEMENTATION_ + +#ifndef _WIN32_WCE +# include <errno.h> +#endif // !_WIN32_WCE +#include <stddef.h> +#include <stdlib.h> // For strtoll/_strtoul64/malloc/free. +#include <string.h> // For memmove. + +#include <algorithm> +#include <string> +#include <vector> + + +#if GTEST_CAN_STREAM_RESULTS_ +# include <arpa/inet.h> // NOLINT +# include <netdb.h> // NOLINT +#endif + +#if GTEST_OS_WINDOWS +# include <windows.h> // NOLINT +#endif // GTEST_OS_WINDOWS + + +namespace testing { + +// Declares the flags. +// +// We don't want the users to modify this flag in the code, but want +// Google Test's own unit tests to be able to access it. Therefore we +// declare it here as opposed to in gtest.h. 
+GTEST_DECLARE_bool_(death_test_use_fork); + +namespace internal { + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; + +// Names of the flags (needed for parsing Google Test flags). +const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; +const char kBreakOnFailureFlag[] = "break_on_failure"; +const char kCatchExceptionsFlag[] = "catch_exceptions"; +const char kColorFlag[] = "color"; +const char kFilterFlag[] = "filter"; +const char kListTestsFlag[] = "list_tests"; +const char kOutputFlag[] = "output"; +const char kPrintTimeFlag[] = "print_time"; +const char kRandomSeedFlag[] = "random_seed"; +const char kRepeatFlag[] = "repeat"; +const char kShuffleFlag[] = "shuffle"; +const char kStackTraceDepthFlag[] = "stack_trace_depth"; +const char kStreamResultToFlag[] = "stream_result_to"; +const char kThrowOnFailureFlag[] = "throw_on_failure"; + +// A valid random seed must be in [1, kMaxRandomSeed]. +const int kMaxRandomSeed = 99999; + +// g_help_flag is true iff the --help flag or an equivalent form is +// specified on the command line. +GTEST_API_ extern bool g_help_flag; + +// Returns the current time in milliseconds. +GTEST_API_ TimeInMillis GetTimeInMillis(); + +// Returns true iff Google Test should use colors in the output. +GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); + +// Formats the given time in milliseconds as seconds. +GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); + +// Converts the given time in milliseconds to a date string in the ISO 8601 +// format, without the timezone information. N.B.: due to the use the +// non-reentrant localtime() function, this function is not thread safe. Do +// not use it in any code that can be called from multiple threads. 
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); + +// Parses a string for an Int32 flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +GTEST_API_ bool ParseInt32Flag( + const char* str, const char* flag, Int32* value); + +// Returns a random seed in range [1, kMaxRandomSeed] based on the +// given --gtest_random_seed flag value. +inline int GetRandomSeedFromFlag(Int32 random_seed_flag) { + const unsigned int raw_seed = (random_seed_flag == 0) ? + static_cast<unsigned int>(GetTimeInMillis()) : + static_cast<unsigned int>(random_seed_flag); + + // Normalizes the actual seed to range [1, kMaxRandomSeed] such that + // it's easy to type. + const int normalized_seed = + static_cast<int>((raw_seed - 1U) % + static_cast<unsigned int>(kMaxRandomSeed)) + 1; + return normalized_seed; +} + +// Returns the first valid random seed after 'seed'. The behavior is +// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is +// considered to be 1. +inline int GetNextRandomSeed(int seed) { + GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) + << "Invalid random seed " << seed << " - must be in [1, " + << kMaxRandomSeed << "]."; + const int next_seed = seed + 1; + return (next_seed > kMaxRandomSeed) ? 1 : next_seed; +} + +// This class saves the values of all Google Test flags in its c'tor, and +// restores them in its d'tor. +class GTestFlagSaver { + public: + // The c'tor. 
+ GTestFlagSaver() { + also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); + break_on_failure_ = GTEST_FLAG(break_on_failure); + catch_exceptions_ = GTEST_FLAG(catch_exceptions); + color_ = GTEST_FLAG(color); + death_test_style_ = GTEST_FLAG(death_test_style); + death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); + filter_ = GTEST_FLAG(filter); + internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); + list_tests_ = GTEST_FLAG(list_tests); + output_ = GTEST_FLAG(output); + print_time_ = GTEST_FLAG(print_time); + random_seed_ = GTEST_FLAG(random_seed); + repeat_ = GTEST_FLAG(repeat); + shuffle_ = GTEST_FLAG(shuffle); + stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); + stream_result_to_ = GTEST_FLAG(stream_result_to); + throw_on_failure_ = GTEST_FLAG(throw_on_failure); + } + + // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. + ~GTestFlagSaver() { + GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; + GTEST_FLAG(break_on_failure) = break_on_failure_; + GTEST_FLAG(catch_exceptions) = catch_exceptions_; + GTEST_FLAG(color) = color_; + GTEST_FLAG(death_test_style) = death_test_style_; + GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; + GTEST_FLAG(filter) = filter_; + GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; + GTEST_FLAG(list_tests) = list_tests_; + GTEST_FLAG(output) = output_; + GTEST_FLAG(print_time) = print_time_; + GTEST_FLAG(random_seed) = random_seed_; + GTEST_FLAG(repeat) = repeat_; + GTEST_FLAG(shuffle) = shuffle_; + GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; + GTEST_FLAG(stream_result_to) = stream_result_to_; + GTEST_FLAG(throw_on_failure) = throw_on_failure_; + } + + private: + // Fields for saving the original values of flags. 
+ bool also_run_disabled_tests_; + bool break_on_failure_; + bool catch_exceptions_; + std::string color_; + std::string death_test_style_; + bool death_test_use_fork_; + std::string filter_; + std::string internal_run_death_test_; + bool list_tests_; + std::string output_; + bool print_time_; + internal::Int32 random_seed_; + internal::Int32 repeat_; + bool shuffle_; + internal::Int32 stack_trace_depth_; + std::string stream_result_to_; + bool throw_on_failure_; +} GTEST_ATTRIBUTE_UNUSED_; + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type UInt32 because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +GTEST_API_ std::string CodePointToUtf8(UInt32 code_point); + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. +// If the string contains code points that are not valid Unicode code points +// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding +// and contains invalid UTF-16 surrogate pairs, values in those pairs +// will be encoded as individual Unicode characters from Basic Normal Plane. +GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars); + +// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file +// if the variable is present. 
If a file already exists at this location, this +// function will write over it. If the variable is present, but the file cannot +// be created, prints an error and exits. +void WriteToShardStatusFileIfNeeded(); + +// Checks whether sharding is enabled by examining the relevant +// environment variable values. If the variables are present, +// but inconsistent (e.g., shard_index >= total_shards), prints +// an error and exits. If in_subprocess_for_death_test, sharding is +// disabled because it must only be applied to the original test +// process. Otherwise, we could filter out death tests we intended to execute. +GTEST_API_ bool ShouldShard(const char* total_shards_str, + const char* shard_index_str, + bool in_subprocess_for_death_test); + +// Parses the environment variable var as an Int32. If it is unset, +// returns default_val. If it is not an Int32, prints an error and +// and aborts. +GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val); + +// Given the total number of shards, the shard index, and the test id, +// returns true iff the test should be run on this shard. The test id is +// some arbitrary but unique non-negative integer assigned to each test +// method. Assumes that 0 <= shard_index < total_shards. +GTEST_API_ bool ShouldRunTestOnShard( + int total_shards, int shard_index, int test_id); + +// STL container utilities. + +// Returns the number of elements in the given container that satisfy +// the given predicate. +template <class Container, typename Predicate> +inline int CountIf(const Container& c, Predicate predicate) { + // Implemented as an explicit loop since std::count_if() in libCstd on + // Solaris has a non-standard signature. + int count = 0; + for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) { + if (predicate(*it)) + ++count; + } + return count; +} + +// Applies a function/functor to each element in the container. 
+template <class Container, typename Functor> +void ForEach(const Container& c, Functor functor) { + std::for_each(c.begin(), c.end(), functor); +} + +// Returns the i-th element of the vector, or default_value if i is not +// in range [0, v.size()). +template <typename E> +inline E GetElementOr(const std::vector<E>& v, int i, E default_value) { + return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i]; +} + +// Performs an in-place shuffle of a range of the vector's elements. +// 'begin' and 'end' are element indices as an STL-style range; +// i.e. [begin, end) are shuffled, where 'end' == size() means to +// shuffle to the end of the vector. +template <typename E> +void ShuffleRange(internal::Random* random, int begin, int end, + std::vector<E>* v) { + const int size = static_cast<int>(v->size()); + GTEST_CHECK_(0 <= begin && begin <= size) + << "Invalid shuffle range start " << begin << ": must be in range [0, " + << size << "]."; + GTEST_CHECK_(begin <= end && end <= size) + << "Invalid shuffle range finish " << end << ": must be in range [" + << begin << ", " << size << "]."; + + // Fisher-Yates shuffle, from + // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle + for (int range_width = end - begin; range_width >= 2; range_width--) { + const int last_in_range = begin + range_width - 1; + const int selected = begin + random->Generate(range_width); + std::swap((*v)[selected], (*v)[last_in_range]); + } +} + +// Performs an in-place shuffle of the vector's elements. +template <typename E> +inline void Shuffle(internal::Random* random, std::vector<E>* v) { + ShuffleRange(random, 0, static_cast<int>(v->size()), v); +} + +// A function for deleting an object. Handy for being used as a +// functor. +template <typename T> +static void Delete(T* x) { + delete x; +} + +// A predicate that checks the key of a TestProperty against a known key. +// +// TestPropertyKeyIs is copyable. +class TestPropertyKeyIs { + public: + // Constructor. 
+ // + // TestPropertyKeyIs has NO default constructor. + explicit TestPropertyKeyIs(const std::string& key) : key_(key) {} + + // Returns true iff the test name of test property matches on key_. + bool operator()(const TestProperty& test_property) const { + return test_property.key() == key_; + } + + private: + std::string key_; +}; + +// Class UnitTestOptions. +// +// This class contains functions for processing options the user +// specifies when running the tests. It has only static members. +// +// In most cases, the user can specify an option using either an +// environment variable or a command line flag. E.g. you can set the +// test filter using either GTEST_FILTER or --gtest_filter. If both +// the variable and the flag are present, the latter overrides the +// former. +class GTEST_API_ UnitTestOptions { + public: + // Functions for processing the gtest_output flag. + + // Returns the output format, or "" for normal printed output. + static std::string GetOutputFormat(); + + // Returns the absolute path of the requested output file, or the + // default (test_detail.xml in the original working directory) if + // none was explicitly specified. + static std::string GetAbsolutePathToOutputFile(); + + // Functions for processing the gtest_filter flag. + + // Returns true iff the wildcard pattern matches the string. The + // first ':' or '\0' character in pattern marks the end of it. + // + // This recursive algorithm isn't very efficient, but is clear and + // works well enough for matching test names, which are short. + static bool PatternMatchesString(const char *pattern, const char *str); + + // Returns true iff the user-specified filter matches the test case + // name and the test name. + static bool FilterMatchesTest(const std::string &test_case_name, + const std::string &test_name); + +#if GTEST_OS_WINDOWS + // Function for supporting the gtest_catch_exception flag. 
+ + // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the + // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. + // This function is useful as an __except condition. + static int GTestShouldProcessSEH(DWORD exception_code); +#endif // GTEST_OS_WINDOWS + + // Returns true if "name" matches the ':' separated list of glob-style + // filters in "filter". + static bool MatchesFilter(const std::string& name, const char* filter); +}; + +// Returns the current application's name, removing directory path if that +// is present. Used by UnitTestOptions::GetOutputFile. +GTEST_API_ FilePath GetCurrentExecutableName(); + +// The role interface for getting the OS stack trace as a string. +class OsStackTraceGetterInterface { + public: + OsStackTraceGetterInterface() {} + virtual ~OsStackTraceGetterInterface() {} + + // Returns the current OS stack trace as an std::string. Parameters: + // + // max_depth - the maximum number of stack frames to be included + // in the trace. + // skip_count - the number of top frames to be skipped; doesn't count + // against max_depth. + virtual string CurrentStackTrace(int max_depth, int skip_count) = 0; + + // UponLeavingGTest() should be called immediately before Google Test calls + // user code. It saves some information about the current stack that + // CurrentStackTrace() will use to find and hide Google Test stack frames. + virtual void UponLeavingGTest() = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); +}; + +// A working implementation of the OsStackTraceGetterInterface interface. +class OsStackTraceGetter : public OsStackTraceGetterInterface { + public: + OsStackTraceGetter() : caller_frame_(NULL) {} + + virtual string CurrentStackTrace(int max_depth, int skip_count) + GTEST_LOCK_EXCLUDED_(mutex_); + + virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_); + + // This string is inserted in place of stack frames that are part of + // Google Test's implementation. 
+ static const char* const kElidedFramesMarker; + + private: + Mutex mutex_; // protects all internal state + + // We save the stack frame below the frame that calls user code. + // We do this because the address of the frame immediately below + // the user code changes between the call to UponLeavingGTest() + // and any calls to CurrentStackTrace() from within the user code. + void* caller_frame_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); +}; + +// Information about a Google Test trace point. +struct TraceInfo { + const char* file; + int line; + std::string message; +}; + +// This is the default global test part result reporter used in UnitTestImpl. +// This class should only be used by UnitTestImpl. +class DefaultGlobalTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. Reports the test part + // result in the current test. + virtual void ReportTestPartResult(const TestPartResult& result); + + private: + UnitTestImpl* const unit_test_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); +}; + +// This is the default per thread test part result reporter used in +// UnitTestImpl. This class should only be used by UnitTestImpl. +class DefaultPerThreadTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. The implementation just + // delegates to the current global test part result reporter of *unit_test_. + virtual void ReportTestPartResult(const TestPartResult& result); + + private: + UnitTestImpl* const unit_test_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); +}; + +// The private implementation of the UnitTest class. 
We don't protect +// the methods under a mutex, as this class is not accessible by a +// user and the UnitTest class that delegates work to this class does +// proper locking. +class GTEST_API_ UnitTestImpl { + public: + explicit UnitTestImpl(UnitTest* parent); + virtual ~UnitTestImpl(); + + // There are two different ways to register your own TestPartResultReporter. + // You can register your own reporter to listen either only for test results + // from the current thread or for results from all threads. + // By default, each per-thread test result reporter just passes a new + // TestPartResult to the global test result reporter, which registers the + // test part result for the currently running test. + + // Returns the global test part result reporter. + TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); + + // Sets the global test part result reporter. + void SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter); + + // Returns the test part result reporter for the current thread. + TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); + + // Sets the test part result reporter for the current thread. + void SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter); + + // Gets the number of successful test cases. + int successful_test_case_count() const; + + // Gets the number of failed test cases. + int failed_test_case_count() const; + + // Gets the number of all test cases. + int total_test_case_count() const; + + // Gets the number of all test cases that contain at least one test + // that should run. + int test_case_to_run_count() const; + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of failed tests. + int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests. 
+ int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Gets the number of all tests. + int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns true iff the unit test passed (i.e. all test cases passed). + bool Passed() const { return !Failed(); } + + // Returns true iff the unit test failed (i.e. some test case failed + // or something outside of all tests failed). + bool Failed() const { + return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed(); + } + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + const TestCase* GetTestCase(int i) const { + const int index = GetElementOr(test_case_indices_, i, -1); + return index < 0 ? NULL : test_cases_[i]; + } + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + TestCase* GetMutableTestCase(int i) { + const int index = GetElementOr(test_case_indices_, i, -1); + return index < 0 ? NULL : test_cases_[index]; + } + + // Provides access to the event listener list. + TestEventListeners* listeners() { return &listeners_; } + + // Returns the TestResult for the test that's currently running, or + // the TestResult for the ad hoc test if no test is running. + TestResult* current_test_result(); + + // Returns the TestResult for the ad hoc test. + const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } + + // Sets the OS stack trace getter. 
+ // + // Does nothing if the input and the current OS stack trace getter + // are the same; otherwise, deletes the old getter and makes the + // input the current getter. + void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); + + // Returns the current OS stack trace getter if it is not NULL; + // otherwise, creates an OsStackTraceGetter, makes it the current + // getter, and returns it. + OsStackTraceGetterInterface* os_stack_trace_getter(); + + // Returns the current OS stack trace as an std::string. + // + // The maximum number of stack frames to be included is specified by + // the gtest_stack_trace_depth flag. The skip_count parameter + // specifies the number of top frames to be skipped, which doesn't + // count against the number of frames to be included. + // + // For example, if Foo() calls Bar(), which in turn calls + // CurrentOsStackTraceExceptTop(1), Foo() will be included in the + // trace but Bar() and CurrentOsStackTraceExceptTop() won't. + std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; + + // Finds and returns a TestCase with the given name. If one doesn't + // exist, creates one and returns it. + // + // Arguments: + // + // test_case_name: name of the test case + // type_param: the name of the test's type parameter, or NULL if + // this is not a typed or a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + TestCase* GetTestCase(const char* test_case_name, + const char* type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc); + + // Adds a TestInfo to the unit test. 
+ // + // Arguments: + // + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + // test_info: the TestInfo object + void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc, + TestInfo* test_info) { + // In order to support thread-safe death tests, we need to + // remember the original working directory when the test program + // was first invoked. We cannot do this in RUN_ALL_TESTS(), as + // the user may have changed the current directory before calling + // RUN_ALL_TESTS(). Therefore we capture the current directory in + // AddTestInfo(), which is called to register a TEST or TEST_F + // before main() is reached. + if (original_working_dir_.IsEmpty()) { + original_working_dir_.Set(FilePath::GetCurrentDir()); + GTEST_CHECK_(!original_working_dir_.IsEmpty()) + << "Failed to get the current working directory."; + } + + GetTestCase(test_info->test_case_name(), + test_info->type_param(), + set_up_tc, + tear_down_tc)->AddTestInfo(test_info); + } + +#if GTEST_HAS_PARAM_TEST + // Returns ParameterizedTestCaseRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { + return parameterized_test_registry_; + } +#endif // GTEST_HAS_PARAM_TEST + + // Sets the TestCase object for the test that's currently running. + void set_current_test_case(TestCase* a_current_test_case) { + current_test_case_ = a_current_test_case; + } + + // Sets the TestInfo object for the test that's currently running. If + // current_test_info is NULL, the assertion results will be stored in + // ad_hoc_test_result_. 
+ void set_current_test_info(TestInfo* a_current_test_info) { + current_test_info_ = a_current_test_info; + } + + // Registers all parameterized tests defined using TEST_P and + // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter + // combination. This method can be called more then once; it has guards + // protecting from registering the tests more then once. If + // value-parameterized tests are disabled, RegisterParameterizedTests is + // present but does nothing. + void RegisterParameterizedTests(); + + // Runs all tests in this UnitTest object, prints the result, and + // returns true if all tests are successful. If any exception is + // thrown during a test, this test is considered to be failed, but + // the rest of the tests will still be run. + bool RunAllTests(); + + // Clears the results of all tests, except the ad hoc tests. + void ClearNonAdHocTestResult() { + ForEach(test_cases_, TestCase::ClearTestCaseResult); + } + + // Clears the results of ad-hoc test assertions. + void ClearAdHocTestResult() { + ad_hoc_test_result_.Clear(); + } + + // Adds a TestProperty to the current TestResult object when invoked in a + // context of a test or a test case, or to the global property set. If the + // result already contains a property with the same key, the value will be + // updated. + void RecordProperty(const TestProperty& test_property); + + enum ReactionToSharding { + HONOR_SHARDING_PROTOCOL, + IGNORE_SHARDING_PROTOCOL + }; + + // Matches the full name of each test against the user-specified + // filter to decide whether the test should run, then records the + // result in each TestCase and TestInfo object. + // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests + // based on sharding variables in the environment. + // Returns the number of tests that should run. + int FilterTests(ReactionToSharding shard_tests); + + // Prints the names of the tests matching the user-specified filter flag. 
+ void ListTestsMatchingFilter(); + + const TestCase* current_test_case() const { return current_test_case_; } + TestInfo* current_test_info() { return current_test_info_; } + const TestInfo* current_test_info() const { return current_test_info_; } + + // Returns the vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector<Environment*>& environments() { return environments_; } + + // Getters for the per-thread Google Test trace stack. + std::vector<TraceInfo>& gtest_trace_stack() { + return *(gtest_trace_stack_.pointer()); + } + const std::vector<TraceInfo>& gtest_trace_stack() const { + return gtest_trace_stack_.get(); + } + +#if GTEST_HAS_DEATH_TEST + void InitDeathTestSubprocessControlInfo() { + internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag()); + } + // Returns a pointer to the parsed --gtest_internal_run_death_test + // flag, or NULL if that flag was not specified. + // This information is useful only in a death test child process. + // Must not be called before a call to InitGoogleTest. + const InternalRunDeathTestFlag* internal_run_death_test_flag() const { + return internal_run_death_test_flag_.get(); + } + + // Returns a pointer to the current death test factory. + internal::DeathTestFactory* death_test_factory() { + return death_test_factory_.get(); + } + + void SuppressTestEventsIfInSubprocess(); + + friend class ReplaceDeathTestFactory; +#endif // GTEST_HAS_DEATH_TEST + + // Initializes the event listener performing XML output as specified by + // UnitTestOptions. Must not be called before InitGoogleTest. + void ConfigureXmlOutput(); + +#if GTEST_CAN_STREAM_RESULTS_ + // Initializes the event listener for streaming test results to a socket. + // Must not be called before InitGoogleTest. + void ConfigureStreamingOutput(); +#endif + + // Performs initialization dependent upon flag values obtained in + // ParseGoogleTestFlagsOnly. 
Is called from InitGoogleTest after the call to + // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest + // this function is also called from RunAllTests. Since this function can be + // called more than once, it has to be idempotent. + void PostFlagParsingInit(); + + // Gets the random seed used at the start of the current test iteration. + int random_seed() const { return random_seed_; } + + // Gets the random number generator. + internal::Random* random() { return &random_; } + + // Shuffles all test cases, and the tests within each test case, + // making sure that death tests are still run first. + void ShuffleTests(); + + // Restores the test cases and tests to their order before the first shuffle. + void UnshuffleTests(); + + // Returns the value of GTEST_FLAG(catch_exceptions) at the moment + // UnitTest::Run() starts. + bool catch_exceptions() const { return catch_exceptions_; } + + private: + friend class ::testing::UnitTest; + + // Used by UnitTest::Run() to capture the state of + // GTEST_FLAG(catch_exceptions) at the moment it starts. + void set_catch_exceptions(bool value) { catch_exceptions_ = value; } + + // The UnitTest object that owns this implementation object. + UnitTest* const parent_; + + // The working directory when the first TEST() or TEST_F() was + // executed. + internal::FilePath original_working_dir_; + + // The default test part result reporters. + DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_; + DefaultPerThreadTestPartResultReporter + default_per_thread_test_part_result_reporter_; + + // Points to (but doesn't own) the global test part result reporter. + TestPartResultReporterInterface* global_test_part_result_repoter_; + + // Protects read and write access to global_test_part_result_reporter_. + internal::Mutex global_test_part_result_reporter_mutex_; + + // Points to (but doesn't own) the per-thread test part result reporter. 
+ internal::ThreadLocal<TestPartResultReporterInterface*> + per_thread_test_part_result_reporter_; + + // The vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector<Environment*> environments_; + + // The vector of TestCases in their original order. It owns the + // elements in the vector. + std::vector<TestCase*> test_cases_; + + // Provides a level of indirection for the test case list to allow + // easy shuffling and restoring the test case order. The i-th + // element of this vector is the index of the i-th test case in the + // shuffled order. + std::vector<int> test_case_indices_; + +#if GTEST_HAS_PARAM_TEST + // ParameterizedTestRegistry object used to register value-parameterized + // tests. + internal::ParameterizedTestCaseRegistry parameterized_test_registry_; + + // Indicates whether RegisterParameterizedTests() has been called already. + bool parameterized_tests_registered_; +#endif // GTEST_HAS_PARAM_TEST + + // Index of the last death test case registered. Initially -1. + int last_death_test_case_; + + // This points to the TestCase for the currently running test. It + // changes as Google Test goes through one test case after another. + // When no test is running, this is set to NULL and Google Test + // stores assertion results in ad_hoc_test_result_. Initially NULL. + TestCase* current_test_case_; + + // This points to the TestInfo for the currently running test. It + // changes as Google Test goes through one test after another. When + // no test is running, this is set to NULL and Google Test stores + // assertion results in ad_hoc_test_result_. Initially NULL. + TestInfo* current_test_info_; + + // Normally, a user only writes assertions inside a TEST or TEST_F, + // or inside a function called by a TEST or TEST_F. Since Google + // Test keeps track of which test is current running, it can + // associate such an assertion with the test it belongs to. 
+ // + // If an assertion is encountered when no TEST or TEST_F is running, + // Google Test attributes the assertion result to an imaginary "ad hoc" + // test, and records the result in ad_hoc_test_result_. + TestResult ad_hoc_test_result_; + + // The list of event listeners that can be used to track events inside + // Google Test. + TestEventListeners listeners_; + + // The OS stack trace getter. Will be deleted when the UnitTest + // object is destructed. By default, an OsStackTraceGetter is used, + // but the user can set this field to use a custom getter if that is + // desired. + OsStackTraceGetterInterface* os_stack_trace_getter_; + + // True iff PostFlagParsingInit() has been called. + bool post_flag_parse_init_performed_; + + // The random number seed used at the beginning of the test run. + int random_seed_; + + // Our random number generator. + internal::Random random_; + + // The time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp_; + + // How long the test took to run, in milliseconds. + TimeInMillis elapsed_time_; + +#if GTEST_HAS_DEATH_TEST + // The decomposed components of the gtest_internal_run_death_test flag, + // parsed when RUN_ALL_TESTS is called. + internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_; + internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_; +#endif // GTEST_HAS_DEATH_TEST + + // A per-thread stack of traces created by the SCOPED_TRACE() macro. + internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_; + + // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests() + // starts. + bool catch_exceptions_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); +}; // class UnitTestImpl + +// Convenience function for accessing the global UnitTest +// implementation object. 
+inline UnitTestImpl* GetUnitTestImpl() { + return UnitTest::GetInstance()->impl(); +} + +#if GTEST_USES_SIMPLE_RE + +// Internal helper functions for implementing the simple regular +// expression matcher. +GTEST_API_ bool IsInSet(char ch, const char* str); +GTEST_API_ bool IsAsciiDigit(char ch); +GTEST_API_ bool IsAsciiPunct(char ch); +GTEST_API_ bool IsRepeat(char ch); +GTEST_API_ bool IsAsciiWhiteSpace(char ch); +GTEST_API_ bool IsAsciiWordChar(char ch); +GTEST_API_ bool IsValidEscape(char ch); +GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); +GTEST_API_ bool ValidateRegex(const char* regex); +GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); +GTEST_API_ bool MatchRepetitionAndRegexAtHead( + bool escaped, char ch, char repeat, const char* regex, const char* str); +GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); + +#endif // GTEST_USES_SIMPLE_RE + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. +GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv); +GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv); + +#if GTEST_HAS_DEATH_TEST + +// Returns the message describing the last system error, regardless of the +// platform. +GTEST_API_ std::string GetLastErrnoDescription(); + +# if GTEST_OS_WINDOWS +// Provides leak-safe Windows kernel handle ownership. 
+class AutoHandle { + public: + AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} + explicit AutoHandle(HANDLE handle) : handle_(handle) {} + + ~AutoHandle() { Reset(); } + + HANDLE Get() const { return handle_; } + void Reset() { Reset(INVALID_HANDLE_VALUE); } + void Reset(HANDLE handle) { + if (handle != handle_) { + if (handle_ != INVALID_HANDLE_VALUE) + ::CloseHandle(handle_); + handle_ = handle; + } + } + + private: + HANDLE handle_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); +}; +# endif // GTEST_OS_WINDOWS + +// Attempts to parse a string into a positive integer pointed to by the +// number parameter. Returns true if that is possible. +// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use +// it here. +template <typename Integer> +bool ParseNaturalNumber(const ::std::string& str, Integer* number) { + // Fail fast if the given string does not begin with a digit; + // this bypasses strtoXXX's "optional leading whitespace and plus + // or minus sign" semantics, which are undesirable here. + if (str.empty() || !IsDigit(str[0])) { + return false; + } + errno = 0; + + char* end; + // BiggestConvertible is the largest integer type that system-provided + // string-to-number conversion routines can return. + +# if GTEST_OS_WINDOWS && !defined(__GNUC__) + + // MSVC and C++ Builder define __int64 instead of the standard long long. + typedef unsigned __int64 BiggestConvertible; + const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10); + +# else + + typedef unsigned long long BiggestConvertible; // NOLINT + const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10); + +# endif // GTEST_OS_WINDOWS && !defined(__GNUC__) + + const bool parse_success = *end == '\0' && errno == 0; + + // TODO(vladl@google.com): Convert this to compile time assertion when it is + // available. 
+ GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); + + const Integer result = static_cast<Integer>(parsed); + if (parse_success && static_cast<BiggestConvertible>(result) == parsed) { + *number = result; + return true; + } + return false; +} +#endif // GTEST_HAS_DEATH_TEST + +// TestResult contains some private methods that should be hidden from +// Google Test user but are required for testing. This class allow our tests +// to access them. +// +// This class is supplied only for the purpose of testing Google Test's own +// constructs. Do not use it in user tests, either directly or indirectly. +class TestResultAccessor { + public: + static void RecordProperty(TestResult* test_result, + const std::string& xml_element, + const TestProperty& property) { + test_result->RecordProperty(xml_element, property); + } + + static void ClearTestPartResults(TestResult* test_result) { + test_result->ClearTestPartResults(); + } + + static const std::vector<testing::TestPartResult>& test_part_results( + const TestResult& test_result) { + return test_result.test_part_results(); + } +}; + +#if GTEST_CAN_STREAM_RESULTS_ + +// Streams test results to the given port on the given host machine. +class StreamingListener : public EmptyTestEventListener { + public: + // Abstract base class for writing strings to a socket. + class AbstractSocketWriter { + public: + virtual ~AbstractSocketWriter() {} + + // Sends a string to the socket. + virtual void Send(const string& message) = 0; + + // Closes the socket. + virtual void CloseConnection() {} + + // Sends a string and a newline to the socket. + void SendLn(const string& message) { + Send(message + "\n"); + } + }; + + // Concrete class for actually writing strings to a socket. 
+ class SocketWriter : public AbstractSocketWriter { + public: + SocketWriter(const string& host, const string& port) + : sockfd_(-1), host_name_(host), port_num_(port) { + MakeConnection(); + } + + virtual ~SocketWriter() { + if (sockfd_ != -1) + CloseConnection(); + } + + // Sends a string to the socket. + virtual void Send(const string& message) { + GTEST_CHECK_(sockfd_ != -1) + << "Send() can be called only when there is a connection."; + + const int len = static_cast<int>(message.length()); + if (write(sockfd_, message.c_str(), len) != len) { + GTEST_LOG_(WARNING) + << "stream_result_to: failed to stream to " + << host_name_ << ":" << port_num_; + } + } + + private: + // Creates a client socket and connects to the server. + void MakeConnection(); + + // Closes the socket. + void CloseConnection() { + GTEST_CHECK_(sockfd_ != -1) + << "CloseConnection() can be called only when there is a connection."; + + close(sockfd_); + sockfd_ = -1; + } + + int sockfd_; // socket file descriptor + const string host_name_; + const string port_num_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); + }; // class SocketWriter + + // Escapes '=', '&', '%', and '\n' characters in str as "%xx". + static string UrlEncode(const char* str); + + StreamingListener(const string& host, const string& port) + : socket_writer_(new SocketWriter(host, port)) { Start(); } + + explicit StreamingListener(AbstractSocketWriter* socket_writer) + : socket_writer_(socket_writer) { Start(); } + + void OnTestProgramStart(const UnitTest& /* unit_test */) { + SendLn("event=TestProgramStart"); + } + + void OnTestProgramEnd(const UnitTest& unit_test) { + // Note that Google Test current only report elapsed time for each + // test iteration, not for the entire test program. + SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed())); + + // Notify the streaming server to stop. 
+ socket_writer_->CloseConnection(); + } + + void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) { + SendLn("event=TestIterationStart&iteration=" + + StreamableToString(iteration)); + } + + void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) { + SendLn("event=TestIterationEnd&passed=" + + FormatBool(unit_test.Passed()) + "&elapsed_time=" + + StreamableToString(unit_test.elapsed_time()) + "ms"); + } + + void OnTestCaseStart(const TestCase& test_case) { + SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); + } + + void OnTestCaseEnd(const TestCase& test_case) { + SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) + + "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) + + "ms"); + } + + void OnTestStart(const TestInfo& test_info) { + SendLn(std::string("event=TestStart&name=") + test_info.name()); + } + + void OnTestEnd(const TestInfo& test_info) { + SendLn("event=TestEnd&passed=" + + FormatBool((test_info.result())->Passed()) + + "&elapsed_time=" + + StreamableToString((test_info.result())->elapsed_time()) + "ms"); + } + + void OnTestPartResult(const TestPartResult& test_part_result) { + const char* file_name = test_part_result.file_name(); + if (file_name == NULL) + file_name = ""; + SendLn("event=TestPartResult&file=" + UrlEncode(file_name) + + "&line=" + StreamableToString(test_part_result.line_number()) + + "&message=" + UrlEncode(test_part_result.message())); + } + + private: + // Sends the given message and a newline to the socket. + void SendLn(const string& message) { socket_writer_->SendLn(message); } + + // Called at the start of streaming to notify the receiver what + // protocol we are using. + void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } + + string FormatBool(bool value) { return value ? 
"1" : "0"; } + + const scoped_ptr<AbstractSocketWriter> socket_writer_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); +}; // class StreamingListener + +#endif // GTEST_CAN_STREAM_RESULTS_ + +} // namespace internal +} // namespace testing + +#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ +#undef GTEST_IMPLEMENTATION_ + +#if GTEST_OS_WINDOWS +# define vsnprintf _vsnprintf +#endif // GTEST_OS_WINDOWS + +namespace testing { + +using internal::CountIf; +using internal::ForEach; +using internal::GetElementOr; +using internal::Shuffle; + +// Constants. + +// A test whose test case name or test name matches this filter is +// disabled and not run. +static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; + +// A test case whose name matches this filter is considered a death +// test case and will be run before test cases whose name doesn't +// match this filter. +static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; + +// A test filter that matches everything. +static const char kUniversalFilter[] = "*"; + +// The default output file for XML output. +static const char kDefaultOutputFile[] = "test_detail.xml"; + +// The environment variable name for the test shard index. +static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; +// The environment variable name for the total number of test shards. +static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; +// The environment variable name for the test shard status file. +static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; + +namespace internal { + +// The text used in failure messages to indicate the start of the +// stack trace. +const char kStackTraceMarker[] = "\nStack trace:\n"; + +// g_help_flag is true iff the --help flag or an equivalent form is +// specified on the command line. 
+bool g_help_flag = false; + +} // namespace internal + +static const char* GetDefaultFilter() { + return kUniversalFilter; +} + +GTEST_DEFINE_bool_( + also_run_disabled_tests, + internal::BoolFromGTestEnv("also_run_disabled_tests", false), + "Run disabled tests too, in addition to the tests normally being run."); + +GTEST_DEFINE_bool_( + break_on_failure, + internal::BoolFromGTestEnv("break_on_failure", false), + "True iff a failed assertion should be a debugger break-point."); + +GTEST_DEFINE_bool_( + catch_exceptions, + internal::BoolFromGTestEnv("catch_exceptions", true), + "True iff " GTEST_NAME_ + " should catch exceptions and treat them as test failures."); + +GTEST_DEFINE_string_( + color, + internal::StringFromGTestEnv("color", "auto"), + "Whether to use colors in the output. Valid values: yes, no, " + "and auto. 'auto' means to use colors if the output is " + "being sent to a terminal and the TERM environment variable " + "is set to a terminal type that supports colors."); + +GTEST_DEFINE_string_( + filter, + internal::StringFromGTestEnv("filter", GetDefaultFilter()), + "A colon-separated list of glob (not regex) patterns " + "for filtering the tests to run, optionally followed by a " + "'-' and a : separated list of negative patterns (tests to " + "exclude). A test is run if it matches one of the positive " + "patterns and does not match any of the negative patterns."); + +GTEST_DEFINE_bool_(list_tests, false, + "List all tests without running them."); + +GTEST_DEFINE_string_( + output, + internal::StringFromGTestEnv("output", ""), + "A format (currently must be \"xml\"), optionally followed " + "by a colon and an output file name or directory. A directory " + "is indicated by a trailing pathname separator. " + "Examples: \"xml:filename.xml\", \"xml::directoryname/\". 
" + "If a directory is specified, output files will be created " + "within that directory, with file-names based on the test " + "executable's name and, if necessary, made unique by adding " + "digits."); + +GTEST_DEFINE_bool_( + print_time, + internal::BoolFromGTestEnv("print_time", true), + "True iff " GTEST_NAME_ + " should display elapsed time in text output."); + +GTEST_DEFINE_int32_( + random_seed, + internal::Int32FromGTestEnv("random_seed", 0), + "Random number seed to use when shuffling test orders. Must be in range " + "[1, 99999], or 0 to use a seed based on the current time."); + +GTEST_DEFINE_int32_( + repeat, + internal::Int32FromGTestEnv("repeat", 1), + "How many times to repeat each test. Specify a negative number " + "for repeating forever. Useful for shaking out flaky tests."); + +GTEST_DEFINE_bool_( + show_internal_stack_frames, false, + "True iff " GTEST_NAME_ " should include internal stack frames when " + "printing test failure stack traces."); + +GTEST_DEFINE_bool_( + shuffle, + internal::BoolFromGTestEnv("shuffle", false), + "True iff " GTEST_NAME_ + " should randomize tests' order on every run."); + +GTEST_DEFINE_int32_( + stack_trace_depth, + internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), + "The maximum number of stack frames to print when an " + "assertion fails. The valid range is 0 through 100, inclusive."); + +GTEST_DEFINE_string_( + stream_result_to, + internal::StringFromGTestEnv("stream_result_to", ""), + "This flag specifies the host name and the port number on which to stream " + "test results. Example: \"localhost:555\". 
The flag is effective only on " + "Linux."); + +GTEST_DEFINE_bool_( + throw_on_failure, + internal::BoolFromGTestEnv("throw_on_failure", false), + "When this flag is specified, a failed assertion will throw an exception " + "if exceptions are enabled or exit the program with a non-zero code " + "otherwise."); + +namespace internal { + +// Generates a random number from [0, range), using a Linear +// Congruential Generator (LCG). Crashes if 'range' is 0 or greater +// than kMaxRange. +UInt32 Random::Generate(UInt32 range) { + // These constants are the same as are used in glibc's rand(3). + state_ = (1103515245U*state_ + 12345U) % kMaxRange; + + GTEST_CHECK_(range > 0) + << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range <= kMaxRange) + << "Generation of a number in [0, " << range << ") was requested, " + << "but this can only generate numbers in [0, " << kMaxRange << ")."; + + // Converting via modulus introduces a bit of downward bias, but + // it's simple, and a linear congruential generator isn't too good + // to begin with. + return state_ % range; +} + +// GTestIsInitialized() returns true iff the user has initialized +// Google Test. Useful for catching the user mistake of not initializing +// Google Test before calling RUN_ALL_TESTS(). +// +// A user must call testing::InitGoogleTest() to initialize Google +// Test. g_init_gtest_count is set to the number of times +// InitGoogleTest() has been called. We don't protect this variable +// under a mutex as it is only accessed in the main thread. +GTEST_API_ int g_init_gtest_count = 0; +static bool GTestIsInitialized() { return g_init_gtest_count != 0; } + +// Iterates over a vector of TestCases, keeping a running sum of the +// results of calling a given int-returning method on each. +// Returns the sum. 
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list, + int (TestCase::*method)() const) { + int sum = 0; + for (size_t i = 0; i < case_list.size(); i++) { + sum += (case_list[i]->*method)(); + } + return sum; +} + +// Returns true iff the test case passed. +static bool TestCasePassed(const TestCase* test_case) { + return test_case->should_run() && test_case->Passed(); +} + +// Returns true iff the test case failed. +static bool TestCaseFailed(const TestCase* test_case) { + return test_case->should_run() && test_case->Failed(); +} + +// Returns true iff test_case contains at least one test that should +// run. +static bool ShouldRunTestCase(const TestCase* test_case) { + return test_case->should_run(); +} + +// AssertHelper constructor. +AssertHelper::AssertHelper(TestPartResult::Type type, + const char* file, + int line, + const char* message) + : data_(new AssertHelperData(type, file, line, message)) { +} + +AssertHelper::~AssertHelper() { + delete data_; +} + +// Message assignment, for assertion streaming support. +void AssertHelper::operator=(const Message& message) const { + UnitTest::GetInstance()-> + AddTestPartResult(data_->type, data_->file, data_->line, + AppendUserMessage(data_->message, message), + UnitTest::GetInstance()->impl() + ->CurrentOsStackTraceExceptTop(1) + // Skips the stack frame for this function itself. + ); // NOLINT +} + +// Mutex for linked pointers. +GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); + +// Application pathname gotten in InitGoogleTest. +std::string g_executable_path; + +// Returns the current application's name, removing directory path if that +// is present. +FilePath GetCurrentExecutableName() { + FilePath result; + +#if GTEST_OS_WINDOWS + result.Set(FilePath(g_executable_path).RemoveExtension("exe")); +#else + result.Set(FilePath(g_executable_path)); +#endif // GTEST_OS_WINDOWS + + return result.RemoveDirectoryName(); +} + +// Functions for processing the gtest_output flag. 
+ +// Returns the output format, or "" for normal printed output. +std::string UnitTestOptions::GetOutputFormat() { + const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + if (gtest_output_flag == NULL) return std::string(""); + + const char* const colon = strchr(gtest_output_flag, ':'); + return (colon == NULL) ? + std::string(gtest_output_flag) : + std::string(gtest_output_flag, colon - gtest_output_flag); +} + +// Returns the name of the requested output file, or the default if none +// was explicitly specified. +std::string UnitTestOptions::GetAbsolutePathToOutputFile() { + const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + if (gtest_output_flag == NULL) + return ""; + + const char* const colon = strchr(gtest_output_flag, ':'); + if (colon == NULL) + return internal::FilePath::ConcatPaths( + internal::FilePath( + UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(kDefaultOutputFile)).string(); + + internal::FilePath output_name(colon + 1); + if (!output_name.IsAbsolutePath()) + // TODO(wan@google.com): on Windows \some\path is not an absolute + // path (as its meaning depends on the current drive), yet the + // following logic for turning it into an absolute path is wrong. + // Fix it. + output_name = internal::FilePath::ConcatPaths( + internal::FilePath(UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(colon + 1)); + + if (!output_name.IsDirectory()) + return output_name.string(); + + internal::FilePath result(internal::FilePath::GenerateUniqueFileName( + output_name, internal::GetCurrentExecutableName(), + GetOutputFormat().c_str())); + return result.string(); +} + +// Returns true iff the wildcard pattern matches the string. The +// first ':' or '\0' character in pattern marks the end of it. +// +// This recursive algorithm isn't very efficient, but is clear and +// works well enough for matching test names, which are short. 
+bool UnitTestOptions::PatternMatchesString(const char *pattern, + const char *str) { + switch (*pattern) { + case '\0': + case ':': // Either ':' or '\0' marks the end of the pattern. + return *str == '\0'; + case '?': // Matches any single character. + return *str != '\0' && PatternMatchesString(pattern + 1, str + 1); + case '*': // Matches any string (possibly empty) of characters. + return (*str != '\0' && PatternMatchesString(pattern, str + 1)) || + PatternMatchesString(pattern + 1, str); + default: // Non-special character. Matches itself. + return *pattern == *str && + PatternMatchesString(pattern + 1, str + 1); + } +} + +bool UnitTestOptions::MatchesFilter( + const std::string& name, const char* filter) { + const char *cur_pattern = filter; + for (;;) { + if (PatternMatchesString(cur_pattern, name.c_str())) { + return true; + } + + // Finds the next pattern in the filter. + cur_pattern = strchr(cur_pattern, ':'); + + // Returns if no more pattern can be found. + if (cur_pattern == NULL) { + return false; + } + + // Skips the pattern separater (the ':' character). + cur_pattern++; + } +} + +// Returns true iff the user-specified filter matches the test case +// name and the test name. +bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name, + const std::string &test_name) { + const std::string& full_name = test_case_name + "." 
+ test_name.c_str(); + + // Split --gtest_filter at '-', if there is one, to separate into + // positive filter and negative filter portions + const char* const p = GTEST_FLAG(filter).c_str(); + const char* const dash = strchr(p, '-'); + std::string positive; + std::string negative; + if (dash == NULL) { + positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter + negative = ""; + } else { + positive = std::string(p, dash); // Everything up to the dash + negative = std::string(dash + 1); // Everything after the dash + if (positive.empty()) { + // Treat '-test1' as the same as '*-test1' + positive = kUniversalFilter; + } + } + + // A filter is a colon-separated list of patterns. It matches a + // test if any pattern in it matches the test. + return (MatchesFilter(full_name, positive.c_str()) && + !MatchesFilter(full_name, negative.c_str())); +} + +#if GTEST_HAS_SEH +// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the +// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. +// This function is useful as an __except condition. +int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { + // Google Test should handle a SEH exception if: + // 1. the user wants it to, AND + // 2. this is not a breakpoint exception, AND + // 3. this is not a C++ exception (VC++ implements them via SEH, + // apparently). + // + // SEH exception code for C++ exceptions. + // (see http://support.microsoft.com/kb/185294 for more information). + const DWORD kCxxExceptionCode = 0xe06d7363; + + bool should_handle = true; + + if (!GTEST_FLAG(catch_exceptions)) + should_handle = false; + else if (exception_code == EXCEPTION_BREAKPOINT) + should_handle = false; + else if (exception_code == kCxxExceptionCode) + should_handle = false; + + return should_handle ? 
EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; +} +#endif // GTEST_HAS_SEH + +} // namespace internal + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. Intercepts only failures from the current thread. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + TestPartResultArray* result) + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), + result_(result) { + Init(); +} + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + InterceptMode intercept_mode, TestPartResultArray* result) + : intercept_mode_(intercept_mode), + result_(result) { + Init(); +} + +void ScopedFakeTestPartResultReporter::Init() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + old_reporter_ = impl->GetGlobalTestPartResultReporter(); + impl->SetGlobalTestPartResultReporter(this); + } else { + old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); + impl->SetTestPartResultReporterForCurrentThread(this); + } +} + +// The d'tor restores the test part result reporter used by Google Test +// before. +ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + impl->SetGlobalTestPartResultReporter(old_reporter_); + } else { + impl->SetTestPartResultReporterForCurrentThread(old_reporter_); + } +} + +// Increments the test part result count and remembers the result. +// This method is from the TestPartResultReporterInterface interface. 
+void ScopedFakeTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + result_->Append(result); +} + +namespace internal { + +// Returns the type ID of ::testing::Test. We should always call this +// instead of GetTypeId< ::testing::Test>() to get the type ID of +// testing::Test. This is to work around a suspected linker bug when +// using Google Test as a framework on Mac OS X. The bug causes +// GetTypeId< ::testing::Test>() to return different values depending +// on whether the call is from the Google Test framework itself or +// from user test code. GetTestTypeId() is guaranteed to always +// return the same value, as it always calls GetTypeId<>() from the +// gtest.cc, which is within the Google Test framework. +TypeId GetTestTypeId() { + return GetTypeId<Test>(); +} + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); + +// This predicate-formatter checks that 'results' contains a test part +// failure of the given type and that the failure message contains the +// given substring. +AssertionResult HasOneFailure(const char* /* results_expr */, + const char* /* type_expr */, + const char* /* substr_expr */, + const TestPartResultArray& results, + TestPartResult::Type type, + const string& substr) { + const std::string expected(type == TestPartResult::kFatalFailure ? 
+ "1 fatal failure" : + "1 non-fatal failure"); + Message msg; + if (results.size() != 1) { + msg << "Expected: " << expected << "\n" + << " Actual: " << results.size() << " failures"; + for (int i = 0; i < results.size(); i++) { + msg << "\n" << results.GetTestPartResult(i); + } + return AssertionFailure() << msg; + } + + const TestPartResult& r = results.GetTestPartResult(0); + if (r.type() != type) { + return AssertionFailure() << "Expected: " << expected << "\n" + << " Actual:\n" + << r; + } + + if (strstr(r.message(), substr.c_str()) == NULL) { + return AssertionFailure() << "Expected: " << expected << " containing \"" + << substr << "\"\n" + << " Actual:\n" + << r; + } + + return AssertionSuccess(); +} + +// The constructor of SingleFailureChecker remembers where to look up +// test part results, what type of failure we expect, and what +// substring the failure message should contain. +SingleFailureChecker:: SingleFailureChecker( + const TestPartResultArray* results, + TestPartResult::Type type, + const string& substr) + : results_(results), + type_(type), + substr_(substr) {} + +// The destructor of SingleFailureChecker verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. 
+SingleFailureChecker::~SingleFailureChecker() { + EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); +} + +DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( + UnitTestImpl* unit_test) : unit_test_(unit_test) {} + +void DefaultGlobalTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->current_test_result()->AddTestPartResult(result); + unit_test_->listeners()->repeater()->OnTestPartResult(result); +} + +DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( + UnitTestImpl* unit_test) : unit_test_(unit_test) {} + +void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); +} + +// Returns the global test part result reporter. +TestPartResultReporterInterface* +UnitTestImpl::GetGlobalTestPartResultReporter() { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + return global_test_part_result_repoter_; +} + +// Sets the global test part result reporter. +void UnitTestImpl::SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter) { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + global_test_part_result_repoter_ = reporter; +} + +// Returns the test part result reporter for the current thread. +TestPartResultReporterInterface* +UnitTestImpl::GetTestPartResultReporterForCurrentThread() { + return per_thread_test_part_result_reporter_.get(); +} + +// Sets the test part result reporter for the current thread. +void UnitTestImpl::SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter) { + per_thread_test_part_result_reporter_.set(reporter); +} + +// Gets the number of successful test cases. +int UnitTestImpl::successful_test_case_count() const { + return CountIf(test_cases_, TestCasePassed); +} + +// Gets the number of failed test cases. 
+int UnitTestImpl::failed_test_case_count() const { + return CountIf(test_cases_, TestCaseFailed); +} + +// Gets the number of all test cases. +int UnitTestImpl::total_test_case_count() const { + return static_cast<int>(test_cases_.size()); +} + +// Gets the number of all test cases that contain at least one test +// that should run. +int UnitTestImpl::test_case_to_run_count() const { + return CountIf(test_cases_, ShouldRunTestCase); +} + +// Gets the number of successful tests. +int UnitTestImpl::successful_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count); +} + +// Gets the number of failed tests. +int UnitTestImpl::failed_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTestImpl::reportable_disabled_test_count() const { + return SumOverTestCaseList(test_cases_, + &TestCase::reportable_disabled_test_count); +} + +// Gets the number of disabled tests. +int UnitTestImpl::disabled_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTestImpl::reportable_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count); +} + +// Gets the number of all tests. +int UnitTestImpl::total_test_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::total_test_count); +} + +// Gets the number of tests that should run. +int UnitTestImpl::test_to_run_count() const { + return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count); +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. 
The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// CurrentOsStackTraceExceptTop(1), Foo() will be included in the +// trace but Bar() and CurrentOsStackTraceExceptTop() won't. +std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { + (void)skip_count; + return ""; +} + +// Returns the current time in milliseconds. +TimeInMillis GetTimeInMillis() { +#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) + // Difference between 1970-01-01 and 1601-01-01 in milliseconds. + // http://analogous.blogspot.com/2005/04/epoch.html + const TimeInMillis kJavaEpochToWinFileTimeDelta = + static_cast<TimeInMillis>(116444736UL) * 100000UL; + const DWORD kTenthMicrosInMilliSecond = 10000; + + SYSTEMTIME now_systime; + FILETIME now_filetime; + ULARGE_INTEGER now_int64; + // TODO(kenton@google.com): Shouldn't this just use + // GetSystemTimeAsFileTime()? + GetSystemTime(&now_systime); + if (SystemTimeToFileTime(&now_systime, &now_filetime)) { + now_int64.LowPart = now_filetime.dwLowDateTime; + now_int64.HighPart = now_filetime.dwHighDateTime; + now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - + kJavaEpochToWinFileTimeDelta; + return now_int64.QuadPart; + } + return 0; +#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ + __timeb64 now; + +# ifdef _MSC_VER + + // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 + // (deprecated function) there. + // TODO(kenton@google.com): Use GetTickCount()? Or use + // SystemTimeToFileTime() +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4996) // Temporarily disables warning 4996. + _ftime64(&now); +# pragma warning(pop) // Restores the warning state. 
+# else + + _ftime64(&now); + +# endif // _MSC_VER + + return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm; +#elif GTEST_HAS_GETTIMEOFDAY_ + struct timeval now; + gettimeofday(&now, NULL); + return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000; +#else +# error "Don't know how to get the current time on your system." +#endif +} + +// Utilities + +// class String. + +#if GTEST_OS_WINDOWS_MOBILE +// Creates a UTF-16 wide string from the given ANSI string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the wide string, or NULL if the +// input is NULL. +LPCWSTR String::AnsiToUtf16(const char* ansi) { + if (!ansi) return NULL; + const int length = strlen(ansi); + const int unicode_length = + MultiByteToWideChar(CP_ACP, 0, ansi, length, + NULL, 0); + WCHAR* unicode = new WCHAR[unicode_length + 1]; + MultiByteToWideChar(CP_ACP, 0, ansi, length, + unicode, unicode_length); + unicode[unicode_length] = 0; + return unicode; +} + +// Creates an ANSI string from the given wide string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the ANSI string, or NULL if the +// input is NULL. +const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { + if (!utf16_str) return NULL; + const int ansi_length = + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, + NULL, 0, NULL, NULL); + char* ansi = new char[ansi_length + 1]; + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, + ansi, ansi_length, NULL, NULL); + ansi[ansi_length] = 0; + return ansi; +} + +#endif // GTEST_OS_WINDOWS_MOBILE + +// Compares two C strings. Returns true iff they have the same content. +// +// Unlike strcmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. 
+bool String::CStringEquals(const char * lhs, const char * rhs) { + if ( lhs == NULL ) return rhs == NULL; + + if ( rhs == NULL ) return false; + + return strcmp(lhs, rhs) == 0; +} + +#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING + +// Converts an array of wide chars to a narrow string using the UTF-8 +// encoding, and streams the result to the given Message object. +static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, + Message* msg) { + for (size_t i = 0; i != length; ) { // NOLINT + if (wstr[i] != L'\0') { + *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i)); + while (i != length && wstr[i] != L'\0') + i++; + } else { + *msg << '\0'; + i++; + } + } +} + +#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING + +} // namespace internal + +// Constructs an empty Message. +// We allocate the stringstream separately because otherwise each use of +// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's +// stack frame leading to huge stack frames in some cases; gcc does not reuse +// the stack space. +Message::Message() : ss_(new ::std::stringstream) { + // By default, we want there to be enough precision when printing + // a double to a Message. + *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2); +} + +// These two overloads allow streaming a wide C string to a Message +// using the UTF-8 encoding. +Message& Message::operator <<(const wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} +Message& Message::operator <<(wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} + +#if GTEST_HAS_STD_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. 
+Message& Message::operator <<(const ::std::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message& Message::operator <<(const ::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +// Gets the text streamed to this object so far as an std::string. +// Each '\0' character in the buffer is replaced with "\\0". +std::string Message::GetString() const { + return internal::StringStreamToString(ss_.get()); +} + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult& other) + : success_(other.success_), + message_(other.message_.get() != NULL ? + new ::std::string(*other.message_) : + static_cast< ::std::string*>(NULL)) { +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != NULL) + negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { + return AssertionResult(true); +} + +// Makes a failed assertion result. +AssertionResult AssertionFailure() { + return AssertionResult(false); +} + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message& message) { + return AssertionFailure() << message; +} + +namespace internal { + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. 
+// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// expected_expression: "foo" +// actual_expression: "bar" +// expected_value: "5" +// actual_value: "6" +// +// The ignoring_case parameter is true iff the assertion is a +// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will +// be inserted into the message. +AssertionResult EqFailure(const char* expected_expression, + const char* actual_expression, + const std::string& expected_value, + const std::string& actual_value, + bool ignoring_case) { + Message msg; + msg << "Value of: " << actual_expression; + if (actual_value != actual_expression) { + msg << "\n Actual: " << actual_value; + } + + msg << "\nExpected: " << expected_expression; + if (ignoring_case) { + msg << " (ignoring case)"; + } + if (expected_value != expected_expression) { + msg << "\nWhich is: " << expected_value; + } + + return AssertionFailure() << msg; +} + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, + const char* expression_text, + const char* actual_predicate_value, + const char* expected_predicate_value) { + const char* actual_message = assertion_result.message(); + Message msg; + msg << "Value of: " << expression_text + << "\n Actual: " << actual_predicate_value; + if (actual_message[0] != '\0') + msg << " (" << actual_message << ")"; + msg << "\nExpected: " << expected_predicate_value; + return msg.GetString(); +} + +// Helper function for implementing ASSERT_NEAR. 
+AssertionResult DoubleNearPredFormat(const char* expr1, + const char* expr2, + const char* abs_error_expr, + double val1, + double val2, + double abs_error) { + const double diff = fabs(val1 - val2); + if (diff <= abs_error) return AssertionSuccess(); + + // TODO(wan): do not print the value of an expression if it's + // already a literal. + return AssertionFailure() + << "The difference between " << expr1 << " and " << expr2 + << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ", and\n" + << abs_error_expr << " evaluates to " << abs_error << "."; +} + + +// Helper template for implementing FloatLE() and DoubleLE(). +template <typename RawType> +AssertionResult FloatingPointLE(const char* expr1, + const char* expr2, + RawType val1, + RawType val2) { + // Returns success if val1 is less than val2, + if (val1 < val2) { + return AssertionSuccess(); + } + + // or if val1 is almost equal to val2. + const FloatingPoint<RawType> lhs(val1), rhs(val2); + if (lhs.AlmostEquals(rhs)) { + return AssertionSuccess(); + } + + // Note that the above two checks will both fail if either val1 or + // val2 is NaN, as the IEEE floating-point standard requires that + // any predicate involving a NaN must return false. + + ::std::stringstream val1_ss; + val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2) + << val1; + + ::std::stringstream val2_ss; + val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2) + << val2; + + return AssertionFailure() + << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" + << " Actual: " << StringStreamToString(&val1_ss) << " vs " + << StringStreamToString(&val2_ss); +} + +} // namespace internal + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. 
+AssertionResult FloatLE(const char* expr1, const char* expr2, + float val1, float val2) { + return internal::FloatingPointLE<float>(expr1, expr2, val1, val2); +} + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +AssertionResult DoubleLE(const char* expr1, const char* expr2, + double val1, double val2) { + return internal::FloatingPointLE<double>(expr1, expr2, val1, val2); +} + +namespace internal { + +// The helper function for {ASSERT|EXPECT}_EQ with int or enum +// arguments. +AssertionResult CmpHelperEQ(const char* expected_expression, + const char* actual_expression, + BiggestInt expected, + BiggestInt actual) { + if (expected == actual) { + return AssertionSuccess(); + } + + return EqFailure(expected_expression, + actual_expression, + FormatForComparisonFailureMessage(expected, actual), + FormatForComparisonFailureMessage(actual, expected), + false); +} + +// A macro for implementing the helper functions needed to implement +// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here +// just to avoid copy-and-paste of similar code. +#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ +AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + BiggestInt val1, BiggestInt val2) {\ + if (val1 op val2) {\ + return AssertionSuccess();\ + } else {\ + return AssertionFailure() \ + << "Expected: (" << expr1 << ") " #op " (" << expr2\ + << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ + << " vs " << FormatForComparisonFailureMessage(val2, val1);\ + }\ +} + +// Implements the helper function for {ASSERT|EXPECT}_NE with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(NE, !=) +// Implements the helper function for {ASSERT|EXPECT}_LE with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(LE, <=) +// Implements the helper function for {ASSERT|EXPECT}_LT with int or +// enum arguments. 
+GTEST_IMPL_CMP_HELPER_(LT, < ) +// Implements the helper function for {ASSERT|EXPECT}_GE with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(GE, >=) +// Implements the helper function for {ASSERT|EXPECT}_GT with int or +// enum arguments. +GTEST_IMPL_CMP_HELPER_(GT, > ) + +#undef GTEST_IMPL_CMP_HELPER_ + +// The helper function for {ASSERT|EXPECT}_STREQ. +AssertionResult CmpHelperSTREQ(const char* expected_expression, + const char* actual_expression, + const char* expected, + const char* actual) { + if (String::CStringEquals(expected, actual)) { + return AssertionSuccess(); + } + + return EqFailure(expected_expression, + actual_expression, + PrintToString(expected), + PrintToString(actual), + false); +} + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, + const char* actual_expression, + const char* expected, + const char* actual) { + if (String::CaseInsensitiveCStringEquals(expected, actual)) { + return AssertionSuccess(); + } + + return EqFailure(expected_expression, + actual_expression, + PrintToString(expected), + PrintToString(actual), + true); +} + +// The helper function for {ASSERT|EXPECT}_STRNE. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2) { + if (!String::CStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() << "Expected: (" << s1_expression << ") != (" + << s2_expression << "), actual: \"" + << s1 << "\" vs \"" << s2 << "\""; + } +} + +// The helper function for {ASSERT|EXPECT}_STRCASENE. 
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2) { + if (!String::CaseInsensitiveCStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" + << s2_expression << ") (ignoring case), actual: \"" + << s1 << "\" vs \"" << s2 << "\""; + } +} + +} // namespace internal + +namespace { + +// Helper functions for implementing IsSubString() and IsNotSubstring(). + +// This group of overloaded functions return true iff needle is a +// substring of haystack. NULL is considered a substring of itself +// only. + +bool IsSubstringPred(const char* needle, const char* haystack) { + if (needle == NULL || haystack == NULL) + return needle == haystack; + + return strstr(haystack, needle) != NULL; +} + +bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { + if (needle == NULL || haystack == NULL) + return needle == haystack; + + return wcsstr(haystack, needle) != NULL; +} + +// StringType here can be either ::std::string or ::std::wstring. +template <typename StringType> +bool IsSubstringPred(const StringType& needle, + const StringType& haystack) { + return haystack.find(needle) != StringType::npos; +} + +// This function implements either IsSubstring() or IsNotSubstring(), +// depending on the value of the expected_to_be_substring parameter. +// StringType here can be const char*, const wchar_t*, ::std::string, +// or ::std::wstring. +template <typename StringType> +AssertionResult IsSubstringImpl( + bool expected_to_be_substring, + const char* needle_expr, const char* haystack_expr, + const StringType& needle, const StringType& haystack) { + if (IsSubstringPred(needle, haystack) == expected_to_be_substring) + return AssertionSuccess(); + + const bool is_wide_string = sizeof(needle[0]) > 1; + const char* const begin_string_quote = is_wide_string ? 
"L\"" : "\""; + return AssertionFailure() + << "Value of: " << needle_expr << "\n" + << " Actual: " << begin_string_quote << needle << "\"\n" + << "Expected: " << (expected_to_be_substring ? "" : "not ") + << "a substring of " << haystack_expr << "\n" + << "Which is: " << begin_string_quote << haystack << "\""; +} + +} // namespace + +// IsSubstring() and IsNotSubstring() check whether needle is a +// substring of haystack (NULL is considered a substring of itself +// only), and return an appropriate error message when they fail. + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +#if GTEST_HAS_STD_WSTRING +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const 
::std::wstring& needle, const ::std::wstring& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} +#endif // GTEST_HAS_STD_WSTRING + +namespace internal { + +#if GTEST_OS_WINDOWS + +namespace { + +// Helper function for IsHRESULT{SuccessFailure} predicates +AssertionResult HRESULTFailureHelper(const char* expr, + const char* expected, + long hr) { // NOLINT +# if GTEST_OS_WINDOWS_MOBILE + + // Windows CE doesn't support FormatMessage. + const char error_text[] = ""; + +# else + + // Looks up the human-readable system message for the HRESULT code + // and since we're not passing any params to FormatMessage, we don't + // want inserts expanded. + const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS; + const DWORD kBufSize = 4096; + // Gets the system's human readable message string for this HRESULT. 
+ char error_text[kBufSize] = { '\0' }; + DWORD message_length = ::FormatMessageA(kFlags, + 0, // no source, we're asking system + hr, // the error + 0, // no line width restrictions + error_text, // output buffer + kBufSize, // buf size + NULL); // no arguments for inserts + // Trims tailing white space (FormatMessage leaves a trailing CR-LF) + for (; message_length && IsSpace(error_text[message_length - 1]); + --message_length) { + error_text[message_length - 1] = '\0'; + } + +# endif // GTEST_OS_WINDOWS_MOBILE + + const std::string error_hex("0x" + String::FormatHexInt(hr)); + return ::testing::AssertionFailure() + << "Expected: " << expr << " " << expected << ".\n" + << " Actual: " << error_hex << " " << error_text << "\n"; +} + +} // namespace + +AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT + if (SUCCEEDED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "succeeds", hr); +} + +AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT + if (FAILED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "fails", hr); +} + +#endif // GTEST_OS_WINDOWS + +// Utility functions for encoding Unicode text (wide strings) in +// UTF-8. + +// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 +// like this: +// +// Code-point length Encoding +// 0 - 7 bits 0xxxxxxx +// 8 - 11 bits 110xxxxx 10xxxxxx +// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx +// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +// The maximum code-point a one-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) << 7) - 1; + +// The maximum code-point a two-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1; + +// The maximum code-point a three-byte UTF-8 sequence can represent. 
+const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1; + +// The maximum code-point a four-byte UTF-8 sequence can represent. +const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1; + +// Chops off the n lowest bits from a bit pattern. Returns the n +// lowest bits. As a side effect, the original bit pattern will be +// shifted to the right by n bits. +inline UInt32 ChopLowBits(UInt32* bits, int n) { + const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1); + *bits >>= n; + return low_bits; +} + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type UInt32 because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +std::string CodePointToUtf8(UInt32 code_point) { + if (code_point > kMaxCodePoint4) { + return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")"; + } + + char str[5]; // Big enough for the largest valid code point. 
+ if (code_point <= kMaxCodePoint1) { + str[1] = '\0'; + str[0] = static_cast<char>(code_point); // 0xxxxxxx + } else if (code_point <= kMaxCodePoint2) { + str[2] = '\0'; + str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast<char>(0xC0 | code_point); // 110xxxxx + } else if (code_point <= kMaxCodePoint3) { + str[3] = '\0'; + str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast<char>(0xE0 | code_point); // 1110xxxx + } else { // code_point <= kMaxCodePoint4 + str[4] = '\0'; + str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast<char>(0xF0 | code_point); // 11110xxx + } + return str; +} + +// The following two functions only make sense if the the system +// uses UTF-16 for wide string encoding. All supported systems +// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. + +// Determines if the arguments constitute UTF-16 surrogate pair +// and thus should be combined into a single Unicode code point +// using CreateCodePointFromUtf16SurrogatePair. +inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { + return sizeof(wchar_t) == 2 && + (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; +} + +// Creates a Unicode code point from UTF16 surrogate pair. +inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, + wchar_t second) { + const UInt32 mask = (1 << 10) - 1; + return (sizeof(wchar_t) == 2) ? + (((first & mask) << 10) | (second & mask)) + 0x10000 : + // This function should not be called when the condition is + // false, but we provide a sensible default in case it is. 
+ static_cast<UInt32>(first); +} + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. +// If the string contains code points that are not valid Unicode code points +// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding +// and contains invalid UTF-16 surrogate pairs, values in those pairs +// will be encoded as individual Unicode characters from Basic Normal Plane. +std::string WideStringToUtf8(const wchar_t* str, int num_chars) { + if (num_chars == -1) + num_chars = static_cast<int>(wcslen(str)); + + ::std::stringstream stream; + for (int i = 0; i < num_chars; ++i) { + UInt32 unicode_code_point; + + if (str[i] == L'\0') { + break; + } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { + unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], + str[i + 1]); + i++; + } else { + unicode_code_point = static_cast<UInt32>(str[i]); + } + + stream << CodePointToUtf8(unicode_code_point); + } + return StringStreamToString(&stream); +} + +// Converts a wide C string to an std::string using the UTF-8 encoding. +// NULL will be converted to "(null)". +std::string String::ShowWideCString(const wchar_t * wide_c_str) { + if (wide_c_str == NULL) return "(null)"; + + return internal::WideStringToUtf8(wide_c_str, -1); +} + +// Compares two wide C strings. Returns true iff they have the same +// content. +// +// Unlike wcscmp(), this function can handle NULL argument(s). 
A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { + if (lhs == NULL) return rhs == NULL; + + if (rhs == NULL) return false; + + return wcscmp(lhs, rhs) == 0; +} + +// Helper function for *_STREQ on wide strings. +AssertionResult CmpHelperSTREQ(const char* expected_expression, + const char* actual_expression, + const wchar_t* expected, + const wchar_t* actual) { + if (String::WideCStringEquals(expected, actual)) { + return AssertionSuccess(); + } + + return EqFailure(expected_expression, + actual_expression, + PrintToString(expected), + PrintToString(actual), + false); +} + +// Helper function for *_STRNE on wide strings. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2) { + if (!String::WideCStringEquals(s1, s2)) { + return AssertionSuccess(); + } + + return AssertionFailure() << "Expected: (" << s1_expression << ") != (" + << s2_expression << "), actual: " + << PrintToString(s1) + << " vs " << PrintToString(s2); +} + +// Compares two C strings, ignoring case. Returns true iff they have +// the same content. +// +// Unlike strcasecmp(), this function can handle NULL argument(s). A +// NULL C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { + if (lhs == NULL) + return rhs == NULL; + if (rhs == NULL) + return false; + return posix::StrCaseCmp(lhs, rhs) == 0; +} + + // Compares two wide C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. 
+ // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. +bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs) { + if (lhs == NULL) return rhs == NULL; + + if (rhs == NULL) return false; + +#if GTEST_OS_WINDOWS + return _wcsicmp(lhs, rhs) == 0; +#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID + return wcscasecmp(lhs, rhs) == 0; +#else + // Android, Mac OS X and Cygwin don't define wcscasecmp. + // Other unknown OSes may not define it either. + wint_t left, right; + do { + left = towlower(*lhs++); + right = towlower(*rhs++); + } while (left && left == right); + return left == right; +#endif // OS selector +} + +// Returns true iff str ends with the given suffix, ignoring case. +// Any string is considered to end with an empty suffix. +bool String::EndsWithCaseInsensitive( + const std::string& str, const std::string& suffix) { + const size_t str_len = str.length(); + const size_t suffix_len = suffix.length(); + return (str_len >= suffix_len) && + CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len, + suffix.c_str()); +} + +// Formats an int value as "%02d". +std::string String::FormatIntWidth2(int value) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(2) << value; + return ss.str(); +} + +// Formats an int value as "%X". +std::string String::FormatHexInt(int value) { + std::stringstream ss; + ss << std::hex << std::uppercase << value; + return ss.str(); +} + +// Formats a byte as "%02X". 
+std::string String::FormatByte(unsigned char value) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase + << static_cast<unsigned int>(value); + return ss.str(); +} + +// Converts the buffer in a stringstream to an std::string, converting NUL +// bytes to "\\0" along the way. +std::string StringStreamToString(::std::stringstream* ss) { + const ::std::string& str = ss->str(); + const char* const start = str.c_str(); + const char* const end = start + str.length(); + + std::string result; + result.reserve(2 * (end - start)); + for (const char* ch = start; ch != end; ++ch) { + if (*ch == '\0') { + result += "\\0"; // Replaces NUL with "\\0"; + } else { + result += *ch; + } + } + + return result; +} + +// Appends the user-supplied message to the Google-Test-generated message. +std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg) { + // Appends the user message if it's non-empty. + const std::string user_msg_string = user_msg.GetString(); + if (user_msg_string.empty()) { + return gtest_msg; + } + + return gtest_msg + "\n" + user_msg_string; +} + +} // namespace internal + +// class TestResult + +// Creates an empty TestResult. +TestResult::TestResult() + : death_test_count_(0), + elapsed_time_(0) { +} + +// D'tor. +TestResult::~TestResult() { +} + +// Returns the i-th test part result among all the results. i can +// range from 0 to total_part_count() - 1. If i is not in that range, +// aborts the program. +const TestPartResult& TestResult::GetTestPartResult(int i) const { + if (i < 0 || i >= total_part_count()) + internal::posix::Abort(); + return test_part_results_.at(i); +} + +// Returns the i-th test property. i can range from 0 to +// test_property_count() - 1. If i is not in that range, aborts the +// program. 
+const TestProperty& TestResult::GetTestProperty(int i) const { + if (i < 0 || i >= test_property_count()) + internal::posix::Abort(); + return test_properties_.at(i); +} + +// Clears the test part results. +void TestResult::ClearTestPartResults() { + test_part_results_.clear(); +} + +// Adds a test part result to the list. +void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { + test_part_results_.push_back(test_part_result); +} + +// Adds a test property to the list. If a property with the same key as the +// supplied property is already represented, the value of this test_property +// replaces the old value for that key. +void TestResult::RecordProperty(const std::string& xml_element, + const TestProperty& test_property) { + if (!ValidateTestProperty(xml_element, test_property)) { + return; + } + internal::MutexLock lock(&test_properites_mutex_); + const std::vector<TestProperty>::iterator property_with_matching_key = + std::find_if(test_properties_.begin(), test_properties_.end(), + internal::TestPropertyKeyIs(test_property.key())); + if (property_with_matching_key == test_properties_.end()) { + test_properties_.push_back(test_property); + return; + } + property_with_matching_key->SetValue(test_property.value()); +} + +// The list of reserved attributes used in the <testsuites> element of XML +// output. +static const char* const kReservedTestSuitesAttributes[] = { + "disabled", + "errors", + "failures", + "name", + "random_seed", + "tests", + "time", + "timestamp" +}; + +// The list of reserved attributes used in the <testsuite> element of XML +// output. +static const char* const kReservedTestSuiteAttributes[] = { + "disabled", + "errors", + "failures", + "name", + "tests", + "time" +}; + +// The list of reserved attributes used in the <testcase> element of XML output. 
+static const char* const kReservedTestCaseAttributes[] = { + "classname", + "name", + "status", + "time", + "type_param", + "value_param" +}; + +template <int kSize> +std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) { + return std::vector<std::string>(array, array + kSize); +} + +static std::vector<std::string> GetReservedAttributesForElement( + const std::string& xml_element) { + if (xml_element == "testsuites") { + return ArrayAsVector(kReservedTestSuitesAttributes); + } else if (xml_element == "testsuite") { + return ArrayAsVector(kReservedTestSuiteAttributes); + } else if (xml_element == "testcase") { + return ArrayAsVector(kReservedTestCaseAttributes); + } else { + GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; + } + // This code is unreachable but some compilers may not realizes that. + return std::vector<std::string>(); +} + +static std::string FormatWordList(const std::vector<std::string>& words) { + Message word_list; + for (size_t i = 0; i < words.size(); ++i) { + if (i > 0 && words.size() > 2) { + word_list << ", "; + } + if (i == words.size() - 1) { + word_list << "and "; + } + word_list << "'" << words[i] << "'"; + } + return word_list.GetString(); +} + +bool ValidateTestPropertyName(const std::string& property_name, + const std::vector<std::string>& reserved_names) { + if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != + reserved_names.end()) { + ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name + << " (" << FormatWordList(reserved_names) + << " are reserved by " << GTEST_NAME_ << ")"; + return false; + } + return true; +} + +// Adds a failure if the key is a reserved attribute of the element named +// xml_element. Returns true if the property is valid. 
+bool TestResult::ValidateTestProperty(const std::string& xml_element, + const TestProperty& test_property) { + return ValidateTestPropertyName(test_property.key(), + GetReservedAttributesForElement(xml_element)); +} + +// Clears the object. +void TestResult::Clear() { + test_part_results_.clear(); + test_properties_.clear(); + death_test_count_ = 0; + elapsed_time_ = 0; +} + +// Returns true iff the test failed. +bool TestResult::Failed() const { + for (int i = 0; i < total_part_count(); ++i) { + if (GetTestPartResult(i).failed()) + return true; + } + return false; +} + +// Returns true iff the test part fatally failed. +static bool TestPartFatallyFailed(const TestPartResult& result) { + return result.fatally_failed(); +} + +// Returns true iff the test fatally failed. +bool TestResult::HasFatalFailure() const { + return CountIf(test_part_results_, TestPartFatallyFailed) > 0; +} + +// Returns true iff the test part non-fatally failed. +static bool TestPartNonfatallyFailed(const TestPartResult& result) { + return result.nonfatally_failed(); +} + +// Returns true iff the test has a non-fatal failure. +bool TestResult::HasNonfatalFailure() const { + return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; +} + +// Gets the number of all test parts. This is the sum of the number +// of successful test parts and the number of failed test parts. +int TestResult::total_part_count() const { + return static_cast<int>(test_part_results_.size()); +} + +// Returns the number of the test properties. +int TestResult::test_property_count() const { + return static_cast<int>(test_properties_.size()); +} + +// class Test + +// Creates a Test object. + +// The c'tor saves the values of all Google Test flags. +Test::Test() + : gtest_flag_saver_(new internal::GTestFlagSaver) { +} + +// The d'tor restores the values of all Google Test flags. +Test::~Test() { + delete gtest_flag_saver_; +} + +// Sets up the test fixture. +// +// A sub-class may override this. 
+void Test::SetUp() { +} + +// Tears down the test fixture. +// +// A sub-class may override this. +void Test::TearDown() { +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, const std::string& value) { + UnitTest::GetInstance()->RecordProperty(key, value); +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, int value) { + Message value_message; + value_message << value; + RecordProperty(key, value_message.GetString().c_str()); +} + +namespace internal { + +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string& message) { + // This function is a friend of UnitTest and as such has access to + // AddTestPartResult. + UnitTest::GetInstance()->AddTestPartResult( + result_type, + NULL, // No info about the source file where the exception occurred. + -1, // We have no info on which line caused the exception. + message, + ""); // No stack trace, either. +} + +} // namespace internal + +// Google Test requires all tests in the same test case to use the same test +// fixture class. This function checks if the current test has the +// same fixture class as the first test in the current test case. If +// yes, it returns true; otherwise it generates a Google Test failure and +// returns false. +bool Test::HasSameFixtureClass() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + const TestCase* const test_case = impl->current_test_case(); + + // Info about the first test in the current test case. + const TestInfo* const first_test_info = test_case->test_info_list()[0]; + const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; + const char* const first_test_name = first_test_info->name(); + + // Info about the current test. 
+ const TestInfo* const this_test_info = impl->current_test_info(); + const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; + const char* const this_test_name = this_test_info->name(); + + if (this_fixture_id != first_fixture_id) { + // Is the first test defined using TEST? + const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); + // Is this test defined using TEST? + const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); + + if (first_is_TEST || this_is_TEST) { + // The user mixed TEST and TEST_F in this test case - we'll tell + // him/her how to fix it. + + // Gets the name of the TEST and the name of the TEST_F. Note + // that first_is_TEST and this_is_TEST cannot both be true, as + // the fixture IDs are different for the two tests. + const char* const TEST_name = + first_is_TEST ? first_test_name : this_test_name; + const char* const TEST_F_name = + first_is_TEST ? this_test_name : first_test_name; + + ADD_FAILURE() + << "All tests in the same test case must use the same test fixture\n" + << "class, so mixing TEST_F and TEST in the same test case is\n" + << "illegal. In test case " << this_test_info->test_case_name() + << ",\n" + << "test " << TEST_F_name << " is defined using TEST_F but\n" + << "test " << TEST_name << " is defined using TEST. You probably\n" + << "want to change the TEST to TEST_F or move it to another test\n" + << "case."; + } else { + // The user defined two fixture classes with the same name in + // two namespaces - we'll tell him/her how to fix it. + ADD_FAILURE() + << "All tests in the same test case must use the same test fixture\n" + << "class. However, in test case " + << this_test_info->test_case_name() << ",\n" + << "you defined test " << first_test_name + << " and test " << this_test_name << "\n" + << "using two different test fixture classes. This can happen if\n" + << "the two classes are from different namespaces or translation\n" + << "units and have the same name. 
You should probably rename one\n" + << "of the classes to put the tests into different test cases."; + } + return false; + } + + return true; +} + +#if GTEST_HAS_SEH + +// Adds an "exception thrown" fatal failure to the current test. This +// function returns its result via an output parameter pointer because VC++ +// prohibits creation of objects with destructors on stack in functions +// using __try (see error C2712). +static std::string* FormatSehExceptionMessage(DWORD exception_code, + const char* location) { + Message message; + message << "SEH exception with code 0x" << std::setbase(16) << + exception_code << std::setbase(10) << " thrown in " << location << "."; + + return new std::string(message.GetString()); +} + +#endif // GTEST_HAS_SEH + +namespace internal { + +#if GTEST_HAS_EXCEPTIONS + +// Adds an "exception thrown" fatal failure to the current test. +static std::string FormatCxxExceptionMessage(const char* description, + const char* location) { + Message message; + if (description != NULL) { + message << "C++ exception with description \"" << description << "\""; + } else { + message << "Unknown C++ exception"; + } + message << " thrown in " << location << "."; + + return message.GetString(); +} + +static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result); + +GoogleTestFailureException::GoogleTestFailureException( + const TestPartResult& failure) + : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} + +#endif // GTEST_HAS_EXCEPTIONS + +// We put these helper functions in the internal namespace as IBM's xlC +// compiler rejects the code if they were declared static. + +// Runs the given method and handles SEH exceptions it throws, when +// SEH is supported; returns the 0-value for type Result in case of an +// SEH exception. (Microsoft compilers cannot handle SEH and C++ +// exceptions in the same function. Therefore, we provide a separate +// wrapper function for handling SEH exceptions.) 
+template <class T, typename Result> +Result HandleSehExceptionsInMethodIfSupported( + T* object, Result (T::*method)(), const char* location) { +#if GTEST_HAS_SEH + __try { + return (object->*method)(); + } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT + GetExceptionCode())) { + // We create the exception message on the heap because VC++ prohibits + // creation of objects with destructors on stack in functions using __try + // (see error C2712). + std::string* exception_message = FormatSehExceptionMessage( + GetExceptionCode(), location); + internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, + *exception_message); + delete exception_message; + return static_cast<Result>(0); + } +#else + (void)location; + return (object->*method)(); +#endif // GTEST_HAS_SEH +} + +// Runs the given method and catches and reports C++ and/or SEH-style +// exceptions, if they are supported; returns the 0-value for type +// Result in case of an SEH exception. +template <class T, typename Result> +Result HandleExceptionsInMethodIfSupported( + T* object, Result (T::*method)(), const char* location) { + // NOTE: The user code can affect the way in which Google Test handles + // exceptions by setting GTEST_FLAG(catch_exceptions), but only before + // RUN_ALL_TESTS() starts. It is technically possible to check the flag + // after the exception is caught and either report or re-throw the + // exception based on the flag's value: + // + // try { + // // Perform the test method. + // } catch (...) { + // if (GTEST_FLAG(catch_exceptions)) + // // Report the exception as failure. + // else + // throw; // Re-throws the original exception. + // } + // + // However, the purpose of this flag is to allow the program to drop into + // the debugger when the exception is thrown. 
On most platforms, once the + // control enters the catch block, the exception origin information is + // lost and the debugger will stop the program at the point of the + // re-throw in this function -- instead of at the point of the original + // throw statement in the code under test. For this reason, we perform + // the check early, sacrificing the ability to affect Google Test's + // exception handling in the method where the exception is thrown. + if (internal::GetUnitTestImpl()->catch_exceptions()) { +#if GTEST_HAS_EXCEPTIONS + try { + return HandleSehExceptionsInMethodIfSupported(object, method, location); + } catch (const internal::GoogleTestFailureException&) { // NOLINT + // This exception type can only be thrown by a failed Google + // Test assertion with the intention of letting another testing + // framework catch it. Therefore we just re-throw it. + throw; + } catch (const std::exception& e) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(e.what(), location)); + } catch (...) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(NULL, location)); + } + return static_cast<Result>(0); +#else + return HandleSehExceptionsInMethodIfSupported(object, method, location); +#endif // GTEST_HAS_EXCEPTIONS + } else { + return (object->*method)(); + } +} + +} // namespace internal + +// Runs the test and updates the test result. +void Test::Run() { + if (!HasSameFixtureClass()) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); + // We will run the test only if SetUp() was successful. 
+ if (!HasFatalFailure()) { + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &Test::TestBody, "the test body"); + } + + // However, we want to clean up as much as possible. Hence we will + // always call TearDown(), even if SetUp() or the test body has + // failed. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &Test::TearDown, "TearDown()"); +} + +// Returns true iff the current test has a fatal failure. +bool Test::HasFatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); +} + +// Returns true iff the current test has a non-fatal failure. +bool Test::HasNonfatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()-> + HasNonfatalFailure(); +} + +// class TestInfo + +// Constructs a TestInfo object. It assumes ownership of the test factory +// object. +TestInfo::TestInfo(const std::string& a_test_case_name, + const std::string& a_name, + const char* a_type_param, + const char* a_value_param, + internal::TypeId fixture_class_id, + internal::TestFactoryBase* factory) + : test_case_name_(a_test_case_name), + name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : NULL), + value_param_(a_value_param ? new std::string(a_value_param) : NULL), + fixture_class_id_(fixture_class_id), + should_run_(false), + is_disabled_(false), + matches_filter_(false), + factory_(factory), + result_() {} + +// Destructs a TestInfo object. +TestInfo::~TestInfo() { delete factory_; } + +namespace internal { + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_case_name: name of the test case +// name: name of the test +// type_param: the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. 
+// value_param: text representation of the test's value parameter, +// or NULL if this is not a value-parameterized test. +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +TestInfo* MakeAndRegisterTestInfo( + const char* test_case_name, + const char* name, + const char* type_param, + const char* value_param, + TypeId fixture_class_id, + SetUpTestCaseFunc set_up_tc, + TearDownTestCaseFunc tear_down_tc, + TestFactoryBase* factory) { + TestInfo* const test_info = + new TestInfo(test_case_name, name, type_param, value_param, + fixture_class_id, factory); + GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); + return test_info; +} + +#if GTEST_HAS_PARAM_TEST +void ReportInvalidTestCaseType(const char* test_case_name, + const char* file, int line) { + Message errors; + errors + << "Attempted redefinition of test case " << test_case_name << ".\n" + << "All tests in the same test case must use the same test fixture\n" + << "class. However, in test case " << test_case_name << ", you tried\n" + << "to define a test using a fixture class different from the one\n" + << "used earlier. This can happen if the two fixture classes are\n" + << "from different namespaces and have the same name. You should\n" + << "probably rename one of the classes to put the tests into different\n" + << "test cases."; + + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors.GetString().c_str()); +} +#endif // GTEST_HAS_PARAM_TEST + +} // namespace internal + +namespace { + +// A predicate that checks the test name of a TestInfo against a known +// value. +// +// This is used for implementation of the TestCase class only. 
We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestNameIs is copyable. + +//Commenting out this class since its not used and wherefor produces warnings +// class TestNameIs { +// public: +// // Constructor. +// // +// // TestNameIs has NO default constructor. +// explicit TestNameIs(const char* name) +// : name_(name) {} +// +// // Returns true iff the test name of test_info matches name_. +// bool operator()(const TestInfo * test_info) const { +// return test_info && test_info->name() == name_; +// } +// +// private: +// std::string name_; +//}; + +} // namespace + +namespace internal { + +// This method expands all parameterized tests registered with macros TEST_P +// and INSTANTIATE_TEST_CASE_P into regular tests and registers those. +// This will be done just once during the program runtime. +void UnitTestImpl::RegisterParameterizedTests() { +#if GTEST_HAS_PARAM_TEST + if (!parameterized_tests_registered_) { + parameterized_test_registry_.RegisterTests(); + parameterized_tests_registered_ = true; + } +#endif +} + +} // namespace internal + +// Creates the test object, runs it, records its result, and then +// deletes it. +void TestInfo::Run() { + if (!should_run_) return; + + // Tells UnitTest where to store test result. + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_info(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Notifies the unit test event listeners that a test is about to start. + repeater->OnTestStart(*this); + + const TimeInMillis start = internal::GetTimeInMillis(); + + impl->os_stack_trace_getter()->UponLeavingGTest(); + + // Creates the test object. 
+ Test* const test = internal::HandleExceptionsInMethodIfSupported( + factory_, &internal::TestFactoryBase::CreateTest, + "the test fixture's constructor"); + + // Runs the test only if the test object was created and its + // constructor didn't generate a fatal failure. + if ((test != NULL) && !Test::HasFatalFailure()) { + // This doesn't throw as all user code that can throw are wrapped into + // exception handling code. + test->Run(); + } + + // Deletes the test object. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + test, &Test::DeleteSelf_, "the test fixture's destructor"); + + result_.set_elapsed_time(internal::GetTimeInMillis() - start); + + // Notifies the unit test event listener that a test has just finished. + repeater->OnTestEnd(*this); + + // Tells UnitTest to stop associating assertion results to this + // test. + impl->set_current_test_info(NULL); +} + +// class TestCase + +// Gets the number of successful tests in this test case. +int TestCase::successful_test_count() const { + return CountIf(test_info_list_, TestPassed); +} + +// Gets the number of failed tests in this test case. +int TestCase::failed_test_count() const { + return CountIf(test_info_list_, TestFailed); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int TestCase::reportable_disabled_test_count() const { + return CountIf(test_info_list_, TestReportableDisabled); +} + +// Gets the number of disabled tests in this test case. +int TestCase::disabled_test_count() const { + return CountIf(test_info_list_, TestDisabled); +} + +// Gets the number of tests to be printed in the XML report. +int TestCase::reportable_test_count() const { + return CountIf(test_info_list_, TestReportable); +} + +// Get the number of tests in this test case that should run. +int TestCase::test_to_run_count() const { + return CountIf(test_info_list_, ShouldRunTest); +} + +// Gets the number of all tests. 
+int TestCase::total_test_count() const { + return static_cast<int>(test_info_list_.size()); +} + +// Creates a TestCase with the given name. +// +// Arguments: +// +// name: name of the test case +// a_type_param: the name of the test case's type parameter, or NULL if +// this is not a typed or a type-parameterized test case. +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +TestCase::TestCase(const char* a_name, const char* a_type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc) + : name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : NULL), + set_up_tc_(set_up_tc), + tear_down_tc_(tear_down_tc), + should_run_(false), + elapsed_time_(0) { +} + +// Destructor of TestCase. +TestCase::~TestCase() { + // Deletes every Test in the collection. + ForEach(test_info_list_, internal::Delete<TestInfo>); +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +const TestInfo* TestCase::GetTestInfo(int i) const { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? NULL : test_info_list_[index]; +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +TestInfo* TestCase::GetMutableTestInfo(int i) { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? NULL : test_info_list_[index]; +} + +// Adds a test to this test case. Will delete the test upon +// destruction of the TestCase object. +void TestCase::AddTestInfo(TestInfo * test_info) { + test_info_list_.push_back(test_info); + test_indices_.push_back(static_cast<int>(test_indices_.size())); +} + +// Runs every test in this TestCase. 
+void TestCase::Run() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_case(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + repeater->OnTestCaseStart(*this); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestCase::RunSetUpTestCase, "SetUpTestCase()"); + + const internal::TimeInMillis start = internal::GetTimeInMillis(); + for (int i = 0; i < total_test_count(); i++) { + GetMutableTestInfo(i)->Run(); + } + elapsed_time_ = internal::GetTimeInMillis() - start; + + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestCase::RunTearDownTestCase, "TearDownTestCase()"); + + repeater->OnTestCaseEnd(*this); + impl->set_current_test_case(NULL); +} + +// Clears the results of all tests in this test case. +void TestCase::ClearResult() { + ad_hoc_test_result_.Clear(); + ForEach(test_info_list_, TestInfo::ClearTestResult); +} + +// Shuffles the tests in this test case. +void TestCase::ShuffleTests(internal::Random* random) { + Shuffle(random, &test_indices_); +} + +// Restores the test order to before the first shuffle. +void TestCase::UnshuffleTests() { + for (size_t i = 0; i < test_indices_.size(); i++) { + test_indices_[i] = static_cast<int>(i); + } +} + +// Formats a countable noun. Depending on its quantity, either the +// singular form or the plural form is used. e.g. +// +// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". +// FormatCountableNoun(5, "book", "books") returns "5 books". +static std::string FormatCountableNoun(int count, + const char * singular_form, + const char * plural_form) { + return internal::StreamableToString(count) + " " + + (count == 1 ? singular_form : plural_form); +} + +// Formats the count of tests. 
+static std::string FormatTestCount(int test_count) { + return FormatCountableNoun(test_count, "test", "tests"); +} + +// Formats the count of test cases. +static std::string FormatTestCaseCount(int test_case_count) { + return FormatCountableNoun(test_case_count, "test case", "test cases"); +} + +// Converts a TestPartResult::Type enum to human-friendly string +// representation. Both kNonFatalFailure and kFatalFailure are translated +// to "Failure", as the user usually doesn't care about the difference +// between the two when viewing the test result. +static const char * TestPartResultTypeToString(TestPartResult::Type type) { + switch (type) { + case TestPartResult::kSuccess: + return "Success"; + + case TestPartResult::kNonFatalFailure: + case TestPartResult::kFatalFailure: +#ifdef _MSC_VER + return "error: "; +#else + return "Failure\n"; +#endif + default: + return "Unknown result type"; + } +} + +namespace internal { + +// Prints a TestPartResult to an std::string. +static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result) { + return (Message() + << internal::FormatFileLocation(test_part_result.file_name(), + test_part_result.line_number()) + << " " << TestPartResultTypeToString(test_part_result.type()) + << test_part_result.message()).GetString(); +} + +// Prints a TestPartResult. +static void PrintTestPartResult(const TestPartResult& test_part_result) { + const std::string& result = + PrintTestPartResultToString(test_part_result); + printf("%s\n", result.c_str()); + fflush(stdout); + // If the test program runs in Visual Studio or a debugger, the + // following statements add the test part result message to the Output + // window such that the user can double-click on it to jump to the + // corresponding source code location; otherwise they do nothing. 
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + // We don't call OutputDebugString*() on Windows Mobile, as printing + // to stdout is done by OutputDebugString() there already - we don't + // want the same message printed twice. + ::OutputDebugStringA(result.c_str()); + ::OutputDebugStringA("\n"); +#endif +} + +// class PrettyUnitTestResultPrinter + +enum GTestColor { + COLOR_DEFAULT, + COLOR_RED, + COLOR_GREEN, + COLOR_YELLOW +}; + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + +// Returns the character attribute for the given color. +WORD GetColorAttribute(GTestColor color) { + switch (color) { + case COLOR_RED: return FOREGROUND_RED; + case COLOR_GREEN: return FOREGROUND_GREEN; + case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; + default: return 0; + } +} + +#else + +// Returns the ANSI color code for the given color. COLOR_DEFAULT is +// an invalid input. +const char* GetAnsiColorCode(GTestColor color) { + switch (color) { + case COLOR_RED: return "1"; + case COLOR_GREEN: return "2"; + case COLOR_YELLOW: return "3"; + default: return NULL; + }; +} + +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + +// Returns true iff Google Test should use colors in the output. +bool ShouldUseColor(bool stdout_is_tty) { + const char* const gtest_color = GTEST_FLAG(color).c_str(); + + if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { +#if GTEST_OS_WINDOWS + // On Windows the TERM variable is usually not set, but the + // console there does support colors. + return stdout_is_tty; +#else + // On non-Windows platforms, we rely on the TERM variable. 
+ const char* const term = posix::GetEnv("TERM"); + const bool term_supports_color = + String::CStringEquals(term, "xterm") || + String::CStringEquals(term, "xterm-color") || + String::CStringEquals(term, "xterm-256color") || + String::CStringEquals(term, "screen") || + String::CStringEquals(term, "screen-256color") || + String::CStringEquals(term, "linux") || + String::CStringEquals(term, "cygwin"); + return stdout_is_tty && term_supports_color; +#endif // GTEST_OS_WINDOWS + } + + return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || + String::CaseInsensitiveCStringEquals(gtest_color, "true") || + String::CaseInsensitiveCStringEquals(gtest_color, "t") || + String::CStringEquals(gtest_color, "1"); + // We take "yes", "true", "t", and "1" as meaning "yes". If the + // value is neither one of these nor "auto", we treat it as "no" to + // be conservative. +} + +// Helpers for printing colored strings to stdout. Note that on Windows, we +// cannot simply emit special characters and have the terminal change colors. +// This routine must actually emit the characters rather than return a string +// that would be colored when printed, as can be done on Linux. +void ColoredPrintf(GTestColor color, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || GTEST_OS_IOS + const bool use_color = false; +#else + static const bool in_color_mode = + ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); + const bool use_color = in_color_mode && (color != COLOR_DEFAULT); +#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS + // The '!= 0' comparison is necessary to satisfy MSVC 7.1. + + if (!use_color) { + vprintf(fmt, args); + va_end(args); + return; + } + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); + + // Gets the current text color. 
+ CONSOLE_SCREEN_BUFFER_INFO buffer_info; + GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); + const WORD old_color_attrs = buffer_info.wAttributes; + + // We need to flush the stream buffers into the console before each + // SetConsoleTextAttribute call lest it affect the text that is already + // printed but has not yet reached the console. + fflush(stdout); + SetConsoleTextAttribute(stdout_handle, + GetColorAttribute(color) | FOREGROUND_INTENSITY); + vprintf(fmt, args); + + fflush(stdout); + // Restores the text color. + SetConsoleTextAttribute(stdout_handle, old_color_attrs); +#else + printf("\033[0;3%sm", GetAnsiColorCode(color)); + vprintf(fmt, args); + printf("\033[m"); // Resets the terminal to default. +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + va_end(args); +} + +// Text printed in Google Test's text output and --gunit_list_tests +// output to label the type parameter and value parameter for a test. +static const char kTypeParamLabel[] = "TypeParam"; +static const char kValueParamLabel[] = "GetParam()"; + +void PrintFullTestCommentIfPresent(const TestInfo& test_info) { + const char* const type_param = test_info.type_param(); + const char* const value_param = test_info.value_param(); + + if (type_param != NULL || value_param != NULL) { + printf(", where "); + if (type_param != NULL) { + printf("%s = %s", kTypeParamLabel, type_param); + if (value_param != NULL) + printf(" and "); + } + if (value_param != NULL) { + printf("%s = %s", kValueParamLabel, value_param); + } + } +} + +// This class implements the TestEventListener interface. +// +// Class PrettyUnitTestResultPrinter is copyable. +class PrettyUnitTestResultPrinter : public TestEventListener { + public: + PrettyUnitTestResultPrinter() {} + static void PrintTestName(const char * test_case, const char * test) { + printf("%s.%s", test_case, test); + } + + // The following methods override what's in the TestEventListener class. 
+ virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); + virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); + virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestCaseStart(const TestCase& test_case); + virtual void OnTestStart(const TestInfo& test_info); + virtual void OnTestPartResult(const TestPartResult& result); + virtual void OnTestEnd(const TestInfo& test_info); + virtual void OnTestCaseEnd(const TestCase& test_case); + virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); + virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} + + private: + static void PrintFailedTests(const UnitTest& unit_test); +}; + + // Fired before each iteration of tests starts. +void PrettyUnitTestResultPrinter::OnTestIterationStart( + const UnitTest& unit_test, int iteration) { + if (GTEST_FLAG(repeat) != 1) + printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); + + const char* const filter = GTEST_FLAG(filter).c_str(); + + // Prints the filter if it's not *. This reminds the user that some + // tests may be skipped. 
+ if (!String::CStringEquals(filter, kUniversalFilter)) { + ColoredPrintf(COLOR_YELLOW, + "Note: %s filter = %s\n", GTEST_NAME_, filter); + } + + if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { + const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); + ColoredPrintf(COLOR_YELLOW, + "Note: This is test shard %d of %s.\n", + static_cast<int>(shard_index) + 1, + internal::posix::GetEnv(kTestTotalShards)); + } + + if (GTEST_FLAG(shuffle)) { + ColoredPrintf(COLOR_YELLOW, + "Note: Randomizing tests' orders with a seed of %d .\n", + unit_test.random_seed()); + } + + ColoredPrintf(COLOR_GREEN, "[==========] "); + printf("Running %s from %s.\n", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("Global test environment set-up.\n"); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s", counts.c_str(), test_case.name()); + if (test_case.type_param() == NULL) { + printf("\n"); + } else { + printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param()); + } + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { + ColoredPrintf(COLOR_GREEN, "[ RUN ] "); + PrintTestName(test_info.test_case_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + +// Called after an assertion failure. +void PrettyUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + // If the test part succeeded, we don't need to do anything. 
+ if (result.type() == TestPartResult::kSuccess) + return; + + // Print failure message from the assertion (e.g. expected this and got that). + PrintTestPartResult(result); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Passed()) { + ColoredPrintf(COLOR_GREEN, "[ OK ] "); + } else { + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + } + PrintTestName(test_info.test_case_name(), test_info.name()); + if (test_info.result()->Failed()) + PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG(print_time)) { + printf(" (%s ms)\n", internal::StreamableToString( + test_info.result()->elapsed_time()).c_str()); + } else { + printf("\n"); + } + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { + if (!GTEST_FLAG(print_time)) return; + + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s (%s ms total)\n\n", + counts.c_str(), test_case.name(), + internal::StreamableToString(test_case.elapsed_time()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("Global test environment tear-down\n"); + fflush(stdout); +} + +// Internal helper for printing the list of failed tests. 
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { + const int failed_test_count = unit_test.failed_test_count(); + if (failed_test_count == 0) { + return; + } + + for (int i = 0; i < unit_test.total_test_case_count(); ++i) { + const TestCase& test_case = *unit_test.GetTestCase(i); + if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_case.total_test_count(); ++j) { + const TestInfo& test_info = *test_case.GetTestInfo(j); + if (!test_info.should_run() || test_info.result()->Passed()) { + continue; + } + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s.%s", test_case.name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + printf("\n"); + } + } +} + +void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(COLOR_GREEN, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + if (GTEST_FLAG(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + int num_failures = unit_test.failed_test_count(); + if (!unit_test.Passed()) { + const int failed_test_count = unit_test.failed_test_count(); + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); + PrintFailedTests(unit_test); + printf("\n%2d FAILED %s\n", num_failures, + num_failures == 1 ? "TEST" : "TESTS"); + } + + int num_disabled = unit_test.reportable_disabled_test_count(); + if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (!num_failures) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. 
+ } + ColoredPrintf(COLOR_YELLOW, + " YOU HAVE %d DISABLED %s\n\n", + num_disabled, + num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. + fflush(stdout); +} + +// End PrettyUnitTestResultPrinter + +// class TestEventRepeater +// +// This class forwards events to other event listeners. +class TestEventRepeater : public TestEventListener { + public: + TestEventRepeater() : forwarding_enabled_(true) {} + virtual ~TestEventRepeater(); + void Append(TestEventListener *listener); + TestEventListener* Release(TestEventListener* listener); + + // Controls whether events will be forwarded to listeners_. Set to false + // in death test child processes. + bool forwarding_enabled() const { return forwarding_enabled_; } + void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; } + + virtual void OnTestProgramStart(const UnitTest& unit_test); + virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); + virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); + virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test); + virtual void OnTestCaseStart(const TestCase& test_case); + virtual void OnTestStart(const TestInfo& test_info); + virtual void OnTestPartResult(const TestPartResult& result); + virtual void OnTestEnd(const TestInfo& test_info); + virtual void OnTestCaseEnd(const TestCase& test_case); + virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); + virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test); + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + virtual void OnTestProgramEnd(const UnitTest& unit_test); + + private: + // Controls whether events will be forwarded to listeners_. Set to false + // in death test child processes. + bool forwarding_enabled_; + // The list of listeners that receive events. 
+ std::vector<TestEventListener*> listeners_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater); +}; + +TestEventRepeater::~TestEventRepeater() { + ForEach(listeners_, Delete<TestEventListener>); +} + +void TestEventRepeater::Append(TestEventListener *listener) { + listeners_.push_back(listener); +} + +// TODO(vladl@google.com): Factor the search functionality into Vector::Find. +TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { + for (size_t i = 0; i < listeners_.size(); ++i) { + if (listeners_[i] == listener) { + listeners_.erase(listeners_.begin() + i); + return listener; + } + } + + return NULL; +} + +// Since most methods are very similar, use macros to reduce boilerplate. +// This defines a member that forwards the call to all listeners. +#define GTEST_REPEATER_METHOD_(Name, Type) \ +void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (size_t i = 0; i < listeners_.size(); i++) { \ + listeners_[i]->Name(parameter); \ + } \ + } \ +} +// This defines a member that forwards the call to all listeners in reverse +// order. 
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ +void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \ + listeners_[i]->Name(parameter); \ + } \ + } \ +} + +GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest) +GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest) +GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase) +GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) +GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) +GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) +GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase) +GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest) + +#undef GTEST_REPEATER_METHOD_ +#undef GTEST_REVERSE_REPEATER_METHOD_ + +void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test, + int iteration) { + if (forwarding_enabled_) { + for (size_t i = 0; i < listeners_.size(); i++) { + listeners_[i]->OnTestIterationStart(unit_test, iteration); + } + } +} + +void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test, + int iteration) { + if (forwarding_enabled_) { + for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { + listeners_[i]->OnTestIterationEnd(unit_test, iteration); + } + } +} + +// End TestEventRepeater + +// This class generates an XML output file. +class XmlUnitTestResultPrinter : public EmptyTestEventListener { + public: + explicit XmlUnitTestResultPrinter(const char* output_file); + + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + + private: + // Is c a whitespace character that is normalized to a space character + // when it appears in an XML attribute value? 
+ static bool IsNormalizableWhitespace(char c) { + return c == 0x9 || c == 0xA || c == 0xD; + } + + // May c appear in a well-formed XML document? + static bool IsValidXmlCharacter(char c) { + return IsNormalizableWhitespace(c) || c >= 0x20; + } + + // Returns an XML-escaped copy of the input string str. If + // is_attribute is true, the text is meant to appear as an attribute + // value, and normalizable whitespace is preserved by replacing it + // with character references. + static std::string EscapeXml(const std::string& str, bool is_attribute); + + // Returns the given string with all characters invalid in XML removed. + static std::string RemoveInvalidXmlCharacters(const std::string& str); + + // Convenience wrapper around EscapeXml when str is an attribute value. + static std::string EscapeXmlAttribute(const std::string& str) { + return EscapeXml(str, true); + } + + // Convenience wrapper around EscapeXml when str is not an attribute value. + static std::string EscapeXmlText(const char* str) { + return EscapeXml(str, false); + } + + // Verifies that the given attribute belongs to the given element and + // streams the attribute as XML. + static void OutputXmlAttribute(std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value); + + // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. + static void OutputXmlCDataSection(::std::ostream* stream, const char* data); + + // Streams an XML representation of a TestInfo object. + static void OutputXmlTestInfo(::std::ostream* stream, + const char* test_case_name, + const TestInfo& test_info); + + // Prints an XML representation of a TestCase object + static void PrintXmlTestCase(::std::ostream* stream, + const TestCase& test_case); + + // Prints an XML summary of unit_test to output stream out. 
+ static void PrintXmlUnitTest(::std::ostream* stream, + const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as space + // delimited XML attributes based on the property key="value" pairs. + // When the std::string is not empty, it includes a space at the beginning, + // to delimit this attribute from prior attributes. + static std::string TestPropertiesAsXmlAttributes(const TestResult& result); + + // The output file. + const std::string output_file_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); +}; + +// Creates a new XmlUnitTestResultPrinter. +XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.c_str() == NULL || output_file_.empty()) { + fprintf(stderr, "XML output file may not be null\n"); + fflush(stderr); + exit(EXIT_FAILURE); + } +} + +// Called after the unit test ends. +void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* xmlout = NULL; + FilePath output_file(output_file_); + FilePath output_dir(output_file.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + xmlout = posix::FOpen(output_file_.c_str(), "w"); + } + if (xmlout == NULL) { + // TODO(wan): report the reason of the failure. + // + // We don't do it for now as: + // + // 1. There is no urgent need for it. + // 2. It's a bit involved to make the errno variable thread-safe on + // all three operating systems (Linux, Windows, and Mac OS). + // 3. To interpret the meaning of errno in a thread-safe way, + // we need the strerror_r() function, which is not available on + // Windows. 
+ fprintf(stderr, + "Unable to open file \"%s\"\n", + output_file_.c_str()); + fflush(stderr); + exit(EXIT_FAILURE); + } + std::stringstream stream; + PrintXmlUnitTest(&stream, unit_test); + fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); + fclose(xmlout); +} + +// Returns an XML-escaped copy of the input string str. If is_attribute +// is true, the text is meant to appear as an attribute value, and +// normalizable whitespace is preserved by replacing it with character +// references. +// +// Invalid XML characters in str, if any, are stripped from the output. +// It is expected that most, if not all, of the text processed by this +// module will consist of ordinary English text. +// If this module is ever modified to produce version 1.1 XML output, +// most invalid characters can be retained using character references. +// TODO(wan): It might be nice to have a minimally invasive, human-readable +// escaping scheme for invalid characters, rather than dropping them. +std::string XmlUnitTestResultPrinter::EscapeXml( + const std::string& str, bool is_attribute) { + Message m; + + for (size_t i = 0; i < str.size(); ++i) { + const char ch = str[i]; + switch (ch) { + case '<': + m << "<"; + break; + case '>': + m << ">"; + break; + case '&': + m << "&"; + break; + case '\'': + if (is_attribute) + m << "'"; + else + m << '\''; + break; + case '"': + if (is_attribute) + m << """; + else + m << '"'; + break; + default: + if (IsValidXmlCharacter(ch)) { + if (is_attribute && IsNormalizableWhitespace(ch)) + m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch)) + << ";"; + else + m << ch; + } + break; + } + } + + return m.GetString(); +} + +// Returns the given string with all characters invalid in XML removed. +// Currently invalid characters are dropped from the string. An +// alternative is to replace them with certain characters such as . or ?. 
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( + const std::string& str) { + std::string output; + output.reserve(str.size()); + for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) + if (IsValidXmlCharacter(*it)) + output.push_back(*it); + + return output; +} + +// The following routines generate an XML representation of a UnitTest +// object. +// +// This is how Google Test concepts map to the DTD: +// +// <testsuites name="AllTests"> <-- corresponds to a UnitTest object +// <testsuite name="testcase-name"> <-- corresponds to a TestCase object +// <testcase name="test-name"> <-- corresponds to a TestInfo object +// <failure message="...">...</failure> +// <failure message="...">...</failure> +// <failure message="...">...</failure> +// <-- individual assertion failures +// </testcase> +// </testsuite> +// </testsuites> + +// Formats the given time in milliseconds as seconds. +std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) { + ::std::stringstream ss; + ss << ms/1000.0; + return ss.str(); +} + +// Converts the given epoch time in milliseconds to a date string in the ISO +// 8601 format, without the timezone information. +std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) { + // Using non-reentrant version as localtime_r is not portable. + time_t seconds = static_cast<time_t>(ms / 1000); +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4996) // Temporarily disables warning 4996 + // (function or variable may be unsafe). + const struct tm* const time_struct = localtime(&seconds); // NOLINT +# pragma warning(pop) // Restores the warning state again. 
+#else + const struct tm* const time_struct = localtime(&seconds); // NOLINT +#endif + if (time_struct == NULL) + return ""; // Invalid ms value + + // YYYY-MM-DDThh:mm:ss + return StreamableToString(time_struct->tm_year + 1900) + "-" + + String::FormatIntWidth2(time_struct->tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct->tm_mday) + "T" + + String::FormatIntWidth2(time_struct->tm_hour) + ":" + + String::FormatIntWidth2(time_struct->tm_min) + ":" + + String::FormatIntWidth2(time_struct->tm_sec); +} + +// Streams an XML CDATA section, escaping invalid CDATA sequences as needed. +void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, + const char* data) { + const char* segment = data; + *stream << "<![CDATA["; + for (;;) { + const char* const next_segment = strstr(segment, "]]>"); + if (next_segment != NULL) { + stream->write( + segment, static_cast<std::streamsize>(next_segment - segment)); + *stream << "]]>]]><![CDATA["; + segment = next_segment + strlen("]]>"); + } else { + *stream << segment; + break; + } + } + *stream << "]]>"; +} + +void XmlUnitTestResultPrinter::OutputXmlAttribute( + std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value) { + const std::vector<std::string>& allowed_names = + GetReservedAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Attribute " << name << " is not allowed for element <" << element_name + << ">."; + + *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\""; +} + +// Prints an XML representation of a TestInfo object. +// TODO(wan): There is also value in printing properties with the plain printer. 
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, + const char* test_case_name, + const TestInfo& test_info) { + const TestResult& result = *test_info.result(); + const std::string kTestcase = "testcase"; + + *stream << " <testcase"; + OutputXmlAttribute(stream, kTestcase, "name", test_info.name()); + + if (test_info.value_param() != NULL) { + OutputXmlAttribute(stream, kTestcase, "value_param", + test_info.value_param()); + } + if (test_info.type_param() != NULL) { + OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param()); + } + + OutputXmlAttribute(stream, kTestcase, "status", + test_info.should_run() ? "run" : "notrun"); + OutputXmlAttribute(stream, kTestcase, "time", + FormatTimeInMillisAsSeconds(result.elapsed_time())); + OutputXmlAttribute(stream, kTestcase, "classname", test_case_name); + *stream << TestPropertiesAsXmlAttributes(result); + + int failures = 0; + for (int i = 0; i < result.total_part_count(); ++i) { + const TestPartResult& part = result.GetTestPartResult(i); + if (part.failed()) { + if (++failures == 1) { + *stream << ">\n"; + } + const string location = internal::FormatCompilerIndependentFileLocation( + part.file_name(), part.line_number()); + const string summary = location + "\n" + part.summary(); + *stream << " <failure message=\"" + << EscapeXmlAttribute(summary.c_str()) + << "\" type=\"\">"; + const string detail = location + "\n" + part.message(); + OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); + *stream << "</failure>\n"; + } + } + + if (failures == 0) + *stream << " />\n"; + else + *stream << " </testcase>\n"; +} + +// Prints an XML representation of a TestCase object +void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream, + const TestCase& test_case) { + const std::string kTestsuite = "testsuite"; + *stream << " <" << kTestsuite; + OutputXmlAttribute(stream, kTestsuite, "name", test_case.name()); + OutputXmlAttribute(stream, kTestsuite, "tests", + 
StreamableToString(test_case.reportable_test_count())); + OutputXmlAttribute(stream, kTestsuite, "failures", + StreamableToString(test_case.failed_test_count())); + OutputXmlAttribute( + stream, kTestsuite, "disabled", + StreamableToString(test_case.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuite, "errors", "0"); + OutputXmlAttribute(stream, kTestsuite, "time", + FormatTimeInMillisAsSeconds(test_case.elapsed_time())); + *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result()) + << ">\n"; + + for (int i = 0; i < test_case.total_test_count(); ++i) { + if (test_case.GetTestInfo(i)->is_reportable()) + OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i)); + } + *stream << " </" << kTestsuite << ">\n"; +} + +// Prints an XML summary of unit_test to output stream out. +void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, + const UnitTest& unit_test) { + const std::string kTestsuites = "testsuites"; + + *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"; + *stream << "<" << kTestsuites; + + OutputXmlAttribute(stream, kTestsuites, "tests", + StreamableToString(unit_test.reportable_test_count())); + OutputXmlAttribute(stream, kTestsuites, "failures", + StreamableToString(unit_test.failed_test_count())); + OutputXmlAttribute( + stream, kTestsuites, "disabled", + StreamableToString(unit_test.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuites, "errors", "0"); + OutputXmlAttribute( + stream, kTestsuites, "timestamp", + FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp())); + OutputXmlAttribute(stream, kTestsuites, "time", + FormatTimeInMillisAsSeconds(unit_test.elapsed_time())); + + if (GTEST_FLAG(shuffle)) { + OutputXmlAttribute(stream, kTestsuites, "random_seed", + StreamableToString(unit_test.random_seed())); + } + + *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result()); + + OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); + 
*stream << ">\n"; + + for (int i = 0; i < unit_test.total_test_case_count(); ++i) { + if (unit_test.GetTestCase(i)->reportable_test_count() > 0) + PrintXmlTestCase(stream, *unit_test.GetTestCase(i)); + } + *stream << "</" << kTestsuites << ">\n"; +} + +// Produces a string representing the test properties in a result as space +// delimited XML attributes based on the property key="value" pairs. +std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( + const TestResult& result) { + Message attributes; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + attributes << " " << property.key() << "=" + << "\"" << EscapeXmlAttribute(property.value()) << "\""; + } + return attributes.GetString(); +} + +// End XmlUnitTestResultPrinter + +#if GTEST_CAN_STREAM_RESULTS_ + +// Checks if str contains '=', '&', '%' or '\n' characters. If yes, +// replaces them by "%xx" where xx is their hexadecimal value. For +// example, replaces "=" with "%3D". This algorithm is O(strlen(str)) +// in both time and space -- important as the input str may contain an +// arbitrarily long test failure message and stack trace. +string StreamingListener::UrlEncode(const char* str) { + string result; + result.reserve(strlen(str) + 1); + for (char ch = *str; ch != '\0'; ch = *++str) { + switch (ch) { + case '%': + case '=': + case '&': + case '\n': + result.append("%" + String::FormatByte(static_cast<unsigned char>(ch))); + break; + default: + result.push_back(ch); + break; + } + } + return result; +} + +void StreamingListener::SocketWriter::MakeConnection() { + GTEST_CHECK_(sockfd_ == -1) + << "MakeConnection() can't be called when there is already a connection."; + + addrinfo hints; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. 
+ hints.ai_socktype = SOCK_STREAM; + addrinfo* servinfo = NULL; + + // Use the getaddrinfo() to get a linked list of IP addresses for + // the given host name. + const int error_num = getaddrinfo( + host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); + if (error_num != 0) { + GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " + << gai_strerror(error_num); + } + + // Loop through all the results and connect to the first we can. + for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL; + cur_addr = cur_addr->ai_next) { + sockfd_ = socket( + cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); + if (sockfd_ != -1) { + // Connect the client socket to the server socket. + if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { + close(sockfd_); + sockfd_ = -1; + } + } + } + + freeaddrinfo(servinfo); // all done with this structure + + if (sockfd_ == -1) { + GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " + << host_name_ << ":" << port_num_; + } +} + +// End of class Streaming Listener +#endif // GTEST_CAN_STREAM_RESULTS__ + +// Class ScopedTrace + +// Pushes the given source file location and message onto a per-thread +// trace stack maintained by Google Test. +ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) + GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + TraceInfo trace; + trace.file = file; + trace.line = line; + trace.message = message.GetString(); + + UnitTest::GetInstance()->PushGTestTrace(trace); +} + +// Pops the info pushed by the c'tor. +ScopedTrace::~ScopedTrace() + GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + UnitTest::GetInstance()->PopGTestTrace(); +} + + +// class OsStackTraceGetter + +// Returns the current OS stack trace as an std::string. Parameters: +// +// max_depth - the maximum number of stack frames to be included +// in the trace. +// skip_count - the number of top frames to be skipped; doesn't count +// against max_depth. 
+// +string OsStackTraceGetter::CurrentStackTrace(int /* max_depth */, + int /* skip_count */) + GTEST_LOCK_EXCLUDED_(mutex_) { + return ""; +} + +void OsStackTraceGetter::UponLeavingGTest() + GTEST_LOCK_EXCLUDED_(mutex_) { +} + +const char* const +OsStackTraceGetter::kElidedFramesMarker = + "... " GTEST_NAME_ " internal frames ..."; + +// A helper class that creates the premature-exit file in its +// constructor and deletes the file in its destructor. +class ScopedPrematureExitFile { + public: + explicit ScopedPrematureExitFile(const char* premature_exit_filepath) + : premature_exit_filepath_(premature_exit_filepath) { + // If a path to the premature-exit file is specified... + if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') { + // create the file with a single "0" character in it. I/O + // errors are ignored as there's nothing better we can do and we + // don't want to fail the test because of this. + FILE* pfile = posix::FOpen(premature_exit_filepath, "w"); + fwrite("0", 1, 1, pfile); + fclose(pfile); + } + } + + ~ScopedPrematureExitFile() { + if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') { + remove(premature_exit_filepath_); + } + } + + private: + const char* const premature_exit_filepath_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); +}; + +} // namespace internal + +// class TestEventListeners + +TestEventListeners::TestEventListeners() + : repeater_(new internal::TestEventRepeater()), + default_result_printer_(NULL), + default_xml_generator_(NULL) { +} + +TestEventListeners::~TestEventListeners() { delete repeater_; } + +// Returns the standard listener responsible for the default console +// output. Can be removed from the listeners list to shut down default +// console output. Note that removing this object from the listener list +// with Release transfers its ownership to the user. 
+void TestEventListeners::Append(TestEventListener* listener) { + repeater_->Append(listener); +} + +// Removes the given event listener from the list and returns it. It then +// becomes the caller's responsibility to delete the listener. Returns +// NULL if the listener is not found in the list. +TestEventListener* TestEventListeners::Release(TestEventListener* listener) { + if (listener == default_result_printer_) + default_result_printer_ = NULL; + else if (listener == default_xml_generator_) + default_xml_generator_ = NULL; + return repeater_->Release(listener); +} + +// Returns repeater that broadcasts the TestEventListener events to all +// subscribers. +TestEventListener* TestEventListeners::repeater() { return repeater_; } + +// Sets the default_result_printer attribute to the provided listener. +// The listener is also added to the listener list and previous +// default_result_printer is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { + if (default_result_printer_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_result_printer_); + default_result_printer_ = listener; + if (listener != NULL) + Append(listener); + } +} + +// Sets the default_xml_generator attribute to the provided listener. The +// listener is also added to the listener list and previous +// default_xml_generator is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. 
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { + if (default_xml_generator_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_xml_generator_); + default_xml_generator_ = listener; + if (listener != NULL) + Append(listener); + } +} + +// Controls whether events will be forwarded by the repeater to the +// listeners in the list. +bool TestEventListeners::EventForwardingEnabled() const { + return repeater_->forwarding_enabled(); +} + +void TestEventListeners::SuppressEventForwarding() { + repeater_->set_forwarding_enabled(false); +} + +// class UnitTest + +// Gets the singleton UnitTest object. The first time this method is +// called, a UnitTest object is constructed and returned. Consecutive +// calls will return the same object. +// +// We don't protect this under mutex_ as a user is not supposed to +// call this before main() starts, from which point on the return +// value will never change. +UnitTest* UnitTest::GetInstance() { + // When compiled with MSVC 7.1 in optimized mode, destroying the + // UnitTest object upon exiting the program messes up the exit code, + // causing successful tests to appear failed. We have to use a + // different implementation in this case to bypass the compiler bug. + // This implementation makes the compiler happy, at the cost of + // leaking the UnitTest object. + + // CodeGear C++Builder insists on a public destructor for the + // default implementation. Use this implementation to keep good OO + // design with private destructor. + +#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) + static UnitTest* const instance = new UnitTest; + return instance; +#else + static UnitTest instance; + return &instance; +#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) +} + +// Gets the number of successful test cases. 
+int UnitTest::successful_test_case_count() const { + return impl()->successful_test_case_count(); +} + +// Gets the number of failed test cases. +int UnitTest::failed_test_case_count() const { + return impl()->failed_test_case_count(); +} + +// Gets the number of all test cases. +int UnitTest::total_test_case_count() const { + return impl()->total_test_case_count(); +} + +// Gets the number of all test cases that contain at least one test +// that should run. +int UnitTest::test_case_to_run_count() const { + return impl()->test_case_to_run_count(); +} + +// Gets the number of successful tests. +int UnitTest::successful_test_count() const { + return impl()->successful_test_count(); +} + +// Gets the number of failed tests. +int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTest::reportable_disabled_test_count() const { + return impl()->reportable_disabled_test_count(); +} + +// Gets the number of disabled tests. +int UnitTest::disabled_test_count() const { + return impl()->disabled_test_count(); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTest::reportable_test_count() const { + return impl()->reportable_test_count(); +} + +// Gets the number of all tests. +int UnitTest::total_test_count() const { return impl()->total_test_count(); } + +// Gets the number of tests that should run. +int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } + +// Gets the time of the test program start, in ms from the start of the +// UNIX epoch. +internal::TimeInMillis UnitTest::start_timestamp() const { + return impl()->start_timestamp(); +} + +// Gets the elapsed time, in milliseconds. +internal::TimeInMillis UnitTest::elapsed_time() const { + return impl()->elapsed_time(); +} + +// Returns true iff the unit test passed (i.e. all test cases passed). 
+bool UnitTest::Passed() const { return impl()->Passed(); } + +// Returns true iff the unit test failed (i.e. some test case failed +// or something outside of all tests failed). +bool UnitTest::Failed() const { return impl()->Failed(); } + +// Gets the i-th test case among all the test cases. i can range from 0 to +// total_test_case_count() - 1. If i is not in that range, returns NULL. +const TestCase* UnitTest::GetTestCase(int i) const { + return impl()->GetTestCase(i); +} + +// Returns the TestResult containing information on test failures and +// properties logged outside of individual test cases. +const TestResult& UnitTest::ad_hoc_test_result() const { + return *impl()->ad_hoc_test_result(); +} + +// Gets the i-th test case among all the test cases. i can range from 0 to +// total_test_case_count() - 1. If i is not in that range, returns NULL. +TestCase* UnitTest::GetMutableTestCase(int i) { + return impl()->GetMutableTestCase(i); +} + +// Returns the list of event listeners that can be used to track events +// inside Google Test. +TestEventListeners& UnitTest::listeners() { + return *impl()->listeners(); +} + +// Registers and returns a global test environment. When a test +// program is run, all global test environments will be set-up in the +// order they were registered. After all tests in the program have +// finished, all global test environments will be torn-down in the +// *reverse* order they were registered. +// +// The UnitTest object takes ownership of the given environment. +// +// We don't protect this under mutex_, as we only support calling it +// from the main thread. +Environment* UnitTest::AddEnvironment(Environment* env) { + if (env == NULL) { + return NULL; + } + + impl_->environments().push_back(env); + return env; +} + +// Adds a TestPartResult to the current TestResult object. All Google Test +// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call +// this to report their results. 
The user code should use the +// assertion macros instead of calling this directly. +void UnitTest::AddTestPartResult( + TestPartResult::Type result_type, + const char* file_name, + int line_number, + const std::string& message, + const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) { + Message msg; + msg << message; + + internal::MutexLock lock(&mutex_); + if (impl_->gtest_trace_stack().size() > 0) { + msg << "\n" << GTEST_NAME_ << " trace:"; + + for (int i = static_cast<int>(impl_->gtest_trace_stack().size()); + i > 0; --i) { + const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; + msg << "\n" << internal::FormatFileLocation(trace.file, trace.line) + << " " << trace.message; + } + } + + if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) { + msg << internal::kStackTraceMarker << os_stack_trace; + } + + const TestPartResult result = + TestPartResult(result_type, file_name, line_number, + msg.GetString().c_str()); + impl_->GetTestPartResultReporterForCurrentThread()-> + ReportTestPartResult(result); + + if (result_type != TestPartResult::kSuccess) { + // gtest_break_on_failure takes precedence over + // gtest_throw_on_failure. This allows a user to set the latter + // in the code (perhaps in order to use Google Test assertions + // with another testing framework) and specify the former on the + // command line for debugging. + if (GTEST_FLAG(break_on_failure)) { +#if GTEST_OS_WINDOWS + // Using DebugBreak on Windows allows gtest to still break into a debugger + // when a failure happens and both the --gtest_break_on_failure and + // the --gtest_catch_exceptions flags are specified. + DebugBreak(); +#else + // Dereference NULL through a volatile pointer to prevent the compiler + // from removing. We use this rather than abort() or __builtin_trap() for + // portability: Symbian doesn't implement abort() well, and some debuggers + // don't correctly trap abort(). 
+ *static_cast<volatile int*>(NULL) = 1; +#endif // GTEST_OS_WINDOWS + } else if (GTEST_FLAG(throw_on_failure)) { +#if GTEST_HAS_EXCEPTIONS + throw internal::GoogleTestFailureException(result); +#else + // We cannot call abort() as it generates a pop-up in debug mode + // that cannot be suppressed in VC 7.1 or below. + exit(1); +#endif + } + } +} + +// Adds a TestProperty to the current TestResult object when invoked from +// inside a test, to current TestCase's ad_hoc_test_result_ when invoked +// from SetUpTestCase or TearDownTestCase, or to the global property set +// when invoked elsewhere. If the result already contains a property with +// the same key, the value will be updated. +void UnitTest::RecordProperty(const std::string& key, + const std::string& value) { + impl_->RecordProperty(TestProperty(key, value)); +} + +// Runs all tests in this UnitTest object and prints the result. +// Returns 0 if successful, or 1 otherwise. +// +// We don't protect this under mutex_, as we only support calling it +// from the main thread. +int UnitTest::Run() { + const bool in_death_test_child_process = + internal::GTEST_FLAG(internal_run_death_test).length() > 0; + + // Google Test implements this protocol for catching that a test + // program exits before returning control to Google Test: + // + // 1. Upon start, Google Test creates a file whose absolute path + // is specified by the environment variable + // TEST_PREMATURE_EXIT_FILE. + // 2. When Google Test has finished its work, it deletes the file. + // + // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before + // running a Google-Test-based test program and check the existence + // of the file at the end of the test execution to see if it has + // exited prematurely. + + // If we are in the child process of a death test, don't + // create/delete the premature exit file, as doing so is unnecessary + // and will confuse the parent process. 
Otherwise, create/delete + // the file upon entering/leaving this function. If the program + // somehow exits before this function has a chance to return, the + // premature-exit file will be left undeleted, causing a test runner + // that understands the premature-exit-file protocol to report the + // test as having failed. + const internal::ScopedPrematureExitFile premature_exit_file( + in_death_test_child_process ? + NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); + + // Captures the value of GTEST_FLAG(catch_exceptions). This value will be + // used for the duration of the program. + impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); + +#if GTEST_HAS_SEH + // Either the user wants Google Test to catch exceptions thrown by the + // tests or this is executing in the context of death test child + // process. In either case the user does not want to see pop-up dialogs + // about crashes - they are expected. + if (impl()->catch_exceptions() || in_death_test_child_process) { +# if !GTEST_OS_WINDOWS_MOBILE + // SetErrorMode doesn't exist on CE. + SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | + SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); +# endif // !GTEST_OS_WINDOWS_MOBILE + +# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE + // Death test children can be terminated with _abort(). On Windows, + // _abort() can show a dialog with a warning message. This forces the + // abort message to go to stderr instead. + _set_error_mode(_OUT_TO_STDERR); +# endif + +# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE + // In the debug version, Visual Studio pops up a separate dialog + // offering a choice to debug the aborted program. We need to suppress + // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement + // executed. Google Test will notify the user of any unexpected + // failure via stderr. + // + // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. 
+ // Users of prior VC versions shall suffer the agony and pain of + // clicking through the countless debug dialogs. + // TODO(vladl@google.com): find a way to suppress the abort dialog() in the + // debug mode when compiled with VC 7.1 or lower. + if (!GTEST_FLAG(break_on_failure)) + _set_abort_behavior( + 0x0, // Clear the following flags: + _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. +# endif + } +#endif // GTEST_HAS_SEH + + return internal::HandleExceptionsInMethodIfSupported( + impl(), + &internal::UnitTestImpl::RunAllTests, + "auxiliary test code (environments or event listeners)") ? 0 : 1; +} + +// Returns the working directory when the first TEST() or TEST_F() was +// executed. +const char* UnitTest::original_working_dir() const { + return impl_->original_working_dir_.c_str(); +} + +// Returns the TestCase object for the test that's currently running, +// or NULL if no test is running. +const TestCase* UnitTest::current_test_case() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_case(); +} + +// Returns the TestInfo object for the test that's currently running, +// or NULL if no test is running. +const TestInfo* UnitTest::current_test_info() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_info(); +} + +// Returns the random seed used at the start of the current test run. +int UnitTest::random_seed() const { return impl_->random_seed(); } + +#if GTEST_HAS_PARAM_TEST +// Returns ParameterizedTestCaseRegistry object used to keep track of +// value-parameterized tests and instantiate and register them. +internal::ParameterizedTestCaseRegistry& + UnitTest::parameterized_test_registry() + GTEST_LOCK_EXCLUDED_(mutex_) { + return impl_->parameterized_test_registry(); +} +#endif // GTEST_HAS_PARAM_TEST + +// Creates an empty UnitTest. 
+UnitTest::UnitTest() { + impl_ = new internal::UnitTestImpl(this); +} + +// Destructor of UnitTest. +UnitTest::~UnitTest() { + delete impl_; +} + +// Pushes a trace defined by SCOPED_TRACE() on to the per-thread +// Google Test trace stack. +void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().push_back(trace); +} + +// Pops a trace from the per-thread Google Test trace stack. +void UnitTest::PopGTestTrace() + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().pop_back(); +} + +namespace internal { + +UnitTestImpl::UnitTestImpl(UnitTest* parent) + : parent_(parent), +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4355) // Temporarily disables warning 4355 + // (using this in initializer). + default_global_test_part_result_reporter_(this), + default_per_thread_test_part_result_reporter_(this), +# pragma warning(pop) // Restores the warning state again. +#else + default_global_test_part_result_reporter_(this), + default_per_thread_test_part_result_reporter_(this), +#endif // _MSC_VER + global_test_part_result_repoter_( + &default_global_test_part_result_reporter_), + per_thread_test_part_result_reporter_( + &default_per_thread_test_part_result_reporter_), +#if GTEST_HAS_PARAM_TEST + parameterized_test_registry_(), + parameterized_tests_registered_(false), +#endif // GTEST_HAS_PARAM_TEST + last_death_test_case_(-1), + current_test_case_(NULL), + current_test_info_(NULL), + ad_hoc_test_result_(), + os_stack_trace_getter_(NULL), + post_flag_parse_init_performed_(false), + random_seed_(0), // Will be overridden by the flag before first use. + random_(0), // Will be reseeded before first use. 
+ start_timestamp_(0), + elapsed_time_(0), +#if GTEST_HAS_DEATH_TEST + death_test_factory_(new DefaultDeathTestFactory), +#endif + // Will be overridden by the flag before first use. + catch_exceptions_(false) { + listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter); +} + +UnitTestImpl::~UnitTestImpl() { + // Deletes every TestCase. + ForEach(test_cases_, internal::Delete<TestCase>); + + // Deletes every Environment. + ForEach(environments_, internal::Delete<Environment>); + + delete os_stack_trace_getter_; +} + +// Adds a TestProperty to the current TestResult object when invoked in a +// context of a test, to current test case's ad_hoc_test_result when invoke +// from SetUpTestCase/TearDownTestCase, or to the global property set +// otherwise. If the result already contains a property with the same key, +// the value will be updated. +void UnitTestImpl::RecordProperty(const TestProperty& test_property) { + std::string xml_element; + TestResult* test_result; // TestResult appropriate for property recording. + + if (current_test_info_ != NULL) { + xml_element = "testcase"; + test_result = &(current_test_info_->result_); + } else if (current_test_case_ != NULL) { + xml_element = "testsuite"; + test_result = &(current_test_case_->ad_hoc_test_result_); + } else { + xml_element = "testsuites"; + test_result = &ad_hoc_test_result_; + } + test_result->RecordProperty(xml_element, test_property); +} + +#if GTEST_HAS_DEATH_TEST +// Disables event forwarding if the control is currently in a death test +// subprocess. Must not be called before InitGoogleTest. +void UnitTestImpl::SuppressTestEventsIfInSubprocess() { + if (internal_run_death_test_flag_.get() != NULL) + listeners()->SuppressEventForwarding(); +} +#endif // GTEST_HAS_DEATH_TEST + +// Initializes event listeners performing XML output as specified by +// UnitTestOptions. Must not be called before InitGoogleTest. 
+void UnitTestImpl::ConfigureXmlOutput() { + const std::string& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml") { + listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); + } else if (output_format != "") { + printf("WARNING: unrecognized output format \"%s\" ignored.\n", + output_format.c_str()); + fflush(stdout); + } +} + +#if GTEST_CAN_STREAM_RESULTS_ +// Initializes event listeners for streaming test results in string form. +// Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureStreamingOutput() { + const std::string& target = GTEST_FLAG(stream_result_to); + if (!target.empty()) { + const size_t pos = target.find(':'); + if (pos != std::string::npos) { + listeners()->Append(new StreamingListener(target.substr(0, pos), + target.substr(pos+1))); + } else { + printf("WARNING: unrecognized streaming target \"%s\" ignored.\n", + target.c_str()); + fflush(stdout); + } + } +} +#endif // GTEST_CAN_STREAM_RESULTS_ + +// Performs initialization dependent upon flag values obtained in +// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to +// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest +// this function is also called from RunAllTests. Since this function can be +// called more than once, it has to be idempotent. +void UnitTestImpl::PostFlagParsingInit() { + // Ensures that this function does not execute more than once. + if (!post_flag_parse_init_performed_) { + post_flag_parse_init_performed_ = true; + +#if GTEST_HAS_DEATH_TEST + InitDeathTestSubprocessControlInfo(); + SuppressTestEventsIfInSubprocess(); +#endif // GTEST_HAS_DEATH_TEST + + // Registers parameterized tests. This makes parameterized tests + // available to the UnitTest reflection API without running + // RUN_ALL_TESTS. + RegisterParameterizedTests(); + + // Configures listeners for XML output. 
This makes it possible for users + // to shut down the default XML output before invoking RUN_ALL_TESTS. + ConfigureXmlOutput(); + +#if GTEST_CAN_STREAM_RESULTS_ + // Configures listeners for streaming test results to the specified server. + ConfigureStreamingOutput(); +#endif // GTEST_CAN_STREAM_RESULTS_ + } +} + +// A predicate that checks the name of a TestCase against a known +// value. +// +// This is used for implementation of the UnitTest class only. We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestCaseNameIs is copyable. +class TestCaseNameIs { + public: + // Constructor. + explicit TestCaseNameIs(const std::string& name) + : name_(name) {} + + // Returns true iff the name of test_case matches name_. + bool operator()(const TestCase* test_case) const { + return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0; + } + + private: + std::string name_; +}; + +// Finds and returns a TestCase with the given name. If one doesn't +// exist, creates one and returns it. It's the CALLER'S +// RESPONSIBILITY to ensure that this function is only called WHEN THE +// TESTS ARE NOT SHUFFLED. +// +// Arguments: +// +// test_case_name: name of the test case +// type_param: the name of the test case's type parameter, or NULL if +// this is not a typed or a type-parameterized test case. +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +TestCase* UnitTestImpl::GetTestCase(const char* test_case_name, + const char* type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc) { + // Can we find a TestCase with the given name? + const std::vector<TestCase*>::const_iterator test_case = + std::find_if(test_cases_.begin(), test_cases_.end(), + TestCaseNameIs(test_case_name)); + + if (test_case != test_cases_.end()) + return *test_case; + + // No. Let's create one. 
+ TestCase* const new_test_case = + new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc); + + // Is this a death test case? + if (internal::UnitTestOptions::MatchesFilter(test_case_name, + kDeathTestCaseFilter)) { + // Yes. Inserts the test case after the last death test case + // defined so far. This only works when the test cases haven't + // been shuffled. Otherwise we may end up running a death test + // after a non-death test. + ++last_death_test_case_; + test_cases_.insert(test_cases_.begin() + last_death_test_case_, + new_test_case); + } else { + // No. Appends to the end of the list. + test_cases_.push_back(new_test_case); + } + + test_case_indices_.push_back(static_cast<int>(test_case_indices_.size())); + return new_test_case; +} + +// Helpers for setting up / tearing down the given environment. They +// are for use in the ForEach() function. +static void SetUpEnvironment(Environment* env) { env->SetUp(); } +static void TearDownEnvironment(Environment* env) { env->TearDown(); } + +// Runs all tests in this UnitTest object, prints the result, and +// returns true if all tests are successful. If any exception is +// thrown during a test, the test is considered to be failed, but the +// rest of the tests will still be run. +// +// When parameterized tests are enabled, it expands and registers +// parameterized tests first in RegisterParameterizedTests(). +// All other functions called from RunAllTests() may safely assume that +// parameterized tests are ready to be counted and run. +bool UnitTestImpl::RunAllTests() { + // Makes sure InitGoogleTest() was called. + if (!GTestIsInitialized()) { + printf("%s", + "\nThis test program did NOT call ::testing::InitGoogleTest " + "before calling RUN_ALL_TESTS(). Please fix it.\n"); + return false; + } + + // Do not run any test if the --help flag was specified. 
+ if (g_help_flag) + return true; + + // Repeats the call to the post-flag parsing initialization in case the + // user didn't call InitGoogleTest. + PostFlagParsingInit(); + + // Even if sharding is not on, test runners may want to use the + // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding + // protocol. + internal::WriteToShardStatusFileIfNeeded(); + + // True iff we are in a subprocess for running a thread-safe-style + // death test. + bool in_subprocess_for_death_test = false; + +#if GTEST_HAS_DEATH_TEST + in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL); +#endif // GTEST_HAS_DEATH_TEST + + const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, + in_subprocess_for_death_test); + + // Compares the full test names with the filter to decide which + // tests to run. + const bool has_tests_to_run = FilterTests(should_shard + ? HONOR_SHARDING_PROTOCOL + : IGNORE_SHARDING_PROTOCOL) > 0; + + // Lists the tests and exits if the --gtest_list_tests flag was specified. + if (GTEST_FLAG(list_tests)) { + // This must be called *after* FilterTests() has been called. + ListTestsMatchingFilter(); + return true; + } + + random_seed_ = GTEST_FLAG(shuffle) ? + GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; + + // True iff at least one test has failed. + bool failed = false; + + TestEventListener* repeater = listeners()->repeater(); + + start_timestamp_ = GetTimeInMillis(); + repeater->OnTestProgramStart(*parent_); + + // How many times to repeat the tests? We don't want to repeat them + // when we are inside the subprocess of a death test. + const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); + // Repeats forever if the repeat count is negative. + const bool forever = repeat < 0; + for (int i = 0; forever || i != repeat; i++) { + // We want to preserve failures generated by ad-hoc test + // assertions executed before RUN_ALL_TESTS(). 
+ ClearNonAdHocTestResult(); + + const TimeInMillis start = GetTimeInMillis(); + + // Shuffles test cases and tests if requested. + if (has_tests_to_run && GTEST_FLAG(shuffle)) { + random()->Reseed(random_seed_); + // This should be done before calling OnTestIterationStart(), + // such that a test event listener can see the actual test order + // in the event. + ShuffleTests(); + } + + // Tells the unit test event listeners that the tests are about to start. + repeater->OnTestIterationStart(*parent_, i); + + // Runs each test case if there is at least one test to run. + if (has_tests_to_run) { + // Sets up all environments beforehand. + repeater->OnEnvironmentsSetUpStart(*parent_); + ForEach(environments_, SetUpEnvironment); + repeater->OnEnvironmentsSetUpEnd(*parent_); + + // Runs the tests only if there was no fatal failure during global + // set-up. + if (!Test::HasFatalFailure()) { + for (int test_index = 0; test_index < total_test_case_count(); + test_index++) { + GetMutableTestCase(test_index)->Run(); + } + } + + // Tears down all environments in reverse order afterwards. + repeater->OnEnvironmentsTearDownStart(*parent_); + std::for_each(environments_.rbegin(), environments_.rend(), + TearDownEnvironment); + repeater->OnEnvironmentsTearDownEnd(*parent_); + } + + elapsed_time_ = GetTimeInMillis() - start; + + // Tells the unit test event listener that the tests have just finished. + repeater->OnTestIterationEnd(*parent_, i); + + // Gets the result and clears it. + if (!Passed()) { + failed = true; + } + + // Restores the original test order after the iteration. This + // allows the user to quickly repro a failure that happens in the + // N-th iteration without repeating the first (N - 1) iterations. + // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in + // case the user somehow changes the value of the flag somewhere + // (it's always safe to unshuffle the tests). 
+ UnshuffleTests(); + + if (GTEST_FLAG(shuffle)) { + // Picks a new random seed for each iteration. + random_seed_ = GetNextRandomSeed(random_seed_); + } + } + + repeater->OnTestProgramEnd(*parent_); + + return !failed; +} + +// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file +// if the variable is present. If a file already exists at this location, this +// function will write over it. If the variable is present, but the file cannot +// be created, prints an error and exits. +void WriteToShardStatusFileIfNeeded() { + const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); + if (test_shard_file != NULL) { + FILE* const file = posix::FOpen(test_shard_file, "w"); + if (file == NULL) { + ColoredPrintf(COLOR_RED, + "Could not write to the test shard status file \"%s\" " + "specified by the %s environment variable.\n", + test_shard_file, kTestShardStatusFile); + fflush(stdout); + exit(EXIT_FAILURE); + } + fclose(file); + } +} + +// Checks whether sharding is enabled by examining the relevant +// environment variable values. If the variables are present, +// but inconsistent (i.e., shard_index >= total_shards), prints +// an error and exits. If in_subprocess_for_death_test, sharding is +// disabled because it must only be applied to the original test +// process. Otherwise, we could filter out death tests we intended to execute. 
+bool ShouldShard(const char* total_shards_env, + const char* shard_index_env, + bool in_subprocess_for_death_test) { + if (in_subprocess_for_death_test) { + return false; + } + + const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1); + const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1); + + if (total_shards == -1 && shard_index == -1) { + return false; + } else if (total_shards == -1 && shard_index != -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestShardIndex << " = " << shard_index + << ", but have left " << kTestTotalShards << " unset.\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (total_shards != -1 && shard_index == -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestTotalShards << " = " << total_shards + << ", but have left " << kTestShardIndex << " unset.\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (shard_index < 0 || shard_index >= total_shards) { + const Message msg = Message() + << "Invalid environment variables: we require 0 <= " + << kTestShardIndex << " < " << kTestTotalShards + << ", but you have " << kTestShardIndex << "=" << shard_index + << ", " << kTestTotalShards << "=" << total_shards << ".\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } + + return total_shards > 1; +} + +// Parses the environment variable var as an Int32. If it is unset, +// returns default_val. If it is not an Int32, prints an error +// and aborts. 
+Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { + const char* str_val = posix::GetEnv(var); + if (str_val == NULL) { + return default_val; + } + + Int32 result; + if (!ParseInt32(Message() << "The value of environment variable " << var, + str_val, &result)) { + exit(EXIT_FAILURE); + } + return result; +} + +// Given the total number of shards, the shard index, and the test id, +// returns true iff the test should be run on this shard. The test id is +// some arbitrary but unique non-negative integer assigned to each test +// method. Assumes that 0 <= shard_index < total_shards. +bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { + return (test_id % total_shards) == shard_index; +} + +// Compares the name of each test with the user-specified filter to +// decide whether the test should be run, then records the result in +// each TestCase and TestInfo object. +// If shard_tests == true, further filters tests based on sharding +// variables in the environment - see +// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. +// Returns the number of tests that should run. +int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { + const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? + Int32FromEnvOrDie(kTestTotalShards, -1) : -1; + const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? + Int32FromEnvOrDie(kTestShardIndex, -1) : -1; + + // num_runnable_tests are the number of tests that will + // run across all shards (i.e., match filter and are not disabled). + // num_selected_tests are the number of tests to be run on + // this shard. 
+ int num_runnable_tests = 0; + int num_selected_tests = 0; + for (size_t i = 0; i < test_cases_.size(); i++) { + TestCase* const test_case = test_cases_[i]; + const std::string &test_case_name = test_case->name(); + test_case->set_should_run(false); + + for (size_t j = 0; j < test_case->test_info_list().size(); j++) { + TestInfo* const test_info = test_case->test_info_list()[j]; + const std::string test_name(test_info->name()); + // A test is disabled if test case name or test name matches + // kDisableTestFilter. + const bool is_disabled = + internal::UnitTestOptions::MatchesFilter(test_case_name, + kDisableTestFilter) || + internal::UnitTestOptions::MatchesFilter(test_name, + kDisableTestFilter); + test_info->is_disabled_ = is_disabled; + + const bool matches_filter = + internal::UnitTestOptions::FilterMatchesTest(test_case_name, + test_name); + test_info->matches_filter_ = matches_filter; + + const bool is_runnable = + (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && + matches_filter; + + const bool is_selected = is_runnable && + (shard_tests == IGNORE_SHARDING_PROTOCOL || + ShouldRunTestOnShard(total_shards, shard_index, + num_runnable_tests)); + + num_runnable_tests += is_runnable; + num_selected_tests += is_selected; + + test_info->should_run_ = is_selected; + test_case->set_should_run(test_case->should_run() || is_selected); + } + } + return num_selected_tests; +} + +// Prints the given C-string on a single line by replacing all '\n' +// characters with string "\\n". If the output takes more than +// max_length characters, only prints the first max_length characters +// and "...". +static void PrintOnOneLine(const char* str, int max_length) { + if (str != NULL) { + for (int i = 0; *str != '\0'; ++str) { + if (i >= max_length) { + printf("..."); + break; + } + if (*str == '\n') { + printf("\\n"); + i += 2; + } else { + printf("%c", *str); + ++i; + } + } + } +} + +// Prints the names of the tests matching the user-specified filter flag. 
+void UnitTestImpl::ListTestsMatchingFilter() { + // Print at most this many characters for each type/value parameter. + const int kMaxParamLength = 250; + + for (size_t i = 0; i < test_cases_.size(); i++) { + const TestCase* const test_case = test_cases_[i]; + bool printed_test_case_name = false; + + for (size_t j = 0; j < test_case->test_info_list().size(); j++) { + const TestInfo* const test_info = + test_case->test_info_list()[j]; + if (test_info->matches_filter_) { + if (!printed_test_case_name) { + printed_test_case_name = true; + printf("%s.", test_case->name()); + if (test_case->type_param() != NULL) { + printf(" # %s = ", kTypeParamLabel); + // We print the type parameter on a single line to make + // the output easy to parse by a program. + PrintOnOneLine(test_case->type_param(), kMaxParamLength); + } + printf("\n"); + } + printf(" %s", test_info->name()); + if (test_info->value_param() != NULL) { + printf(" # %s = ", kValueParamLabel); + // We print the value parameter on a single line to make the + // output easy to parse by a program. + PrintOnOneLine(test_info->value_param(), kMaxParamLength); + } + printf("\n"); + } + } + } + fflush(stdout); +} + +// Sets the OS stack trace getter. +// +// Does nothing if the input and the current OS stack trace getter are +// the same; otherwise, deletes the old getter and makes the input the +// current getter. +void UnitTestImpl::set_os_stack_trace_getter( + OsStackTraceGetterInterface* getter) { + if (os_stack_trace_getter_ != getter) { + delete os_stack_trace_getter_; + os_stack_trace_getter_ = getter; + } +} + +// Returns the current OS stack trace getter if it is not NULL; +// otherwise, creates an OsStackTraceGetter, makes it the current +// getter, and returns it. 
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { + if (os_stack_trace_getter_ == NULL) { + os_stack_trace_getter_ = new OsStackTraceGetter; + } + + return os_stack_trace_getter_; +} + +// Returns the TestResult for the test that's currently running, or +// the TestResult for the ad hoc test if no test is running. +TestResult* UnitTestImpl::current_test_result() { + return current_test_info_ ? + &(current_test_info_->result_) : &ad_hoc_test_result_; +} + +// Shuffles all test cases, and the tests within each test case, +// making sure that death tests are still run first. +void UnitTestImpl::ShuffleTests() { + // Shuffles the death test cases. + ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_); + + // Shuffles the non-death test cases. + ShuffleRange(random(), last_death_test_case_ + 1, + static_cast<int>(test_cases_.size()), &test_case_indices_); + + // Shuffles the tests inside each test case. + for (size_t i = 0; i < test_cases_.size(); i++) { + test_cases_[i]->ShuffleTests(random()); + } +} + +// Restores the test cases and tests to their order before the first shuffle. +void UnitTestImpl::UnshuffleTests() { + for (size_t i = 0; i < test_cases_.size(); i++) { + // Unshuffles the tests in each test case. + test_cases_[i]->UnshuffleTests(); + // Resets the index of each test case. + test_case_indices_[i] = static_cast<int>(i); + } +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. 
+std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, + int skip_count) { + // We pass skip_count + 1 to skip this wrapper function in addition + // to what the user really wants to skip. + return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); +} + +// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to +// suppress unreachable code warnings. +namespace { +class ClassUniqueToAlwaysTrue {}; +} + +bool IsTrue(bool condition) { return condition; } + +bool AlwaysTrue() { +#if GTEST_HAS_EXCEPTIONS + // This condition is always false so AlwaysTrue() never actually throws, + // but it makes the compiler think that it may throw. + if (IsTrue(false)) + throw ClassUniqueToAlwaysTrue(); +#endif // GTEST_HAS_EXCEPTIONS + return true; +} + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +bool SkipPrefix(const char* prefix, const char** pstr) { + const size_t prefix_len = strlen(prefix); + if (strncmp(*pstr, prefix, prefix_len) == 0) { + *pstr += prefix_len; + return true; + } + return false; +} + +// Parses a string as a command line flag. The string should have +// the format "--flag=value". When def_optional is true, the "=value" +// part can be omitted. +// +// Returns the value of the flag, or NULL if the parsing failed. +const char* ParseFlagValue(const char* str, + const char* flag, + bool def_optional) { + // str and flag must not be NULL. + if (str == NULL || flag == NULL) return NULL; + + // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. + const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; + const size_t flag_len = flag_str.length(); + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; + + // Skips the flag name. 
+ const char* flag_end = str + flag_len; + + // When def_optional is true, it's OK to not have a "=value" part. + if (def_optional && (flag_end[0] == '\0')) { + return flag_end; + } + + // If def_optional is true and there are more characters after the + // flag name, or if def_optional is false, there must be a '=' after + // the flag name. + if (flag_end[0] != '=') return NULL; + + // Returns the string after "=". + return flag_end + 1; +} + +// Parses a string for a bool flag, in the form of either +// "--flag=value" or "--flag". +// +// In the former case, the value is taken as true as long as it does +// not start with '0', 'f', or 'F'. +// +// In the latter case, the value is taken as true. +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseBoolFlag(const char* str, const char* flag, bool* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, true); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Converts the string value to a bool. + *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); + return true; +} + +// Parses a string for an Int32 flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + return ParseInt32(Message() << "The value of flag --" << flag, + value_str, value); +} + +// Parses a string for a string flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. 
On failure, returns false without changing *value. +bool ParseStringFlag(const char* str, const char* flag, std::string* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + *value = value_str; + return true; +} + +// Determines whether a string has a prefix that Google Test uses for its +// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. +// If Google Test detects that a command line flag has its prefix but is not +// recognized, it will print its help message. Flags starting with +// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test +// internal flags and do not trigger the help message. +static bool HasGoogleTestFlagPrefix(const char* str) { + return (SkipPrefix("--", &str) || + SkipPrefix("-", &str) || + SkipPrefix("/", &str)) && + !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && + (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || + SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); +} + +// Prints a string containing code-encoded text. The following escape +// sequences can be used in the string to control the text color: +// +// @@ prints a single '@' character. +// @R changes the color to red. +// @G changes the color to green. +// @Y changes the color to yellow. +// @D changes to the default terminal text color. +// +// TODO(wan@google.com): Write tests for this once we add stdout +// capturing to Google Test. +static void PrintColorEncoded(const char* str) { + GTestColor color = COLOR_DEFAULT; // The current color. + + // Conceptually, we split the string into segments divided by escape + // sequences. Then we print one segment at a time. At the end of + // each iteration, the str pointer advances to the beginning of the + // next segment. 
+ for (;;) { + const char* p = strchr(str, '@'); + if (p == NULL) { + ColoredPrintf(color, "%s", str); + return; + } + + ColoredPrintf(color, "%s", std::string(str, p).c_str()); + + const char ch = p[1]; + str = p + 2; + if (ch == '@') { + ColoredPrintf(color, "@"); + } else if (ch == 'D') { + color = COLOR_DEFAULT; + } else if (ch == 'R') { + color = COLOR_RED; + } else if (ch == 'G') { + color = COLOR_GREEN; + } else if (ch == 'Y') { + color = COLOR_YELLOW; + } else { + --str; + } + } +} + +static const char kColorEncodedHelpMessage[] = +"This program contains tests written using " GTEST_NAME_ ". You can use the\n" +"following command line flags to control its behavior:\n" +"\n" +"Test Selection:\n" +" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" +" List the names of all tests instead of running them. The name of\n" +" TEST(Foo, Bar) is \"Foo.Bar\".\n" +" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" + "[@G-@YNEGATIVE_PATTERNS]@D\n" +" Run only the tests whose name matches one of the positive patterns but\n" +" none of the negative patterns. '?' matches any single character; '*'\n" +" matches any substring; ':' separates two patterns.\n" +" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" +" Run all disabled tests too.\n" +"\n" +"Test Execution:\n" +" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" +" Run the tests repeatedly; use a negative count to repeat forever.\n" +" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" +" Randomize tests' orders on every iteration.\n" +" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" +" Random number seed to use for shuffling test orders (between 1 and\n" +" 99999, or 0 to use a seed based on the current time).\n" +"\n" +"Test Output:\n" +" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" +" Enable/disable colored output. 
The default is @Gauto@D.\n" +" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" +" Don't print the elapsed time of each test.\n" +" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G" + GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" +" Generate an XML report in the given directory or with the given file\n" +" name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n" +#if GTEST_CAN_STREAM_RESULTS_ +" @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" +" Stream test results to the given server.\n" +#endif // GTEST_CAN_STREAM_RESULTS_ +"\n" +"Assertion Behavior:\n" +#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +" @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" +" Set the default death test style.\n" +#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +" @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" +" Turn assertion failures into debugger break-points.\n" +" @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n" +" Turn assertion failures into C++ exceptions.\n" +" @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n" +" Do not report exceptions as test failures. Instead, allow them\n" +" to crash the program or throw a pop-up (on Windows).\n" +"\n" +"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set " + "the corresponding\n" +"environment variable of a flag (all letters in upper-case). For example, to\n" +"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_ + "color=no@D or set\n" +"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n" +"\n" +"For more information, please read the " GTEST_NAME_ " documentation at\n" +"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n" +"(not one in your own code or tests), please report it to\n" +"@G<" GTEST_DEV_EMAIL_ ">@D.\n"; + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. The type parameter CharType can be +// instantiated to either char or wchar_t. 
+template <typename CharType> +void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { + for (int i = 1; i < *argc; i++) { + const std::string arg_string = StreamableToString(argv[i]); + const char* const arg = arg_string.c_str(); + + using internal::ParseBoolFlag; + using internal::ParseInt32Flag; + using internal::ParseStringFlag; + + // Do we see a Google Test flag? + if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, + >EST_FLAG(also_run_disabled_tests)) || + ParseBoolFlag(arg, kBreakOnFailureFlag, + >EST_FLAG(break_on_failure)) || + ParseBoolFlag(arg, kCatchExceptionsFlag, + >EST_FLAG(catch_exceptions)) || + ParseStringFlag(arg, kColorFlag, >EST_FLAG(color)) || + ParseStringFlag(arg, kDeathTestStyleFlag, + >EST_FLAG(death_test_style)) || + ParseBoolFlag(arg, kDeathTestUseFork, + >EST_FLAG(death_test_use_fork)) || + ParseStringFlag(arg, kFilterFlag, >EST_FLAG(filter)) || + ParseStringFlag(arg, kInternalRunDeathTestFlag, + >EST_FLAG(internal_run_death_test)) || + ParseBoolFlag(arg, kListTestsFlag, >EST_FLAG(list_tests)) || + ParseStringFlag(arg, kOutputFlag, >EST_FLAG(output)) || + ParseBoolFlag(arg, kPrintTimeFlag, >EST_FLAG(print_time)) || + ParseInt32Flag(arg, kRandomSeedFlag, >EST_FLAG(random_seed)) || + ParseInt32Flag(arg, kRepeatFlag, >EST_FLAG(repeat)) || + ParseBoolFlag(arg, kShuffleFlag, >EST_FLAG(shuffle)) || + ParseInt32Flag(arg, kStackTraceDepthFlag, + >EST_FLAG(stack_trace_depth)) || + ParseStringFlag(arg, kStreamResultToFlag, + >EST_FLAG(stream_result_to)) || + ParseBoolFlag(arg, kThrowOnFailureFlag, + >EST_FLAG(throw_on_failure)) + ) { + // Yes. Shift the remainder of the argv list left by one. Note + // that argv has (*argc + 1) elements, the last one always being + // NULL. The following loop moves the trailing NULL element as + // well. + for (int j = i; j != *argc; j++) { + argv[j] = argv[j + 1]; + } + + // Decrements the argument count. + (*argc)--; + + // We also need to decrement the iterator as we just removed + // an element. 
+ i--; + } else if (arg_string == "--help" || arg_string == "-h" || + arg_string == "-?" || arg_string == "/?" || + HasGoogleTestFlagPrefix(arg)) { + // Both help flag and unrecognized Google Test flags (excluding + // internal ones) trigger help display. + g_help_flag = true; + } + } + + if (g_help_flag) { + // We print the help here instead of in RUN_ALL_TESTS(), as the + // latter may not be called at all if the user is using Google + // Test with another testing framework. + PrintColorEncoded(kColorEncodedHelpMessage); + } +} + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. +void ParseGoogleTestFlagsOnly(int* argc, char** argv) { + ParseGoogleTestFlagsOnlyImpl(argc, argv); +} +void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) { + ParseGoogleTestFlagsOnlyImpl(argc, argv); +} + +// The internal implementation of InitGoogleTest(). +// +// The type parameter CharType can be instantiated to either char or +// wchar_t. +template <typename CharType> +void InitGoogleTestImpl(int* argc, CharType** argv) { + g_init_gtest_count++; + + // We don't want to run the initialization code twice. + if (g_init_gtest_count != 1) return; + + if (*argc <= 0) return; + + internal::g_executable_path = internal::StreamableToString(argv[0]); + +#if GTEST_HAS_DEATH_TEST + + g_argvs.clear(); + for (int i = 0; i != *argc; i++) { + g_argvs.push_back(StreamableToString(argv[i])); + } + +#endif // GTEST_HAS_DEATH_TEST + + ParseGoogleTestFlagsOnly(argc, argv); + GetUnitTestImpl()->PostFlagParsingInit(); +} + +} // namespace internal + +// Initializes Google Test. This must be called before calling +// RUN_ALL_TESTS(). In particular, it parses a command line for the +// flags that Google Test recognizes. Whenever a Google Test flag is +// seen, it is removed from argv, and *argc is decremented. +// +// No value is returned. Instead, the Google Test flag variables are +// updated. 
+// +// Calling the function for the second time has no user-visible effect. +void InitGoogleTest(int* argc, char** argv) { + internal::InitGoogleTestImpl(argc, argv); +} + +// This overloaded version can be used in Windows programs compiled in +// UNICODE mode. +void InitGoogleTest(int* argc, wchar_t** argv) { + internal::InitGoogleTestImpl(argc, argv); +} + +} // namespace testing +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) +// +// This file implements death tests. + + +#if GTEST_HAS_DEATH_TEST + +# if GTEST_OS_MAC +# include <crt_externs.h> +# endif // GTEST_OS_MAC + +# include <errno.h> +# include <fcntl.h> +# include <limits.h> + +# if GTEST_OS_LINUX +# include <signal.h> +# endif // GTEST_OS_LINUX + +# include <stdarg.h> + +# if GTEST_OS_WINDOWS +# include <windows.h> +# else +# include <sys/mman.h> +# include <sys/wait.h> +# endif // GTEST_OS_WINDOWS + +# if GTEST_OS_QNX +# include <spawn.h> +# endif // GTEST_OS_QNX + +#endif // GTEST_HAS_DEATH_TEST + + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +#undef GTEST_IMPLEMENTATION_ + +namespace testing { + +// Constants. + +// The default death test style. +static const char kDefaultDeathTestStyle[] = "fast"; + +GTEST_DEFINE_string_( + death_test_style, + internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle), + "Indicates how to run a death test in a forked child process: " + "\"threadsafe\" (child process re-executes the test binary " + "from the beginning, running only the specific death test) or " + "\"fast\" (child process runs the death test immediately " + "after forking)."); + +GTEST_DEFINE_bool_( + death_test_use_fork, + internal::BoolFromGTestEnv("death_test_use_fork", false), + "Instructs to use fork()/_exit() instead of clone() in death tests. " + "Ignored and always uses fork() on POSIX systems where clone() is not " + "implemented. Useful when running under valgrind or similar tools if " + "those do not support clone(). Valgrind 3.3.1 will just fail if " + "it sees an unsupported combination of clone() flags. 
" + "It is not recommended to use this flag w/o valgrind though it will " + "work in 99% of the cases. Once valgrind is fixed, this flag will " + "most likely be removed."); + +namespace internal { +GTEST_DEFINE_string_( + internal_run_death_test, "", + "Indicates the file, line number, temporal index of " + "the single death test to run, and a file descriptor to " + "which a success code may be sent, all separated by " + "the '|' characters. This flag is specified if and only if the current " + "process is a sub-process launched for running a thread-safe " + "death test. FOR INTERNAL USE ONLY."); +} // namespace internal + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Valid only for fast death tests. Indicates the code is running in the +// child process of a fast style death test. +static bool g_in_fast_death_test_child = false; + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +bool InDeathTestChild() { +# if GTEST_OS_WINDOWS + + // On Windows, death tests are thread-safe regardless of the value of the + // death_test_style flag. + return !GTEST_FLAG(internal_run_death_test).empty(); + +# else + + if (GTEST_FLAG(death_test_style) == "threadsafe") + return !GTEST_FLAG(internal_run_death_test).empty(); + else + return g_in_fast_death_test_child; +#endif +} + +} // namespace internal + +// ExitedWithCode constructor. +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { +} + +// ExitedWithCode function-call operator. 
+bool ExitedWithCode::operator()(int exit_status) const { +# if GTEST_OS_WINDOWS + + return exit_status == exit_code_; + +# else + + return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; + +# endif // GTEST_OS_WINDOWS +} + +# if !GTEST_OS_WINDOWS +// KilledBySignal constructor. +KilledBySignal::KilledBySignal(int signum) : signum_(signum) { +} + +// KilledBySignal function-call operator. +bool KilledBySignal::operator()(int exit_status) const { + return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; +} +# endif // !GTEST_OS_WINDOWS + +namespace internal { + +// Utilities needed for death tests. + +// Generates a textual description of a given exit code, in the format +// specified by wait(2). +static std::string ExitSummary(int exit_code) { + Message m; + +# if GTEST_OS_WINDOWS + + m << "Exited with exit status " << exit_code; + +# else + + if (WIFEXITED(exit_code)) { + m << "Exited with exit status " << WEXITSTATUS(exit_code); + } else if (WIFSIGNALED(exit_code)) { + m << "Terminated by signal " << WTERMSIG(exit_code); + } +# ifdef WCOREDUMP + if (WCOREDUMP(exit_code)) { + m << " (core dumped)"; + } +# endif +# endif // GTEST_OS_WINDOWS + + return m.GetString(); +} + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +bool ExitedUnsuccessfully(int exit_status) { + return !ExitedWithCode(0)(exit_status); +} + +# if !GTEST_OS_WINDOWS +// Generates a textual failure message when a death test finds more than +// one thread running, or cannot determine the number of threads, prior +// to executing the given statement. It is the responsibility of the +// caller not to pass a thread_count of 1. +static std::string DeathTestThreadWarning(size_t thread_count) { + Message msg; + msg << "Death tests use fork(), which is unsafe particularly" + << " in a threaded context. 
For this test, " << GTEST_NAME_ << " "; + if (thread_count == 0) + msg << "couldn't detect the number of threads."; + else + msg << "detected " << thread_count << " threads."; + return msg.GetString(); +} +# endif // !GTEST_OS_WINDOWS + +// Flag characters for reporting a death test that did not die. +static const char kDeathTestLived = 'L'; +static const char kDeathTestReturned = 'R'; +static const char kDeathTestThrew = 'T'; +static const char kDeathTestInternalError = 'I'; + +// An enumeration describing all of the possible ways that a death test can +// conclude. DIED means that the process died while executing the test +// code; LIVED means that process lived beyond the end of the test code; +// RETURNED means that the test statement attempted to execute a return +// statement, which is not allowed; THREW means that the test statement +// returned control by throwing an exception. IN_PROGRESS means the test +// has not yet concluded. +// TODO(vladl@google.com): Unify names and possibly values for +// AbortReason, DeathTestOutcome, and flag characters above. +enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; + +// Routine for aborting the program which is safe to call from an +// exec-style death test child process, in which case the error +// message is propagated back to the parent process. Otherwise, the +// message is simply printed to stderr. In either case, the program +// then exits with status 1. +void DeathTestAbort(const std::string& message) { + // On a POSIX system, this function may be called from a threadsafe-style + // death test child process, which operates on a very small stack. Use + // the heap for any additional non-minuscule memory requirements. 
+ const InternalRunDeathTestFlag* const flag = + GetUnitTestImpl()->internal_run_death_test_flag(); + if (flag != NULL) { + FILE* parent = posix::FDOpen(flag->write_fd(), "w"); + fputc(kDeathTestInternalError, parent); + fprintf(parent, "%s", message.c_str()); + fflush(parent); + _exit(1); + } else { + fprintf(stderr, "%s", message.c_str()); + fflush(stderr); + posix::Abort(); + } +} + +// A replacement for CHECK that calls DeathTestAbort if the assertion +// fails. +# define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort( \ + ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ + + ::testing::internal::StreamableToString(__LINE__) + ": " \ + + #expression); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for +// evaluating any system call that fulfills two conditions: it must return +// -1 on failure, and set errno to EINTR when it is interrupted and +// should be tried again. The macro expands to a loop that repeatedly +// evaluates the expression as long as it evaluates to -1 and sets +// errno to EINTR. If the expression evaluates to -1 but errno is +// something other than EINTR, DeathTestAbort is called. +# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort( \ + ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ + + ::testing::internal::StreamableToString(__LINE__) + ": " \ + + #expression + " != -1"); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// Returns the message describing the last system error in errno. +std::string GetLastErrnoDescription() { + return errno == 0 ? 
"" : posix::StrError(errno); +} + +// This is called from a death test parent process to read a failure +// message from the death test child process and log it with the FATAL +// severity. On Windows, the message is read from a pipe handle. On other +// platforms, it is read from a file descriptor. +static void FailFromInternalError(int fd) { + Message error; + char buffer[256]; + int num_read; + + do { + while ((num_read = posix::Read(fd, buffer, 255)) > 0) { + buffer[num_read] = '\0'; + error << buffer; + } + } while (num_read == -1 && errno == EINTR); + + if (num_read == 0) { + GTEST_LOG_(FATAL) << error.GetString(); + } else { + const int last_error = errno; + GTEST_LOG_(FATAL) << "Error while reading death test internal: " + << GetLastErrnoDescription() << " [" << last_error << "]"; + } +} + +// Death test constructor. Increments the running death test count +// for the current test. +DeathTest::DeathTest() { + TestInfo* const info = GetUnitTestImpl()->current_test_info(); + if (info == NULL) { + DeathTestAbort("Cannot run a death test outside of a TEST or " + "TEST_F construct"); + } +} + +// Creates and returns a death test by dispatching to the current +// death test factory. +bool DeathTest::Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test) { + return GetUnitTestImpl()->death_test_factory()->Create( + statement, regex, file, line, test); +} + +const char* DeathTest::LastMessage() { + return last_death_test_message_.c_str(); +} + +void DeathTest::set_last_death_test_message(const std::string& message) { + last_death_test_message_ = message; +} + +std::string DeathTest::last_death_test_message_; + +// Provides cross platform implementation for some death functionality. 
+class DeathTestImpl : public DeathTest { + protected: + DeathTestImpl(const char* a_statement, const RE* a_regex) + : statement_(a_statement), + regex_(a_regex), + spawned_(false), + status_(-1), + outcome_(IN_PROGRESS), + read_fd_(-1), + write_fd_(-1) {} + + // read_fd_ is expected to be closed and cleared by a derived class. + ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } + + void Abort(AbortReason reason); + virtual bool Passed(bool status_ok); + + const char* statement() const { return statement_; } + const RE* regex() const { return regex_; } + bool spawned() const { return spawned_; } + void set_spawned(bool is_spawned) { spawned_ = is_spawned; } + int status() const { return status_; } + void set_status(int a_status) { status_ = a_status; } + DeathTestOutcome outcome() const { return outcome_; } + void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; } + int read_fd() const { return read_fd_; } + void set_read_fd(int fd) { read_fd_ = fd; } + int write_fd() const { return write_fd_; } + void set_write_fd(int fd) { write_fd_ = fd; } + + // Called in the parent process only. Reads the result code of the death + // test child process via a pipe, interprets it to set the outcome_ + // member, and closes read_fd_. Outputs diagnostics and terminates in + // case of unexpected codes. + void ReadAndInterpretStatusByte(); + + private: + // The textual content of the code this object is testing. This class + // doesn't own this string and should not attempt to delete it. + const char* const statement_; + // The regular expression which test output must match. DeathTestImpl + // doesn't own this object and should not attempt to delete it. + const RE* const regex_; + // True if the death test child process has been successfully spawned. + bool spawned_; + // The exit status of the child process. + int status_; + // How the death test concluded. + DeathTestOutcome outcome_; + // Descriptor to the read end of the pipe to the child process. 
It is + // always -1 in the child process. The child keeps its write end of the + // pipe in write_fd_. + int read_fd_; + // Descriptor to the child's write end of the pipe to the parent process. + // It is always -1 in the parent process. The parent keeps its end of the + // pipe in read_fd_. + int write_fd_; +}; + +// Called in the parent process only. Reads the result code of the death +// test child process via a pipe, interprets it to set the outcome_ +// member, and closes read_fd_. Outputs diagnostics and terminates in +// case of unexpected codes. +void DeathTestImpl::ReadAndInterpretStatusByte() { + char flag; + int bytes_read; + + // The read() here blocks until data is available (signifying the + // failure of the death test) or until the pipe is closed (signifying + // its success), so it's okay to call this in the parent before + // the child process has exited. + do { + bytes_read = posix::Read(read_fd(), &flag, 1); + } while (bytes_read == -1 && errno == EINTR); + + if (bytes_read == 0) { + set_outcome(DIED); + } else if (bytes_read == 1) { + switch (flag) { + case kDeathTestReturned: + set_outcome(RETURNED); + break; + case kDeathTestThrew: + set_outcome(THREW); + break; + case kDeathTestLived: + set_outcome(LIVED); + break; + case kDeathTestInternalError: + FailFromInternalError(read_fd()); // Does not return. + break; + default: + GTEST_LOG_(FATAL) << "Death test child process reported " + << "unexpected status byte (" + << static_cast<unsigned int>(flag) << ")"; + } + } else { + GTEST_LOG_(FATAL) << "Read from death test child process failed: " + << GetLastErrnoDescription(); + } + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); + set_read_fd(-1); +} + +// Signals that the death test code which should have exited, didn't. +// Should be called only in a death test child process. +// Writes a status byte to the child's status file descriptor, then +// calls _exit(1). 
+void DeathTestImpl::Abort(AbortReason reason) { + // The parent process considers the death test to be a failure if + // it finds any data in our pipe. So, here we write a single flag byte + // to the pipe, then exit. + const char status_ch = + reason == TEST_DID_NOT_DIE ? kDeathTestLived : + reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; + + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); + // We are leaking the descriptor here because on some platforms (i.e., + // when built as Windows DLL), destructors of global objects will still + // run after calling _exit(). On such systems, write_fd_ will be + // indirectly closed from the destructor of UnitTestImpl, causing double + // close if it is also closed here. On debug configurations, double close + // may assert. As there are no in-process buffers to flush here, we are + // relying on the OS to close the descriptor after the process terminates + // when the destructors are not run. + _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) +} + +// Returns an indented copy of stderr output for a death test. +// This makes distinguishing death test output lines from regular log lines +// much easier. +static ::std::string FormatDeathTestOutput(const ::std::string& output) { + ::std::string ret; + for (size_t at = 0; ; ) { + const size_t line_end = output.find('\n', at); + ret += "[ DEATH ] "; + if (line_end == ::std::string::npos) { + ret += output.substr(at); + break; + } + ret += output.substr(at, line_end + 1 - at); + at = line_end + 1; + } + return ret; +} + +// Assesses the success or failure of a death test, using both private +// members which have previously been set, and one argument: +// +// Private data members: +// outcome: An enumeration describing how the death test +// concluded: DIED, LIVED, THREW, or RETURNED. The death test +// fails in the latter three cases. +// status: The exit status of the child process. 
On *nix, it is in the +// in the format specified by wait(2). On Windows, this is the +// value supplied to the ExitProcess() API or a numeric code +// of the exception that terminated the program. +// regex: A regular expression object to be applied to +// the test's captured standard error output; the death test +// fails if it does not match. +// +// Argument: +// status_ok: true if exit_status is acceptable in the context of +// this particular death test, which fails if it is false +// +// Returns true iff all of the above conditions are met. Otherwise, the +// first failing condition, in the order given above, is the one that is +// reported. Also sets the last death test message string. +bool DeathTestImpl::Passed(bool status_ok) { + if (!spawned()) + return false; + + const std::string error_message = GetCapturedStderr(); + + bool success = false; + Message buffer; + + buffer << "Death test: " << statement() << "\n"; + switch (outcome()) { + case LIVED: + buffer << " Result: failed to die.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case THREW: + buffer << " Result: threw an exception.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case RETURNED: + buffer << " Result: illegal return in test statement.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case DIED: + if (status_ok) { + const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); + if (matched) { + success = true; + } else { + buffer << " Result: died but not with expected error.\n" + << " Expected: " << regex()->pattern() << "\n" + << "Actual msg:\n" << FormatDeathTestOutput(error_message); + } + } else { + buffer << " Result: died but not with expected exit code:\n" + << " " << ExitSummary(status()) << "\n" + << "Actual msg:\n" << FormatDeathTestOutput(error_message); + } + break; + case IN_PROGRESS: + default: + GTEST_LOG_(FATAL) + << "DeathTest::Passed somehow called before conclusion of test"; 
+ } + + DeathTest::set_last_death_test_message(buffer.GetString()); + return success; +} + +# if GTEST_OS_WINDOWS +// WindowsDeathTest implements death tests on Windows. Due to the +// specifics of starting new processes on Windows, death tests there are +// always threadsafe, and Google Test considers the +// --gtest_death_test_style=fast setting to be equivalent to +// --gtest_death_test_style=threadsafe there. +// +// A few implementation notes: Like the Linux version, the Windows +// implementation uses pipes for child-to-parent communication. But due to +// the specifics of pipes on Windows, some extra steps are required: +// +// 1. The parent creates a communication pipe and stores handles to both +// ends of it. +// 2. The parent starts the child and provides it with the information +// necessary to acquire the handle to the write end of the pipe. +// 3. The child acquires the write end of the pipe and signals the parent +// using a Windows event. +// 4. Now the parent can release the write end of the pipe on its side. If +// this is done before step 3, the object's reference count goes down to +// 0 and it is destroyed, preventing the child from acquiring it. The +// parent now has to release it, or read operations on the read end of +// the pipe will not return when the child terminates. +// 5. The parent reads child's output through the pipe (outcome code and +// any possible error messages) from the pipe, and its stderr and then +// determines whether to fail the test. +// +// Note: to distinguish Win32 API calls from the local method and function +// calls, the former are explicitly resolved in the global namespace. +// +class WindowsDeathTest : public DeathTestImpl { + public: + WindowsDeathTest(const char* a_statement, + const RE* a_regex, + const char* file, + int line) + : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} + + // All of these virtual functions are inherited from DeathTest. 
+ virtual int Wait(); + virtual TestRole AssumeRole(); + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; + // Handle to the write end of the pipe to the child process. + AutoHandle write_handle_; + // Child process handle. + AutoHandle child_handle_; + // Event the child process uses to signal the parent that it has + // acquired the handle to the write end of the pipe. After seeing this + // event the parent can release its own handles to make sure its + // ReadFile() calls return when the child terminates. + AutoHandle event_handle_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int WindowsDeathTest::Wait() { + if (!spawned()) + return 0; + + // Wait until the child either signals that it has acquired the write end + // of the pipe or it dies. + const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; + switch (::WaitForMultipleObjects(2, + wait_handles, + FALSE, // Waits for any of the handles. + INFINITE)) { + case WAIT_OBJECT_0: + case WAIT_OBJECT_0 + 1: + break; + default: + GTEST_DEATH_TEST_CHECK_(false); // Should not get here. + } + + // The child has acquired the write end of the pipe or exited. + // We release the handle on our side and continue. + write_handle_.Reset(); + event_handle_.Reset(); + + ReadAndInterpretStatusByte(); + + // Waits for the child process to exit if it haven't already. This + // returns immediately if the child has already exited, regardless of + // whether previous calls to WaitForMultipleObjects synchronized on this + // handle or not. 
+ GTEST_DEATH_TEST_CHECK_( + WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), + INFINITE)); + DWORD status_code; + GTEST_DEATH_TEST_CHECK_( + ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); + child_handle_.Reset(); + set_status(static_cast<int>(status_code)); + return status(); +} + +// The AssumeRole process for a Windows death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole WindowsDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != NULL) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + // WindowsDeathTest uses an anonymous pipe to communicate results of + // a death test. + SECURITY_ATTRIBUTES handles_are_inheritable = { + sizeof(SECURITY_ATTRIBUTES), NULL, TRUE }; + HANDLE read_handle, write_handle; + GTEST_DEATH_TEST_CHECK_( + ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, + 0) // Default buffer size. + != FALSE); + set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), + O_RDONLY)); + write_handle_.Reset(write_handle); + event_handle_.Reset(::CreateEvent( + &handles_are_inheritable, + TRUE, // The event will automatically reset to non-signaled state. + FALSE, // The initial state is non-signalled. + NULL)); // The even is unnamed. 
+ GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL); + const std::string filter_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + + info->test_case_name() + "." + info->name(); + const std::string internal_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + + "=" + file_ + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) + + // size_t has the same width as pointers on both 32-bit and 64-bit + // Windows platforms. + // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. + "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + + "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get())); + + char executable_path[_MAX_PATH + 1]; // NOLINT + GTEST_DEATH_TEST_CHECK_( + _MAX_PATH + 1 != ::GetModuleFileNameA(NULL, + executable_path, + _MAX_PATH)); + + std::string command_line = + std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + + internal_flag + "\""; + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // The child process will share the standard handles with the parent. + STARTUPINFOA startup_info; + memset(&startup_info, 0, sizeof(STARTUPINFO)); + startup_info.dwFlags = STARTF_USESTDHANDLES; + startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); + startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); + startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); + + PROCESS_INFORMATION process_info; + GTEST_DEATH_TEST_CHECK_(::CreateProcessA( + executable_path, + const_cast<char*>(command_line.c_str()), + NULL, // Retuned process handle is not inheritable. + NULL, // Retuned thread handle is not inheritable. + TRUE, // Child inherits all inheritable handles (for write_handle_). + 0x0, // Default creation flags. 
+ NULL, // Inherit the parent's environment. + UnitTest::GetInstance()->original_working_dir(), + &startup_info, + &process_info) != FALSE); + child_handle_.Reset(process_info.hProcess); + ::CloseHandle(process_info.hThread); + set_spawned(true); + return OVERSEE_TEST; +} +# else // We are not on Windows. + +// ForkingDeathTest provides implementations for most of the abstract +// methods of the DeathTest interface. Only the AssumeRole method is +// left undefined. +class ForkingDeathTest : public DeathTestImpl { + public: + ForkingDeathTest(const char* statement, const RE* regex); + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + + protected: + void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } + + private: + // PID of child process during death test; 0 in the child process itself. + pid_t child_pid_; +}; + +// Constructs a ForkingDeathTest. +ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex) + : DeathTestImpl(a_statement, a_regex), + child_pid_(-1) {} + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int ForkingDeathTest::Wait() { + if (!spawned()) + return 0; + + ReadAndInterpretStatusByte(); + + int status_value; + GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); + set_status(status_value); + return status_value; +} + +// A concrete death test class that forks, then immediately runs the test +// in the child process. +class NoExecDeathTest : public ForkingDeathTest { + public: + NoExecDeathTest(const char* a_statement, const RE* a_regex) : + ForkingDeathTest(a_statement, a_regex) { } + virtual TestRole AssumeRole(); +}; + +// The AssumeRole process for a fork-and-run death test. It implements a +// straightforward fork, with a simple pipe to transmit the status byte. 
+DeathTest::TestRole NoExecDeathTest::AssumeRole() { + const size_t thread_count = GetThreadCount(); + if (thread_count != 1) { + GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + + DeathTest::set_last_death_test_message(""); + CaptureStderr(); + // When we fork the process below, the log file buffers are copied, but the + // file descriptors are shared. We flush all log files here so that closing + // the file descriptors in the child process doesn't throw off the + // synchronization between descriptors and buffers in the parent process. + // This is as close to the fork as possible to avoid a race condition in case + // there are multiple threads running before the death test, and another + // thread writes to the log file. + FlushInfoLog(); + + const pid_t child_pid = fork(); + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + set_child_pid(child_pid); + if (child_pid == 0) { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); + set_write_fd(pipe_fd[1]); + // Redirects all logging to stderr in the child process to prevent + // concurrent writes to the log files. We capture stderr in the parent + // process and append the child process' output to a log. + LogToStderr(); + // Event forwarding to the listeners of event listener API mush be shut + // down in death test subprocesses. + GetUnitTestImpl()->listeners()->SuppressEventForwarding(); + g_in_fast_death_test_child = true; + return EXECUTE_TEST; + } else { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; + } +} + +// A concrete death test class that forks and re-executes the main +// program from the beginning, with command-line flags set that cause +// only this specific death test to be run. 
// The "threadsafe" death test: instead of running the statement in a
// fork()ed copy of the current process, it re-executes the whole test
// binary from scratch with flags selecting just this death test.
class ExecDeathTest : public ForkingDeathTest {
 public:
  ExecDeathTest(const char* a_statement, const RE* a_regex,
                const char* file, int line) :
      ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
  virtual TestRole AssumeRole();
 private:
  // Returns the command-line arguments to pass to the re-executed child
  // process (a copy of the injectable argv list).
  static ::std::vector<testing::internal::string>
  GetArgvsForDeathTestChildProcess() {
    ::std::vector<testing::internal::string> args = GetInjectableArgvs();
    return args;
  }
  // The name of the file in which the death test is located.
  const char* const file_;
  // The line number on which the death test is located.
  const int line_;
};

// Utility class for accumulating command-line arguments.
// The underlying vector always ends with a NULL sentinel so that Argv()
// can be passed directly to exec-family functions.
class Arguments {
 public:
  Arguments() {
    args_.push_back(NULL);
  }

  ~Arguments() {
    // Each stored argument was allocated with posix::StrDup.
    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
         ++i) {
      free(*i);
    }
  }
  // Appends a copy of the argument just before the trailing NULL.
  void AddArgument(const char* argument) {
    args_.insert(args_.end() - 1, posix::StrDup(argument));
  }

  // Appends copies of all arguments just before the trailing NULL.
  template <typename Str>
  void AddArguments(const ::std::vector<Str>& arguments) {
    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
         i != arguments.end();
         ++i) {
      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
    }
  }
  // Returns a NULL-terminated argv-style array (valid while this object
  // is alive and unmodified).
  char* const* Argv() {
    return &args_[0];
  }

 private:
  std::vector<char*> args_;
};

// A struct that encompasses the arguments to the child process of a
// threadsafe-style death test process.
struct ExecDeathTestArgs {
  char* const* argv;  // Command-line arguments for the child's call to exec
  int close_fd;       // File descriptor to close; the read end of a pipe
};

# if GTEST_OS_MAC
inline char** GetEnviron() {
  // When Google Test is built as a framework on MacOS X, the environ variable
  // is unavailable. Apple's documentation (man environ) recommends using
  // _NSGetEnviron() instead.
  return *_NSGetEnviron();
}
# else
// Some POSIX platforms expect you to declare environ. extern "C" makes
// it reside in the global namespace.
extern "C" char** environ;
inline char** GetEnviron() { return environ; }
# endif  // GTEST_OS_MAC

# if !GTEST_OS_QNX
// The main function for a threadsafe-style death test child process.
// This function is called in a clone()-ed process and thus must avoid
// any potentially unsafe operations like malloc or libc functions.
static int ExecDeathTestChildMain(void* child_arg) {
  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));

  // We need to execute the test program in the same environment where
  // it was originally invoked.  Therefore we change to the original
  // working directory first.
  const char* const original_dir =
      UnitTest::GetInstance()->original_working_dir();
  // We can safely call chdir() as it's a direct system call.
  if (chdir(original_dir) != 0) {
    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
                   GetLastErrnoDescription());
    return EXIT_FAILURE;
  }

  // We can safely call execve() as it's a direct system call.  We
  // cannot use execvp() as it's a libc function and thus potentially
  // unsafe.  Since execve() doesn't search the PATH, the user must
  // invoke the test program via a valid path that contains at least
  // one path separator.
  execve(args->argv[0], args->argv, GetEnviron());
  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
                 original_dir + " failed: " +
                 GetLastErrnoDescription());
  return EXIT_FAILURE;
}
# endif  // !GTEST_OS_QNX

// Two utility routines that together determine the direction the stack
// grows.
// This could be accomplished more elegantly by a single recursive
// function, but we want to guard against the unlikely possibility of
// a smart compiler optimizing the recursion away.
//
// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
// StackLowerThanAddress into StackGrowsDown, which then doesn't give
// correct answer.
void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_;
void StackLowerThanAddress(const void* ptr, bool* result) {
  // `dummy` lives in this (deeper) stack frame; comparing its address to
  // an address from the caller's frame reveals the growth direction.
  int dummy;
  *result = (&dummy < ptr);
}

// Returns true iff the process stack grows towards lower addresses.
bool StackGrowsDown() {
  int dummy;
  bool result;
  StackLowerThanAddress(&dummy, &result);
  return result;
}

// Spawns a child process with the same executable as the current process in
// a thread-safe manner and instructs it to run the death test.  The
// implementation uses fork(2) + exec.  On systems where clone(2) is
// available, it is used instead, being slightly more thread-safe.  On QNX,
// fork supports only single-threaded environments, so this function uses
// spawn(2) there instead.  The function dies with an error message if
// anything goes wrong.
static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
  ExecDeathTestArgs args = { argv, close_fd };
  pid_t child_pid = -1;

# if GTEST_OS_QNX
  // Obtains the current directory and sets it to be closed in the child
  // process.
  const int cwd_fd = open(".", O_RDONLY);
  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
  // We need to execute the test program in the same environment where
  // it was originally invoked.  Therefore we change to the original
  // working directory first.
  const char* const original_dir =
      UnitTest::GetInstance()->original_working_dir();
  // We can safely call chdir() as it's a direct system call.
  if (chdir(original_dir) != 0) {
    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
                   GetLastErrnoDescription());
    return EXIT_FAILURE;
  }

  int fd_flags;
  // Set close_fd to be closed after spawn.
  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
                                        fd_flags | FD_CLOEXEC));
  struct inheritance inherit = {0};
  // spawn is a system call.
  child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron());
  // Restores the current working directory.
  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));

# else   // GTEST_OS_QNX
# if GTEST_OS_LINUX
  // When a SIGPROF signal is received while fork() or clone() are executing,
  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
  // it after the call to fork()/clone() is complete.
  struct sigaction saved_sigprof_action;
  struct sigaction ignore_sigprof_action;
  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
  sigemptyset(&ignore_sigprof_action.sa_mask);
  ignore_sigprof_action.sa_handler = SIG_IGN;
  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
# endif  // GTEST_OS_LINUX

# if GTEST_HAS_CLONE
  const bool use_fork = GTEST_FLAG(death_test_use_fork);

  if (!use_fork) {
    static const bool stack_grows_down = StackGrowsDown();
    const size_t stack_size = getpagesize();
    // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
                             MAP_ANON | MAP_PRIVATE, -1, 0);
    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);

    // Maximum stack alignment in bytes:  For a downward-growing stack, this
    // amount is subtracted from size of the stack space to get an address
    // that is within the stack space and is aligned on all systems we care
    // about.  As far as I know there is no ABI with stack alignment greater
    // than 64.  We assume stack and stack_size already have alignment of
    // kMaxStackAlignment.
    const size_t kMaxStackAlignment = 64;
    void* const stack_top =
        static_cast<char*>(stack) +
        (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
    GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment &&
        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);

    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);

    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
  }
# else
  const bool use_fork = true;
# endif  // GTEST_HAS_CLONE

  if (use_fork && (child_pid = fork()) == 0) {
    ExecDeathTestChildMain(&args);
    _exit(0);
  }
# endif  // GTEST_OS_QNX
# if GTEST_OS_LINUX
  // Re-enables SIGPROF delivery now that fork()/clone() has completed.
  GTEST_DEATH_TEST_CHECK_SYSCALL_(
      sigaction(SIGPROF, &saved_sigprof_action, NULL));
# endif  // GTEST_OS_LINUX

  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
  return child_pid;
}

// The AssumeRole process for a fork-and-exec death test.  It re-executes the
// main program from the beginning, setting the --gtest_filter
// and --gtest_internal_run_death_test flags to cause only the current
// death test to be re-run.
DeathTest::TestRole ExecDeathTest::AssumeRole() {
  const UnitTestImpl* const impl = GetUnitTestImpl();
  const InternalRunDeathTestFlag* const flag =
      impl->internal_run_death_test_flag();
  const TestInfo* const info = impl->current_test_info();
  const int death_test_index = info->result()->death_test_count();

  if (flag != NULL) {
    // We are already in the child (re-executed) process: just run the test.
    set_write_fd(flag->write_fd());
    return EXECUTE_TEST;
  }

  int pipe_fd[2];
  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
  // Clear the close-on-exec flag on the write end of the pipe, lest
  // it be closed when the child process does an exec:
  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);

  const std::string filter_flag =
      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
      + info->test_case_name() + "." + info->name();
  // Encodes file|line|index|write_fd so the child knows which single death
  // test to run and where to report its outcome.
  const std::string internal_flag =
      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
      + file_ + "|" + StreamableToString(line_) + "|"
      + StreamableToString(death_test_index) + "|"
      + StreamableToString(pipe_fd[1]);
  Arguments args;
  args.AddArguments(GetArgvsForDeathTestChildProcess());
  args.AddArgument(filter_flag.c_str());
  args.AddArgument(internal_flag.c_str());

  DeathTest::set_last_death_test_message("");

  CaptureStderr();
  // See the comment in NoExecDeathTest::AssumeRole for why the next line
  // is necessary.
  FlushInfoLog();

  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
  set_child_pid(child_pid);
  set_read_fd(pipe_fd[0]);
  set_spawned(true);
  return OVERSEE_TEST;
}

# endif  // !GTEST_OS_WINDOWS

// Creates a concrete DeathTest-derived class that depends on the
// --gtest_death_test_style flag, and sets the pointer pointed to
// by the "test" argument to its address.  If the test should be
// skipped, sets that pointer to NULL.  Returns true, unless the
// flag is set to an invalid value.
bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
                                     const char* file, int line,
                                     DeathTest** test) {
  UnitTestImpl* const impl = GetUnitTestImpl();
  const InternalRunDeathTestFlag* const flag =
      impl->internal_run_death_test_flag();
  // Each death test within a test body gets a sequential index; the child
  // process uses it to identify which death test to actually execute.
  const int death_test_index = impl->current_test_info()
      ->increment_death_test_count();

  if (flag != NULL) {
    // We are inside a death-test child process.
    if (death_test_index > flag->index()) {
      DeathTest::set_last_death_test_message(
          "Death test count (" + StreamableToString(death_test_index)
          + ") somehow exceeded expected maximum ("
          + StreamableToString(flag->index()) + ")");
      return false;
    }

    // Any death test other than the one selected by the flag is skipped
    // in the child process (signaled by a NULL *test).
    if (!(flag->file() == file && flag->line() == line &&
          flag->index() == death_test_index)) {
      *test = NULL;
      return true;
    }
  }

# if GTEST_OS_WINDOWS

  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
      GTEST_FLAG(death_test_style) == "fast") {
    *test = new WindowsDeathTest(statement, regex, file, line);
  }

# else

  if (GTEST_FLAG(death_test_style) == "threadsafe") {
    *test = new ExecDeathTest(statement, regex, file, line);
  } else if (GTEST_FLAG(death_test_style) == "fast") {
    *test = new NoExecDeathTest(statement, regex);
  }

# endif  // GTEST_OS_WINDOWS

  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
    DeathTest::set_last_death_test_message(
        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
        + "\" encountered");
    return false;
  }

  return true;
}

// Splits a given string on a given delimiter, populating a given
// vector with the fields.  GTEST_HAS_DEATH_TEST implies that we have
// ::std::string, so we can use it here.
static void SplitString(const ::std::string& str, char delimiter,
                        ::std::vector< ::std::string>* dest) {
  // Builds into a local vector and swaps at the end so *dest is only
  // modified once parsing has fully succeeded.
  ::std::vector< ::std::string> parsed;
  ::std::string::size_type pos = 0;
  while (::testing::internal::AlwaysTrue()) {
    const ::std::string::size_type colon = str.find(delimiter, pos);
    if (colon == ::std::string::npos) {
      parsed.push_back(str.substr(pos));
      break;
    } else {
      parsed.push_back(str.substr(pos, colon - pos));
      pos = colon + 1;
    }
  }
  dest->swap(parsed);
}

# if GTEST_OS_WINDOWS
// Recreates the pipe and event handles from the provided parameters,
// signals the event, and returns a file descriptor wrapped around the pipe
// handle. This function is called in the child process only.
int GetStatusFileDescriptor(unsigned int parent_process_id,
                            size_t write_handle_as_size_t,
                            size_t event_handle_as_size_t) {
  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
                                                 FALSE,  // Non-inheritable.
                                                 parent_process_id));
  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
    DeathTestAbort("Unable to open parent process " +
                   StreamableToString(parent_process_id));
  }

  // TODO(vladl@google.com): Replace the following check with a
  // compile-time assertion when available.
  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));

  const HANDLE write_handle =
      reinterpret_cast<HANDLE>(write_handle_as_size_t);
  HANDLE dup_write_handle;

  // The newly initialized handle is accessible only in the parent
  // process. To obtain one accessible within the child, we need to use
  // DuplicateHandle.
  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
                         ::GetCurrentProcess(), &dup_write_handle,
                         0x0,    // Requested privileges ignored since
                                 // DUPLICATE_SAME_ACCESS is used.
                         FALSE,  // Request non-inheritable handler.
                         DUPLICATE_SAME_ACCESS)) {
    DeathTestAbort("Unable to duplicate the pipe handle " +
                   StreamableToString(write_handle_as_size_t) +
                   " from the parent process " +
                   StreamableToString(parent_process_id));
  }

  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
  HANDLE dup_event_handle;

  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
                         ::GetCurrentProcess(), &dup_event_handle,
                         0x0,
                         FALSE,
                         DUPLICATE_SAME_ACCESS)) {
    DeathTestAbort("Unable to duplicate the event handle " +
                   StreamableToString(event_handle_as_size_t) +
                   " from the parent process " +
                   StreamableToString(parent_process_id));
  }

  // Wraps the Win32 pipe handle into a CRT file descriptor.
  const int write_fd =
      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
  if (write_fd == -1) {
    DeathTestAbort("Unable to convert pipe handle " +
                   StreamableToString(write_handle_as_size_t) +
                   " to a file descriptor");
  }

  // Signals the parent that the write end of the pipe has been acquired
  // so the parent can release its own write end.
  ::SetEvent(dup_event_handle);

  return write_fd;
}
# endif  // GTEST_OS_WINDOWS

// Returns a newly created InternalRunDeathTestFlag object with fields
// initialized from the GTEST_FLAG(internal_run_death_test) flag if
// the flag is specified; otherwise returns NULL.
InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;

  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
  // can use it here.
  int line = -1;
  int index = -1;
  ::std::vector< ::std::string> fields;
  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
  int write_fd = -1;

# if GTEST_OS_WINDOWS

  // Windows encoding: file|line|index|parent_pid|write_handle|event_handle.
  unsigned int parent_process_id = 0;
  size_t write_handle_as_size_t = 0;
  size_t event_handle_as_size_t = 0;

  if (fields.size() != 6
      || !ParseNaturalNumber(fields[1], &line)
      || !ParseNaturalNumber(fields[2], &index)
      || !ParseNaturalNumber(fields[3], &parent_process_id)
      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
                   GTEST_FLAG(internal_run_death_test));
  }
  write_fd = GetStatusFileDescriptor(parent_process_id,
                                     write_handle_as_size_t,
                                     event_handle_as_size_t);
# else

  // POSIX encoding: file|line|index|write_fd.
  if (fields.size() != 4
      || !ParseNaturalNumber(fields[1], &line)
      || !ParseNaturalNumber(fields[2], &index)
      || !ParseNaturalNumber(fields[3], &write_fd)) {
    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
                   GTEST_FLAG(internal_run_death_test));
  }

# endif  // GTEST_OS_WINDOWS

  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
}

}  // namespace internal

#endif  // GTEST_HAS_DEATH_TEST

}  // namespace testing
// Copyright 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Authors: keith.ray@gmail.com (Keith Ray)


#include <stdlib.h>

#if GTEST_OS_WINDOWS_MOBILE
# include <windows.h>
#elif GTEST_OS_WINDOWS
# include <direct.h>
# include <io.h>
#elif GTEST_OS_SYMBIAN
// Symbian OpenC has PATH_MAX in sys/syslimits.h
# include <sys/syslimits.h>
#else
# include <limits.h>
# include <climits>  // Some Linux distributions define PATH_MAX here.
#endif  // GTEST_OS_WINDOWS_MOBILE

#if GTEST_OS_WINDOWS
# define GTEST_PATH_MAX_ _MAX_PATH
#elif defined(PATH_MAX)
# define GTEST_PATH_MAX_ PATH_MAX
#elif defined(_XOPEN_PATH_MAX)
# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
#else
# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
#endif  // GTEST_OS_WINDOWS


namespace testing {
namespace internal {

#if GTEST_OS_WINDOWS
// On Windows, '\\' is the standard path separator, but many tools and the
// Windows API also accept '/' as an alternate path separator. Unless otherwise
// noted, a file path can contain either kind of path separators, or a mixture
// of them.
const char kPathSeparator = '\\';
const char kAlternatePathSeparator = '/';
//const char kPathSeparatorString[] = "\\";
const char kAlternatePathSeparatorString[] = "/";
# if GTEST_OS_WINDOWS_MOBILE
// Windows CE doesn't have a current directory. You should not use
// the current directory in tests on Windows CE, but this at least
// provides a reasonable fallback.
const char kCurrentDirectoryString[] = "\\";
// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
const DWORD kInvalidFileAttributes = 0xffffffff;
# else
const char kCurrentDirectoryString[] = ".\\";
# endif  // GTEST_OS_WINDOWS_MOBILE
#else
const char kPathSeparator = '/';
//const char kPathSeparatorString[] = "/";
const char kCurrentDirectoryString[] = "./";
#endif  // GTEST_OS_WINDOWS

// Returns whether the given character is a valid path separator.
static bool IsPathSeparator(char c) {
#if GTEST_HAS_ALT_PATH_SEP_
  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
#else
  return c == kPathSeparator;
#endif
}

// Returns the current working directory, or "" if unsuccessful.
FilePath FilePath::GetCurrentDir() {
#if GTEST_OS_WINDOWS_MOBILE
  // Windows CE doesn't have a current directory, so we just return
  // something reasonable.
  return FilePath(kCurrentDirectoryString);
#elif GTEST_OS_WINDOWS
  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
  return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
#else
  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
  return FilePath(getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
#endif  // GTEST_OS_WINDOWS_MOBILE
}

// Returns a copy of the FilePath with the case-insensitive extension removed.
// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
// FilePath("dir/file"). If a case-insensitive extension is not
// found, returns a copy of the original FilePath.
+FilePath FilePath::RemoveExtension(const char* extension) const { + const std::string dot_extension = std::string(".") + extension; + if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { + return FilePath(pathname_.substr( + 0, pathname_.length() - dot_extension.length())); + } + return *this; +} + +// Returns a pointer to the last occurrence of a valid path separator in +// the FilePath. On Windows, for example, both '/' and '\' are valid path +// separators. Returns NULL if no path separator was found. +const char* FilePath::FindLastPathSeparator() const { + const char* const last_sep = strrchr(c_str(), kPathSeparator); +#if GTEST_HAS_ALT_PATH_SEP_ + const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); + // Comparing two pointers of which only one is NULL is undefined. + if (last_alt_sep != NULL && + (last_sep == NULL || last_alt_sep > last_sep)) { + return last_alt_sep; + } +#endif + return last_sep; +} + +// Returns a copy of the FilePath with the directory part removed. +// Example: FilePath("path/to/file").RemoveDirectoryName() returns +// FilePath("file"). If there is no directory part ("just_a_file"), it returns +// the FilePath unmodified. If there is no file part ("just_a_dir/") it +// returns an empty FilePath (""). +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveDirectoryName() const { + const char* const last_sep = FindLastPathSeparator(); + return last_sep ? FilePath(last_sep + 1) : *this; +} + +// RemoveFileName returns the directory path with the filename removed. +// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". +// If the FilePath is "a_file" or "/a_file", RemoveFileName returns +// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does +// not have a file, like "just/a/dir/", it returns the FilePath unmodified. +// On Windows platform, '\' is the path separator, otherwise it is '/'. 
FilePath FilePath::RemoveFileName() const {
  const char* const last_sep = FindLastPathSeparator();
  std::string dir;
  if (last_sep) {
    // Keep everything up to and including the last separator.
    dir = std::string(c_str(), last_sep + 1 - c_str());
  } else {
    // No directory part: fall back to "./" (or ".\\" on Windows).
    dir = kCurrentDirectoryString;
  }
  return FilePath(dir);
}

// Helper functions for naming files in a directory for xml output.

// Given directory = "dir", base_name = "test", number = 0,
// extension = "xml", returns "dir/test.xml". If number is greater
// than zero (e.g., 12), returns "dir/test_12.xml".
// On Windows platform, uses \ as the separator rather than /.
FilePath FilePath::MakeFileName(const FilePath& directory,
                                const FilePath& base_name,
                                int number,
                                const char* extension) {
  std::string file;
  if (number == 0) {
    file = base_name.string() + "." + extension;
  } else {
    // Non-zero numbers get an "_<number>" suffix before the extension.
    file = base_name.string() + "_" + StreamableToString(number)
        + "." + extension;
  }
  return ConcatPaths(directory, FilePath(file));
}

// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
// On Windows, uses \ as the separator rather than /.
FilePath FilePath::ConcatPaths(const FilePath& directory,
                               const FilePath& relative_path) {
  if (directory.IsEmpty())
    return relative_path;
  // Normalize to exactly one separator between the two parts.
  const FilePath dir(directory.RemoveTrailingPathSeparator());
  return FilePath(dir.string() + kPathSeparator + relative_path.string());
}

// Returns true if pathname describes something findable in the file-system,
// either a file, directory, or whatever.
bool FilePath::FileOrDirectoryExists() const {
#if GTEST_OS_WINDOWS_MOBILE
  // Windows CE has no stat(); query the file attributes instead.
  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
  const DWORD attributes = GetFileAttributes(unicode);
  delete [] unicode;
  return attributes != kInvalidFileAttributes;
#else
  posix::StatStruct file_stat;
  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
#endif  // GTEST_OS_WINDOWS_MOBILE
}

// Returns true if pathname describes a directory in the file-system
// that exists.
bool FilePath::DirectoryExists() const {
  bool result = false;
#if GTEST_OS_WINDOWS
  // Don't strip off trailing separator if path is a root directory on
  // Windows (like "C:\\").
  const FilePath& path(IsRootDirectory() ? *this :
                                           RemoveTrailingPathSeparator());
#else
  const FilePath& path(*this);
#endif

#if GTEST_OS_WINDOWS_MOBILE
  // Windows CE has no stat(); check the directory attribute bit instead.
  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
  const DWORD attributes = GetFileAttributes(unicode);
  delete [] unicode;
  if ((attributes != kInvalidFileAttributes) &&
      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
    result = true;
  }
#else
  posix::StatStruct file_stat;
  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
      posix::IsDir(file_stat);
#endif  // GTEST_OS_WINDOWS_MOBILE

  return result;
}

// Returns true if pathname describes a root directory. (Windows has one
// root directory per disk drive.)
bool FilePath::IsRootDirectory() const {
#if GTEST_OS_WINDOWS
  // TODO(wan@google.com): on Windows a network share like
  // \\server\share can be a root directory, although it cannot be the
  // current directory.  Handle this properly.
  return pathname_.length() == 3 && IsAbsolutePath();
#else
  return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
#endif
}

// Returns true if pathname describes an absolute path.
bool FilePath::IsAbsolutePath() const {
  const char* const name = pathname_.c_str();
#if GTEST_OS_WINDOWS
  // Windows absolute paths look like "C:\..." or "c:/...": a drive
  // letter, a colon, then a separator.
  return pathname_.length() >= 3 &&
     ((name[0] >= 'a' && name[0] <= 'z') ||
      (name[0] >= 'A' && name[0] <= 'Z')) &&
     name[1] == ':' &&
     IsPathSeparator(name[2]);
#else
  return IsPathSeparator(name[0]);
#endif
}

// Returns a pathname for a file that does not currently exist. The pathname
// will be directory/base_name.extension or
// directory/base_name_<number>.extension if directory/base_name.extension
// already exists. The number will be incremented until a pathname is found
// that does not already exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. +// There could be a race condition if two or more processes are calling this +// function at the same time -- they could both pick the same filename. +FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension) { + FilePath full_pathname; + int number = 0; + do { + full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); + } while (full_pathname.FileOrDirectoryExists()); + return full_pathname; +} + +// Returns true if FilePath ends with a path separator, which indicates that +// it is intended to represent a directory. Returns false otherwise. +// This does NOT check that a directory (or file) actually exists. +bool FilePath::IsDirectory() const { + return !pathname_.empty() && + IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); +} + +// Create directories so that path exists. Returns true if successful or if +// the directories already exist; returns false if unable to create directories +// for any reason. +bool FilePath::CreateDirectoriesRecursively() const { + if (!this->IsDirectory()) { + return false; + } + + if (pathname_.length() == 0 || this->DirectoryExists()) { + return true; + } + + const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); + return parent.CreateDirectoriesRecursively() && this->CreateFolder(); +} + +// Create the directory so that path exists. Returns true if successful or +// if the directory already exists; returns false if unable to create the +// directory for any reason, including if the parent directory does not +// exist. Not named "CreateDirectory" because that's a macro on Windows. +bool FilePath::CreateFolder() const { +#if GTEST_OS_WINDOWS_MOBILE + FilePath removed_sep(this->RemoveTrailingPathSeparator()); + LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); + int result = CreateDirectory(unicode, NULL) ? 
0 : -1; + delete [] unicode; +#elif GTEST_OS_WINDOWS + int result = _mkdir(pathname_.c_str()); +#else + int result = mkdir(pathname_.c_str(), 0777); +#endif // GTEST_OS_WINDOWS_MOBILE + + if (result == -1) { + return this->DirectoryExists(); // An error is OK if the directory exists. + } + return true; // No error. +} + +// If input name has a trailing separator character, remove it and return the +// name, otherwise return the name string unmodified. +// On Windows platform, uses \ as the separator, other platforms use /. +FilePath FilePath::RemoveTrailingPathSeparator() const { + return IsDirectory() + ? FilePath(pathname_.substr(0, pathname_.length() - 1)) + : *this; +} + +// Removes any redundant separators that might be in the pathname. +// For example, "bar///foo" becomes "bar/foo". Does not eliminate other +// redundancies that might be in a pathname involving "." or "..". +// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). +void FilePath::Normalize() { + if (pathname_.c_str() == NULL) { + pathname_ = ""; + return; + } + const char* src = pathname_.c_str(); + char* const dest = new char[pathname_.length() + 1]; + char* dest_ptr = dest; + memset(dest_ptr, 0, pathname_.length() + 1); + + while (*src != '\0') { + *dest_ptr = *src; + if (!IsPathSeparator(*src)) { + src++; + } else { +#if GTEST_HAS_ALT_PATH_SEP_ + if (*dest_ptr == kAlternatePathSeparator) { + *dest_ptr = kPathSeparator; + } +#endif + while (IsPathSeparator(*src)) + src++; + } + dest_ptr++; + } + *dest_ptr = '\0'; + pathname_ = dest; + delete[] dest; +} + +} // namespace internal +} // namespace testing +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + + +#include <limits.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#if GTEST_OS_WINDOWS_MOBILE +# include <windows.h> // For TerminateProcess() +#elif GTEST_OS_WINDOWS +# include <io.h> +# include <sys/stat.h> +#else +# include <unistd.h> +#endif // GTEST_OS_WINDOWS_MOBILE + +#if GTEST_OS_MAC +# include <mach/mach_init.h> +# include <mach/task.h> +# include <mach/vm_map.h> +#endif // GTEST_OS_MAC + +#if GTEST_OS_QNX +# include <devctl.h> +# include <sys/procfs.h> +#endif // GTEST_OS_QNX + + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. 
This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +#undef GTEST_IMPLEMENTATION_ + +namespace testing { +namespace internal { + +#if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC and C++Builder do not provide a definition of STDERR_FILENO. +const int kStdOutFileno = 1; +const int kStdErrFileno = 2; +#else +const int kStdOutFileno = STDOUT_FILENO; +const int kStdErrFileno = STDERR_FILENO; +#endif // _MSC_VER + +#if GTEST_OS_MAC + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + const task_t task = mach_task_self(); + mach_msg_type_number_t thread_count; + thread_act_array_t thread_list; + const kern_return_t status = task_threads(task, &thread_list, &thread_count); + if (status == KERN_SUCCESS) { + // task_threads allocates resources in thread_list and we need to free them + // to avoid leaks. + vm_deallocate(task, + reinterpret_cast<vm_address_t>(thread_list), + sizeof(thread_t) * thread_count); + return static_cast<size_t>(thread_count); + } else { + return 0; + } +} + +#elif GTEST_OS_QNX + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + const int fd = open("/proc/self/as", O_RDONLY); + if (fd < 0) { + return 0; + } + procfs_info process_info; + const int status = + devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL); + close(fd); + if (status == EOK) { + return static_cast<size_t>(process_info.num_threads); + } else { + return 0; + } +} + +#else + +size_t GetThreadCount() { + // There's no portable way to detect the number of threads, so we just + // return 0 to indicate that we cannot detect it. + return 0; +} + +#endif // GTEST_OS_MAC + +#if GTEST_USES_POSIX_RE + +// Implements RE. Currently only needed for death tests. 
+
+RE::~RE() {
+  if (is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined. Since the regex's are essentially
+    // the same, one cannot be valid (or invalid) without the other
+    // being so too.
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  free(const_cast<char*>(pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char* const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // We want to call regcomp(&partial_regex_, ...) even if the
+  // previous expression returns false. Otherwise partial_regex_ may
+  // not be properly initialized and may cause trouble when it's
+  // freed.
+  //
+  // Some implementation of POSIX regex (e.g. on at least some
+  // versions of Cygwin) doesn't accept the empty string as a valid
+  // regex. We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char* const partial_regex = (*regex == '\0') ? 
"()" : regex; + is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; + } + EXPECT_TRUE(is_valid_) + << "Regular expression \"" << regex + << "\" is not a valid POSIX Extended regular expression."; + + delete[] full_pattern; +} + +#elif GTEST_USES_SIMPLE_RE + +// Returns true iff ch appears anywhere in str (excluding the +// terminating '\0' character). +bool IsInSet(char ch, const char* str) { + return ch != '\0' && strchr(str, ch) != NULL; +} + +// Returns true iff ch belongs to the given classification. Unlike +// similar functions in <ctype.h>, these aren't affected by the +// current locale. +bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } +bool IsAsciiPunct(char ch) { + return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); +} +bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } +bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } +bool IsAsciiWordChar(char ch) { + return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || + ('0' <= ch && ch <= '9') || ch == '_'; +} + +// Returns true iff "\\c" is a supported escape sequence. +bool IsValidEscape(char c) { + return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); +} + +// Returns true iff the given atom (specified by escaped and pattern) +// matches ch. The result is undefined if the atom is invalid. +bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { + if (escaped) { // "\\p" where p is pattern_char. + switch (pattern_char) { + case 'd': return IsAsciiDigit(ch); + case 'D': return !IsAsciiDigit(ch); + case 'f': return ch == '\f'; + case 'n': return ch == '\n'; + case 'r': return ch == '\r'; + case 's': return IsAsciiWhiteSpace(ch); + case 'S': return !IsAsciiWhiteSpace(ch); + case 't': return ch == '\t'; + case 'v': return ch == '\v'; + case 'w': return IsAsciiWordChar(ch); + case 'W': return !IsAsciiWordChar(ch); + } + return IsAsciiPunct(pattern_char) && pattern_char == ch; + } + + return (pattern_char == '.' 
&& ch != '\n') || pattern_char == ch; +} + +// Helper function used by ValidateRegex() to format error messages. +std::string FormatRegexSyntaxError(const char* regex, int index) { + return (Message() << "Syntax error at index " << index + << " in simple regular expression \"" << regex << "\": ").GetString(); +} + +// Generates non-fatal failures and returns false if regex is invalid; +// otherwise returns true. +bool ValidateRegex(const char* regex) { + if (regex == NULL) { + // TODO(wan@google.com): fix the source file location in the + // assertion failures to match where the regex is used in user + // code. + ADD_FAILURE() << "NULL is not a valid simple regular expression."; + return false; + } + + bool is_valid = true; + + // True iff ?, *, or + can follow the previous atom. + bool prev_repeatable = false; + for (int i = 0; regex[i]; i++) { + if (regex[i] == '\\') { // An escape sequence + i++; + if (regex[i] == '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "'\\' cannot appear at the end."; + return false; + } + + if (!IsValidEscape(regex[i])) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "invalid escape sequence \"\\" << regex[i] << "\"."; + is_valid = false; + } + prev_repeatable = true; + } else { // Not an escape sequence. 
+ const char ch = regex[i]; + + if (ch == '^' && i > 0) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'^' can only appear at the beginning."; + is_valid = false; + } else if (ch == '$' && regex[i + 1] != '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'$' can only appear at the end."; + is_valid = false; + } else if (IsInSet(ch, "()[]{}|")) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'" << ch << "' is unsupported."; + is_valid = false; + } else if (IsRepeat(ch) && !prev_repeatable) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'" << ch << "' can only follow a repeatable token."; + is_valid = false; + } + + prev_repeatable = !IsInSet(ch, "^$?*+"); + } + } + + return is_valid; +} + +// Matches a repeated regex atom followed by a valid simple regular +// expression. The regex atom is defined as c if escaped is false, +// or \c otherwise. repeat is the repetition meta character (?, *, +// or +). The behavior is undefined if str contains too many +// characters to be indexable by size_t, in which case the test will +// probably time out anyway. We are fine with this limitation as +// std::string has it too. +bool MatchRepetitionAndRegexAtHead( + bool escaped, char c, char repeat, const char* regex, + const char* str) { + const size_t min_count = (repeat == '+') ? 1 : 0; + const size_t max_count = (repeat == '?') ? 1 : + static_cast<size_t>(-1) - 1; + // We cannot call numeric_limits::max() as it conflicts with the + // max() macro on Windows. + + for (size_t i = 0; i <= max_count; ++i) { + // We know that the atom matches each of the first i characters in str. + if (i >= min_count && MatchRegexAtHead(regex, str + i)) { + // We have enough matches at the head, and the tail matches too. + // Since we only care about *whether* the pattern matches str + // (as opposed to *how* it matches), there is no need to find a + // greedy match. 
+
+      return true;
+    }
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
+      return false;
+  }
+  return false;
+}
+
+// Returns true iff regex matches a prefix of str. regex must be a
+// valid simple regular expression and not start with "^", or the
+// result is undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+  if (*regex == '\0')  // An empty regex matches a prefix of anything.
+    return true;
+
+  // "$" only matches the end of a string. Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$')
+    return *str == '\0';
+
+  // Is the first thing in regex an escape sequence?
+  const bool escaped = *regex == '\\';
+  if (escaped)
+    ++regex;
+  if (IsRepeat(regex[1])) {
+    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+    // here's an indirect recursion. It terminates as the regex gets
+    // shorter in each recursion.
+    return MatchRepetitionAndRegexAtHead(
+        escaped, regex[0], regex[1], regex + 2, str);
+  } else {
+    // regex isn't empty, isn't "$", and doesn't start with a
+    // repetition. We match the first atom of regex with the first
+    // character of str and recurse.
+    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
+        MatchRegexAtHead(regex + 1, str + 1);
+  }
+}
+
+// Returns true iff regex matches any substring of str. regex must be
+// a valid simple regular expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally. In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's much faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+  if (regex == NULL || str == NULL)
+    return false;
+
+  if (*regex == '^')
+    return MatchRegexAtHead(regex + 1, str);
+
+  // A successful match can be anywhere in str. 
+ do { + if (MatchRegexAtHead(regex, str)) + return true; + } while (*str++ != '\0'); + return false; +} + +// Implements the RE class. + +RE::~RE() { + free(const_cast<char*>(pattern_)); + free(const_cast<char*>(full_pattern_)); +} + +// Returns true iff regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str); +} + +// Returns true iff regular expression re matches a substring of str +// (including str itself). +bool RE::PartialMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str); +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = full_pattern_ = NULL; + if (regex != NULL) { + pattern_ = posix::StrDup(regex); + } + + is_valid_ = ValidateRegex(regex); + if (!is_valid_) { + // No need to calculate the full pattern when the regex is invalid. + return; + } + + const size_t len = strlen(regex); + // Reserves enough bytes to hold the regular expression used for a + // full match: we need space to prepend a '^', append a '$', and + // terminate the string with '\0'. + char* buffer = static_cast<char*>(malloc(len + 3)); + full_pattern_ = buffer; + + if (*regex != '^') + *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. + + // We don't use snprintf or strncpy, as they trigger a warning when + // compiled with VC++ 8.0. + memcpy(buffer, regex, len); + buffer += len; + + if (len == 0 || regex[len - 1] != '$') + *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. + + *buffer = '\0'; +} + +#endif // GTEST_USES_POSIX_RE + +const char kUnknownFile[] = "unknown file"; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { + const std::string file_name(file == NULL ? 
kUnknownFile : file); + + if (line < 0) { + return file_name + ":"; + } +#ifdef _MSC_VER + return file_name + "(" + StreamableToString(line) + "):"; +#else + return file_name + ":" + StreamableToString(line) + ":"; +#endif // _MSC_VER +} + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +// Note that FormatCompilerIndependentFileLocation() does NOT append colon +// to the file location it produces, unlike FormatFileLocation(). +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( + const char* file, int line) { + const std::string file_name(file == NULL ? kUnknownFile : file); + + if (line < 0) + return file_name; + else + return file_name + ":" + StreamableToString(line); +} + + +GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) + : severity_(severity) { + const char* const marker = + severity == GTEST_INFO ? "[ INFO ]" : + severity == GTEST_WARNING ? "[WARNING]" : + severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; + GetStream() << ::std::endl << marker << " " + << FormatFileLocation(file, line).c_str() << ": "; +} + +// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. +GTestLog::~GTestLog() { + GetStream() << ::std::endl; + if (severity_ == GTEST_FATAL) { + fflush(stderr); + posix::Abort(); + } +} +// Disable Microsoft deprecation warnings for POSIX functions called from +// this class (creat, dup, dup2, and close) +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable: 4996) +#endif // _MSC_VER + +#if GTEST_HAS_STREAM_REDIRECTION + +// Object that captures an output stream (stdout/stderr). +class CapturedStream { + public: + // The ctor redirects the stream to a temporary file. 
+ explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { +# if GTEST_OS_WINDOWS + char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT + char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT + + ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); + const UINT success = ::GetTempFileNameA(temp_dir_path, + "gtest_redir", + 0, // Generate unique file name. + temp_file_path); + GTEST_CHECK_(success != 0) + << "Unable to create a temporary file in " << temp_dir_path; + const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); + GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " + << temp_file_path; + filename_ = temp_file_path; +# else + // There's no guarantee that a test has write access to the current + // directory, so we create the temporary file in the /tmp directory + // instead. We use /tmp on most systems, and /sdcard on Android. + // That's because Android doesn't have /tmp. +# if GTEST_OS_LINUX_ANDROID + // Note: Android applications are expected to call the framework's + // Context.getExternalStorageDirectory() method through JNI to get + // the location of the world-writable SD Card directory. However, + // this requires a Context handle, which cannot be retrieved + // globally from native code. Doing so also precludes running the + // code as part of a regular standalone executable, which doesn't + // run in a Dalvik process (e.g. when running it through 'adb shell'). + // + // The location /sdcard is directly accessible from native code + // and is the only location (unofficially) supported by the Android + // team. It's generally a symlink to the real SD Card mount point + // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or + // other OEM-customized locations. Never rely on these, and always + // use /sdcard. 
+ char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX"; +# else + char name_template[] = "/tmp/captured_stream.XXXXXX"; +# endif // GTEST_OS_LINUX_ANDROID + const int captured_fd = mkstemp(name_template); + filename_ = name_template; +# endif // GTEST_OS_WINDOWS + fflush(NULL); + dup2(captured_fd, fd_); + close(captured_fd); + } + + ~CapturedStream() { + remove(filename_.c_str()); + } + + std::string GetCapturedString() { + if (uncaptured_fd_ != -1) { + // Restores the original stream. + fflush(NULL); + dup2(uncaptured_fd_, fd_); + close(uncaptured_fd_); + uncaptured_fd_ = -1; + } + + FILE* const file = posix::FOpen(filename_.c_str(), "r"); + const std::string content = ReadEntireFile(file); + posix::FClose(file); + return content; + } + + private: + // Reads the entire content of a file as an std::string. + static std::string ReadEntireFile(FILE* file); + + // Returns the size (in bytes) of a file. + static size_t GetFileSize(FILE* file); + + const int fd_; // A stream to capture. + int uncaptured_fd_; + // Name of the temporary file holding the stderr output. + ::std::string filename_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); +}; + +// Returns the size (in bytes) of a file. +size_t CapturedStream::GetFileSize(FILE* file) { + fseek(file, 0, SEEK_END); + return static_cast<size_t>(ftell(file)); +} + +// Reads the entire content of a file as a string. +std::string CapturedStream::ReadEntireFile(FILE* file) { + const size_t file_size = GetFileSize(file); + char* const buffer = new char[file_size]; + + size_t bytes_last_read = 0; // # of bytes read in the last fread() + size_t bytes_read = 0; // # of bytes read so far + + fseek(file, 0, SEEK_SET); + + // Keeps reading the file until we cannot read further or the + // pre-determined file size is reached. 
+ do { + bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); + bytes_read += bytes_last_read; + } while (bytes_last_read > 0 && bytes_read < file_size); + + const std::string content(buffer, bytes_read); + delete[] buffer; + + return content; +} + +# ifdef _MSC_VER +# pragma warning(pop) +# endif // _MSC_VER + +static CapturedStream* g_captured_stderr = NULL; +static CapturedStream* g_captured_stdout = NULL; + +// Starts capturing an output stream (stdout/stderr). +void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { + if (*stream != NULL) { + GTEST_LOG_(FATAL) << "Only one " << stream_name + << " capturer can exist at a time."; + } + *stream = new CapturedStream(fd); +} + +// Stops capturing the output stream and returns the captured string. +std::string GetCapturedStream(CapturedStream** captured_stream) { + const std::string content = (*captured_stream)->GetCapturedString(); + + delete *captured_stream; + *captured_stream = NULL; + + return content; +} + +// Starts capturing stdout. +void CaptureStdout() { + CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); +} + +// Starts capturing stderr. +void CaptureStderr() { + CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); +} + +// Stops capturing stdout and returns the captured string. +std::string GetCapturedStdout() { + return GetCapturedStream(&g_captured_stdout); +} + +// Stops capturing stderr and returns the captured string. +std::string GetCapturedStderr() { + return GetCapturedStream(&g_captured_stderr); +} + +#endif // GTEST_HAS_STREAM_REDIRECTION + +#if GTEST_HAS_DEATH_TEST + +// A copy of all command line arguments. Set by InitGoogleTest(). +::std::vector<testing::internal::string> g_argvs; + +static const ::std::vector<testing::internal::string>* g_injected_test_argvs = + NULL; // Owned. 
+ +void SetInjectableArgvs(const ::std::vector<testing::internal::string>* argvs) { + if (g_injected_test_argvs != argvs) + delete g_injected_test_argvs; + g_injected_test_argvs = argvs; +} + +const ::std::vector<testing::internal::string>& GetInjectableArgvs() { + if (g_injected_test_argvs != NULL) { + return *g_injected_test_argvs; + } + return g_argvs; +} +#endif // GTEST_HAS_DEATH_TEST + +#if GTEST_OS_WINDOWS_MOBILE +namespace posix { +void Abort() { + DebugBreak(); + TerminateProcess(GetCurrentProcess(), 1); +} +} // namespace posix +#endif // GTEST_OS_WINDOWS_MOBILE + +// Returns the name of the environment variable corresponding to the +// given flag. For example, FlagToEnvVar("foo") will return +// "GTEST_FOO" in the open-source version. +static std::string FlagToEnvVar(const char* flag) { + const std::string full_flag = + (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); + + Message env_var; + for (size_t i = 0; i != full_flag.length(); i++) { + env_var << ToUpper(full_flag.c_str()[i]); + } + + return env_var.GetString(); +} + +// Parses 'str' for a 32-bit signed integer. If successful, writes +// the result to *value and returns true; otherwise leaves *value +// unchanged and returns false. +bool ParseInt32(const Message& src_text, const char* str, Int32* value) { + // Parses the environment variable as a decimal integer. + char* end = NULL; + const long long_value = strtol(str, &end, 10); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value \"" << str << "\".\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + // Is the parsed value in the range of an Int32? + const Int32 result = static_cast<Int32>(long_value); + if (long_value == LONG_MAX || long_value == LONG_MIN || + // The parsed value overflows as a long. 
(strtol() returns + // LONG_MAX or LONG_MIN when the input overflows.) + result != long_value + // The parsed value overflows as an Int32. + ) { + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value " << str << ", which overflows.\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + *value = result; + return true; +} + +// Reads and returns the Boolean environment variable corresponding to +// the given flag; if it's not set, returns default_value. +// +// The value is considered true iff it's not "0". +bool BoolFromGTestEnv(const char* flag, bool default_value) { + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + return string_value == NULL ? + default_value : strcmp(string_value, "0") != 0; +} + +// Reads and returns a 32-bit integer stored in the environment +// variable corresponding to the given flag; if it isn't set or +// doesn't represent a valid 32-bit integer, returns default_value. +Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + if (string_value == NULL) { + // The environment variable is not set. + return default_value; + } + + Int32 result = default_value; + if (!ParseInt32(Message() << "Environment variable " << env_var, + string_value, &result)) { + printf("The default value %s is used.\n", + (Message() << default_value).GetString().c_str()); + fflush(stdout); + return default_value; + } + + return result; +} + +// Reads and returns the string environment variable corresponding to +// the given flag; if it's not set, returns default_value. +const char* StringFromGTestEnv(const char* flag, const char* default_value) { + const std::string env_var = FlagToEnvVar(flag); + const char* const value = posix::GetEnv(env_var.c_str()); + return value == NULL ? 
default_value : value; +} + +} // namespace internal +} // namespace testing +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Author: wan@google.com (Zhanyong Wan) + +// Google Test - The Google C++ Testing Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr); +// +// It uses the << operator when possible, and prints the bytes in the +// object otherwise. A user can override its behavior for a class +// type Foo by defining either operator<<(::std::ostream&, const Foo&) +// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that +// defines Foo. + +#include <ctype.h> +#include <stdio.h> +#include <ostream> // NOLINT +#include <string> + +namespace testing { + +namespace { + +using ::std::ostream; + +// Prints a segment of bytes in the given object. +void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, + size_t count, ostream* os) { + char text[5] = ""; + for (size_t i = 0; i != count; i++) { + const size_t j = start + i; + if (i != 0) { + // Organizes the bytes into groups of 2 for easy parsing by + // human. + if ((j % 2) == 0) + *os << ' '; + else + *os << '-'; + } + GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]); + *os << text; + } +} + +// Prints the bytes in the given value to the given ostream. +void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, + ostream* os) { + // Tells the user how big the object is. + *os << count << "-byte object <"; + + const size_t kThreshold = 132; + const size_t kChunkSize = 64; + // If the object size is bigger than kThreshold, we'll have to omit + // some details by printing only the first and the last kChunkSize + // bytes. + // TODO(wan): let the user control the threshold using a flag. + if (count < kThreshold) { + PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); + } else { + PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); + *os << " ... "; + // Rounds up to 2-byte boundary. 
+ const size_t resume_pos = (count - kChunkSize + 1)/2*2; + PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); + } + *os << ">"; +} + +} // namespace + +namespace internal2 { + +// Delegates to PrintBytesInObjectToImpl() to print the bytes in the +// given object. The delegation simplifies the implementation, which +// uses the << operator and thus is easier done outside of the +// ::testing::internal namespace, which contains a << operator that +// sometimes conflicts with the one in STL. +void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, + ostream* os) { + PrintBytesInObjectToImpl(obj_bytes, count, os); +} + +} // namespace internal2 + +namespace internal { + +// Depending on the value of a char (or wchar_t), we print it in one +// of three formats: +// - as is if it's a printable ASCII (e.g. 'a', '2', ' '), +// - as a hexidecimal escape sequence (e.g. '\x7F'), or +// - as a special escape sequence (e.g. '\r', '\n'). +enum CharFormat { + kAsIs, + kHexEscape, + kSpecialEscape +}; + +// Returns true if c is a printable ASCII character. We test the +// value of c directly instead of calling isprint(), which is buggy on +// Windows Mobile. +inline bool IsPrintableAscii(wchar_t c) { + return 0x20 <= c && c <= 0x7E; +} + +// Prints a wide or narrow char c as a character literal without the +// quotes, escaping it when necessary; returns how c was formatted. +// The template argument UnsignedChar is the unsigned version of Char, +// which is the type of c. 
+template <typename UnsignedChar, typename Char> +static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { + switch (static_cast<wchar_t>(c)) { + case L'\0': + *os << "\\0"; + break; + case L'\'': + *os << "\\'"; + break; + case L'\\': + *os << "\\\\"; + break; + case L'\a': + *os << "\\a"; + break; + case L'\b': + *os << "\\b"; + break; + case L'\f': + *os << "\\f"; + break; + case L'\n': + *os << "\\n"; + break; + case L'\r': + *os << "\\r"; + break; + case L'\t': + *os << "\\t"; + break; + case L'\v': + *os << "\\v"; + break; + default: + if (IsPrintableAscii(c)) { + *os << static_cast<char>(c); + return kAsIs; + } else { + *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c)); + return kHexEscape; + } + } + return kSpecialEscape; +} + +// Prints a wchar_t c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) { + switch (c) { + case L'\'': + *os << "'"; + return kAsIs; + case L'"': + *os << "\\\""; + return kSpecialEscape; + default: + return PrintAsCharLiteralTo<wchar_t>(c, os); + } +} + +// Prints a char c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(char c, ostream* os) { + return PrintAsStringLiteralTo( + static_cast<wchar_t>(static_cast<unsigned char>(c)), os); +} + +// Prints a wide or narrow character c and its code. '\0' is printed +// as "'\\0'", other unprintable characters are also properly escaped +// using the standard C++ escape sequence. The template argument +// UnsignedChar is the unsigned version of Char, which is the type of c. +template <typename UnsignedChar, typename Char> +void PrintCharAndCodeTo(Char c, ostream* os) { + // First, print c as a literal in the most readable form we can find. + *os << ((sizeof(c) > 1) ? 
"L'" : "'"); + const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os); + *os << "'"; + + // To aid user debugging, we also print c's code in decimal, unless + // it's 0 (in which case c was printed as '\\0', making the code + // obvious). + if (c == 0) + return; + *os << " (" << static_cast<int>(c); + + // For more convenience, we print c's code again in hexidecimal, + // unless c was already printed in the form '\x##' or the code is in + // [1, 9]. + if (format == kHexEscape || (1 <= c && c <= 9)) { + // Do nothing. + } else { + *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c)); + } + *os << ")"; +} + +void PrintTo(unsigned char c, ::std::ostream* os) { + PrintCharAndCodeTo<unsigned char>(c, os); +} +void PrintTo(signed char c, ::std::ostream* os) { + PrintCharAndCodeTo<unsigned char>(c, os); +} + +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its code. L'\0' is printed as "L'\\0'". +void PrintTo(wchar_t wc, ostream* os) { + PrintCharAndCodeTo<wchar_t>(wc, os); +} + +// Prints the given array of characters to the ostream. CharType must be either +// char or wchar_t. +// The array starts at begin, the length is len, it may include '\0' characters +// and may not be NUL-terminated. +template <typename CharType> +static void PrintCharsAsStringTo( + const CharType* begin, size_t len, ostream* os) { + const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\""; + *os << kQuoteBegin; + bool is_previous_hex = false; + for (size_t index = 0; index < len; ++index) { + const CharType cur = begin[index]; + if (is_previous_hex && IsXDigit(cur)) { + // Previous character is of '\x..' form and this character can be + // interpreted as another hexadecimal digit in its number. Break string to + // disambiguate. 
+
+      *os << "\" " << kQuoteBegin;
+    }
+    is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
+  }
+  *os << "\"";
+}
+
+// Prints a (const) char/wchar_t array of 'len' elements, starting at address
+// 'begin'. CharType must be either char or wchar_t.
+template <typename CharType>
+static void UniversalPrintCharArray(
+    const CharType* begin, size_t len, ostream* os) {
+  // The code
+  //   const char kFoo[] = "foo";
+  // generates an array of 4, not 3, elements, with the last one being '\0'.
+  //
+  // Therefore when printing a char array, we don't print the last element if
+  // it's '\0', such that the output matches the string literal as it's
+  // written in the source code.
+  if (len > 0 && begin[len - 1] == '\0') {
+    PrintCharsAsStringTo(begin, len - 1, os);
+    return;
+  }
+
+  // If, however, the last element in the array is not '\0', e.g.
+  //    const char kFoo[] = { 'f', 'o', 'o' };
+  // we must print the entire array. We also print a message to indicate
+  // that the array is not NUL-terminated.
+  PrintCharsAsStringTo(begin, len, os);
+  *os << " (no terminating NUL)";
+}
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) wchar_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints the given C string to the ostream.
+void PrintTo(const char* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+  } else {
+    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, strlen(s), os);
+  }
+}
+
+// MSVC compiler can be configured to define wchar_t as a typedef 
Defining an overload for const wchar_t* in that case +// would cause pointers to unsigned shorts be printed as wide strings, +// possibly accessing more memory than intended and causing invalid +// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when +// wchar_t is implemented as a native type. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Prints the given wide C string to the ostream. +void PrintTo(const wchar_t* s, ostream* os) { + if (s == NULL) { + *os << "NULL"; + } else { + *os << ImplicitCast_<const void*>(s) << " pointing to "; + PrintCharsAsStringTo(s, wcslen(s), os); + } +} +#endif // wchar_t is native + +// Prints a ::string object. +#if GTEST_HAS_GLOBAL_STRING +void PrintStringTo(const ::string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_GLOBAL_STRING + +void PrintStringTo(const ::std::string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + +// Prints a ::wstring object. +#if GTEST_HAS_GLOBAL_WSTRING +void PrintWideStringTo(const ::wstring& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +#if GTEST_HAS_STD_WSTRING +void PrintWideStringTo(const ::std::wstring& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_STD_WSTRING + +} // namespace internal + +} // namespace testing +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: mheule@google.com (Markus Heule) +// +// The Google C++ Testing Framework (Google Test) + + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +#undef GTEST_IMPLEMENTATION_ + +namespace testing { + +using internal::GetUnitTestImpl; + +// Gets the summary of the failure message by omitting the stack trace +// in it. +std::string TestPartResult::ExtractSummary(const char* message) { + const char* const stack_trace = strstr(message, internal::kStackTraceMarker); + return stack_trace == NULL ? message : + std::string(message, stack_trace); +} + +// Prints a TestPartResult object. 
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { + return os + << result.file_name() << ":" << result.line_number() << ": " + << (result.type() == TestPartResult::kSuccess ? "Success" : + result.type() == TestPartResult::kFatalFailure ? "Fatal failure" : + "Non-fatal failure") << ":\n" + << result.message() << std::endl; +} + +// Appends a TestPartResult to the array. +void TestPartResultArray::Append(const TestPartResult& result) { + array_.push_back(result); +} + +// Returns the TestPartResult at the given index (0-based). +const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { + if (index < 0 || index >= size()) { + printf("\nInvalid index (%d) into TestPartResultArray.\n", index); + internal::posix::Abort(); + } + + return array_[index]; +} + +// Returns the number of TestPartResult objects in the array. +int TestPartResultArray::size() const { + return static_cast<int>(array_.size()); +} + +namespace internal { + +HasNewFatalFailureHelper::HasNewFatalFailureHelper() + : has_new_fatal_failure_(false), + original_reporter_(GetUnitTestImpl()-> + GetTestPartResultReporterForCurrentThread()) { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); +} + +HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( + original_reporter_); +} + +void HasNewFatalFailureHelper::ReportTestPartResult( + const TestPartResult& result) { + if (result.fatally_failed()) + has_new_fatal_failure_ = true; + original_reporter_->ReportTestPartResult(result); +} + +} // namespace internal + +} // namespace testing +// Copyright 2008 Google Inc. +// All Rights Reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + + +namespace testing { +namespace internal { + +#if GTEST_HAS_TYPED_TEST_P + +// Skips to the first non-space char in str. Returns an empty string if str +// contains only whitespace characters. 
+static const char* SkipSpaces(const char* str) { + while (IsSpace(*str)) + str++; + return str; +} + +// Verifies that registered_tests match the test names in +// defined_test_names_; returns registered_tests if successful, or +// aborts the program otherwise. +const char* TypedTestCasePState::VerifyRegisteredTestNames( + const char* file, int line, const char* registered_tests) { + typedef ::std::set<const char*>::const_iterator DefinedTestIter; + registered_ = true; + + // Skip initial whitespace in registered_tests since some + // preprocessors prefix stringizied literals with whitespace. + registered_tests = SkipSpaces(registered_tests); + + Message errors; + ::std::set<std::string> tests; + for (const char* names = registered_tests; names != NULL; + names = SkipComma(names)) { + const std::string name = GetPrefixUntilComma(names); + if (tests.count(name) != 0) { + errors << "Test " << name << " is listed more than once.\n"; + continue; + } + + bool found = false; + for (DefinedTestIter it = defined_test_names_.begin(); + it != defined_test_names_.end(); + ++it) { + if (name == *it) { + found = true; + break; + } + } + + if (found) { + tests.insert(name); + } else { + errors << "No test named " << name + << " can be found in this test case.\n"; + } + } + + for (DefinedTestIter it = defined_test_names_.begin(); + it != defined_test_names_.end(); + ++it) { + if (tests.count(*it) == 0) { + errors << "You forgot to list test " << *it << ".\n"; + } + } + + const std::string& errors_str = errors.GetString(); + if (errors_str != "") { + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors_str.c_str()); + fflush(stderr); + posix::Abort(); + } + + return registered_tests; +} + +#endif // GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing diff --git a/packages/kokkos/tpls/gtest/gtest/gtest-test-part.h b/packages/kokkos/tpls/gtest/gtest/gtest-test-part.h new file mode 120000 index 
0000000000000000000000000000000000000000..48d39090f1cabfc4a852d54e0e1f186362eeb1f5 --- /dev/null +++ b/packages/kokkos/tpls/gtest/gtest/gtest-test-part.h @@ -0,0 +1 @@ +gtest.h \ No newline at end of file diff --git a/packages/kokkos/tpls/gtest/gtest/gtest.h b/packages/kokkos/tpls/gtest/gtest/gtest.h new file mode 100644 index 0000000000000000000000000000000000000000..f39d0b87c90f73113e44c7dec9b2cdac9088644e --- /dev/null +++ b/packages/kokkos/tpls/gtest/gtest/gtest.h @@ -0,0 +1,20065 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the public API for Google Test. It should be +// included by any test program that uses Google Test. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! +// +// Acknowledgment: Google Test borrowed the idea of automatic test +// registration from Barthelemy Dagenais' (barthelemy@prologique.com) +// easyUnit framework. + +#ifdef __GNUC__ +#pragma GCC system_header +#endif + +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_H_ + +#include <limits> +#include <ostream> +#include <vector> + +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file declares functions and macros used internally by +// Google Test. They are subject to change without notice. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ + +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan) +// +// Low-level types and utilities for porting Google Test to various +// platforms. They are subject to change without notice. DO NOT USE +// THEM IN USER CODE. +// +// This file is fundamental to Google Test. All other Google Test source +// files are expected to #include this. Therefore, it cannot #include +// any other Google Test header. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +// The user can define the following macros in the build script to +// control Google Test's behavior. If the user doesn't define a macro +// in this list, Google Test will define it. +// +// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) +// is/isn't available. 
+// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions +// are enabled. +// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string +// is/isn't available (some systems define +// ::string, which is different to std::string). +// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string +// is/isn't available (some systems define +// ::wstring, which is different to std::wstring). +// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular +// expressions are/aren't available. +// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that <pthread.h> +// is/isn't available. +// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't +// enabled. +// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that +// std::wstring does/doesn't work (Google Test can +// be used where std::wstring is unavailable). +// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple +// is/isn't available. +// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the +// compiler supports Microsoft's "Structured +// Exception Handling". +// GTEST_HAS_STREAM_REDIRECTION +// - Define it to 1/0 to indicate whether the +// platform supports I/O stream redirection using +// dup() and dup2(). +// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google +// Test's own tr1 tuple implementation should be +// used. Unused when the user sets +// GTEST_HAS_TR1_TUPLE to 0. +// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test +// is building in C++11/C++98 mode. +// GTEST_LINKED_AS_SHARED_LIBRARY +// - Define to 1 when compiling tests that use +// Google Test as a shared library (known as +// DLL on Windows). +// GTEST_CREATE_SHARED_LIBRARY +// - Define to 1 when compiling Google Test itself +// as a shared library. 
+ +// This header defines the following utilities: +// +// Macros indicating the current platform (defined to 1 if compiled on +// the given platform; otherwise undefined): +// GTEST_OS_AIX - IBM AIX +// GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_HPUX - HP-UX +// GTEST_OS_LINUX - Linux +// GTEST_OS_LINUX_ANDROID - Google Android +// GTEST_OS_MAC - Mac OS X +// GTEST_OS_IOS - iOS +// GTEST_OS_IOS_SIMULATOR - iOS simulator +// GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_OPENBSD - OpenBSD +// GTEST_OS_QNX - QNX +// GTEST_OS_SOLARIS - Sun Solaris +// GTEST_OS_SYMBIAN - Symbian +// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) +// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop +// GTEST_OS_WINDOWS_MINGW - MinGW +// GTEST_OS_WINDOWS_MOBILE - Windows Mobile +// GTEST_OS_ZOS - z/OS +// +// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the +// most stable support. Since core members of the Google Test project +// don't have access to other platforms, support for them may be less +// stable. If you notice any problems on your platform, please notify +// googletestframework@googlegroups.com (patches for fixing them are +// even more welcome!). +// +// Note that it is possible that none of the GTEST_OS_* macros are defined. +// +// Macros indicating available Google Test features (defined to 1 if +// the corresponding feature is supported; otherwise undefined): +// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized +// tests) +// GTEST_HAS_DEATH_TEST - death tests +// GTEST_HAS_PARAM_TEST - value-parameterized tests +// GTEST_HAS_TYPED_TEST - typed tests +// GTEST_HAS_TYPED_TEST_P - type-parameterized tests +// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with +// GTEST_HAS_POSIX_RE (see above) which users can +// define themselves. +// GTEST_USES_SIMPLE_RE - our own simple regex is used; +// the above two are mutually exclusive. +// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). 
+// +// Macros for basic C++ coding: +// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. +// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a +// variable don't have to be used. +// GTEST_DISALLOW_ASSIGN_ - disables operator=. +// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. +// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. +// +// Synchronization: +// Mutex, MutexLock, ThreadLocal, GetThreadCount() +// - synchronization primitives. +// GTEST_IS_THREADSAFE - defined to 1 to indicate that the above +// synchronization primitives have real implementations +// and Google Test is thread-safe; or 0 otherwise. +// +// Template meta programming: +// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. +// IteratorTraits - partial implementation of std::iterator_traits, which +// is not available in libCstd when compiled with Sun C++. +// +// Smart pointers: +// scoped_ptr - as in TR2. +// +// Regular expressions: +// RE - a simple regular expression class using the POSIX +// Extended Regular Expression syntax on UNIX-like +// platforms, or a reduced regular exception syntax on +// other platforms, including Windows. +// +// Logging: +// GTEST_LOG_() - logs messages at the specified severity level. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. +// +// Stdout and stderr capturing: +// CaptureStdout() - starts capturing stdout. +// GetCapturedStdout() - stops capturing stdout and returns the captured +// string. +// CaptureStderr() - starts capturing stderr. +// GetCapturedStderr() - stops capturing stderr and returns the captured +// string. +// +// Integer types: +// TypeWithSize - maps an integer to a int type. +// Int32, UInt32, Int64, UInt64, TimeInMillis +// - integers of known sizes. +// BiggestInt - the biggest signed integer type. +// +// Command-line utilities: +// GTEST_FLAG() - references a flag. 
+// GTEST_DECLARE_*() - declares a flag. +// GTEST_DEFINE_*() - defines a flag. +// GetInjectableArgvs() - returns the command line as a vector of strings. +// +// Environment variable utilities: +// GetEnv() - gets the value of an environment variable. +// BoolFromGTestEnv() - parses a bool environment variable. +// Int32FromGTestEnv() - parses an Int32 environment variable. +// StringFromGTestEnv() - parses a string environment variable. + +#include <ctype.h> // for isspace, etc +#include <stddef.h> // for ptrdiff_t +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#ifndef _WIN32_WCE +# include <sys/types.h> +# include <sys/stat.h> +#endif // !_WIN32_WCE + +#if defined __APPLE__ +# include <AvailabilityMacros.h> +# include <TargetConditionals.h> +#endif + +#include <iostream> // NOLINT +#include <sstream> // NOLINT +#include <string> // NOLINT + +#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +#define GTEST_FLAG_PREFIX_ "gtest_" +#define GTEST_FLAG_PREFIX_DASH_ "gtest-" +#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +#define GTEST_NAME_ "Google Test" +#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/" + +// Determines the version of gcc that is used to compile this. +#ifdef __GNUC__ +// 40302 means version 4.3.2. +# define GTEST_GCC_VER_ \ + (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#endif // __GNUC__ + +// Determines the platform on which Google Test is compiled. 
+#ifdef __CYGWIN__ +# define GTEST_OS_CYGWIN 1 +#elif defined __SYMBIAN32__ +# define GTEST_OS_SYMBIAN 1 +#elif defined _WIN32 +# define GTEST_OS_WINDOWS 1 +# ifdef _WIN32_WCE +# define GTEST_OS_WINDOWS_MOBILE 1 +# elif defined(__MINGW__) || defined(__MINGW32__) +# define GTEST_OS_WINDOWS_MINGW 1 +# else +# define GTEST_OS_WINDOWS_DESKTOP 1 +# endif // _WIN32_WCE +#elif defined __APPLE__ +# define GTEST_OS_MAC 1 +# if TARGET_OS_IPHONE +# define GTEST_OS_IOS 1 +# if TARGET_IPHONE_SIMULATOR +# define GTEST_OS_IOS_SIMULATOR 1 +# endif +# endif +#elif defined __linux__ +# define GTEST_OS_LINUX 1 +# if defined __ANDROID__ +# define GTEST_OS_LINUX_ANDROID 1 +# endif +#elif defined __MVS__ +# define GTEST_OS_ZOS 1 +#elif defined(__sun) && defined(__SVR4) +# define GTEST_OS_SOLARIS 1 +#elif defined(_AIX) +# define GTEST_OS_AIX 1 +#elif defined(__hpux) +# define GTEST_OS_HPUX 1 +#elif defined __native_client__ +# define GTEST_OS_NACL 1 +#elif defined __OpenBSD__ +# define GTEST_OS_OPENBSD 1 +#elif defined __QNX__ +# define GTEST_OS_QNX 1 +#endif // __CYGWIN__ + +#ifndef GTEST_LANG_CXX11 +// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when +// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a +// value for __cplusplus, and recent versions of clang, gcc, and +// probably other compilers set that too in C++11 mode. +# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L +// Compiling in at least C++11 mode. +# define GTEST_LANG_CXX11 1 +# else +# define GTEST_LANG_CXX11 0 +# endif +#endif + +// Brings in definitions for functions used in the testing::internal::posix +// namespace (read, write, close, chdir, isatty, stat). We do not currently +// use them on Windows Mobile. +#if !GTEST_OS_WINDOWS +// This assumes that non-Windows OSes provide unistd.h. For OSes where this +// is not the case, we need to include headers that provide the functions +// mentioned above. 
+# include <unistd.h> +# include <strings.h> +#elif !GTEST_OS_WINDOWS_MOBILE +# include <direct.h> +# include <io.h> +#endif + +#if GTEST_OS_LINUX_ANDROID +// Used to define __ANDROID_API__ matching the target NDK API level. +# include <android/api-level.h> // NOLINT +#endif + +// Defines this to true iff Google Test can use POSIX regular expressions. +#ifndef GTEST_HAS_POSIX_RE +# if GTEST_OS_LINUX_ANDROID +// On Android, <regex.h> is only available starting with Gingerbread. +# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +# else +# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) +# endif +#endif + +#if GTEST_HAS_POSIX_RE + +// On some platforms, <regex.h> needs someone to define size_t, and +// won't compile otherwise. We can #include it here as we already +// included <stdlib.h>, which is guaranteed to define size_t through +// <stddef.h>. +# include <regex.h> // NOLINT + +# define GTEST_USES_POSIX_RE 1 + +#elif GTEST_OS_WINDOWS + +// <regex.h> is not available on Windows. Use our own simple regex +// implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#else + +// <regex.h> may not be available on this platform. Use our own +// simple regex implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#endif // GTEST_HAS_POSIX_RE + +#ifndef GTEST_HAS_EXCEPTIONS +// The user didn't tell us whether exceptions are enabled, so we need +// to figure it out. +# if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS +// macro to enable exceptions, so we'll do the same. +// Assumes that exceptions are enabled by default. +# ifndef _HAS_EXCEPTIONS +# define _HAS_EXCEPTIONS 1 +# endif // _HAS_EXCEPTIONS +# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +# elif defined(__GNUC__) && __EXCEPTIONS +// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__SUNPRO_CC) +// Sun Pro CC supports exceptions. 
However, there is no compile-time way of +// detecting whether they are enabled or not. Therefore, we assume that +// they are enabled unless the user tells us otherwise. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__IBMCPP__) && __EXCEPTIONS +// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__HP_aCC) +// Exception handling is in effect by default in HP aCC compiler. It has to +// be turned of by +noeh compiler option if desired. +# define GTEST_HAS_EXCEPTIONS 1 +# else +// For other compilers, we assume exceptions are disabled to be +// conservative. +# define GTEST_HAS_EXCEPTIONS 0 +# endif // defined(_MSC_VER) || defined(__BORLANDC__) +#endif // GTEST_HAS_EXCEPTIONS + +#if !defined(GTEST_HAS_STD_STRING) +// Even though we don't use this macro any longer, we keep it in case +// some clients still depend on it. +# define GTEST_HAS_STD_STRING 1 +#elif !GTEST_HAS_STD_STRING +// The user told us that ::std::string isn't available. +# error "Google Test cannot be used where ::std::string isn't available." +#endif // !defined(GTEST_HAS_STD_STRING) + +#ifndef GTEST_HAS_GLOBAL_STRING +// The user didn't tell us whether ::string is available, so we need +// to figure it out. + +# define GTEST_HAS_GLOBAL_STRING 0 + +#endif // GTEST_HAS_GLOBAL_STRING + +#ifndef GTEST_HAS_STD_WSTRING +// The user didn't tell us whether ::std::wstring is available, so we need +// to figure it out. +// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring +// is available. + +// Cygwin 1.7 and below doesn't support ::std::wstring. +// Solaris' libc++ doesn't support it either. Android has +// no support for it at least as recent as Froyo (2.2). +# define GTEST_HAS_STD_WSTRING \ + (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) + +#endif // GTEST_HAS_STD_WSTRING + +#ifndef GTEST_HAS_GLOBAL_WSTRING +// The user didn't tell us whether ::wstring is available, so we need +// to figure it out. 
+# define GTEST_HAS_GLOBAL_WSTRING \ + (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) +#endif // GTEST_HAS_GLOBAL_WSTRING + +// Determines whether RTTI is available. +#ifndef GTEST_HAS_RTTI +// The user didn't tell us whether RTTI is enabled, so we need to +// figure it out. + +# ifdef _MSC_VER + +# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. +# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) + +# ifdef __GXX_RTTI +// When building against STLport with the Android NDK and with +// -frtti -fno-exceptions, the build fails at link time with undefined +// references to __cxa_bad_typeid. Note sure if STL or toolchain bug, +// so disable RTTI when detected. +# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ + !defined(__EXCEPTIONS) +# define GTEST_HAS_RTTI 0 +# else +# define GTEST_HAS_RTTI 1 +# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +# else +# define GTEST_HAS_RTTI 0 +# endif // __GXX_RTTI + +// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends +// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the +// first version with C++ support. +# elif defined(__clang__) + +# define GTEST_HAS_RTTI __has_feature(cxx_rtti) + +// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if +// both the typeid and dynamic_cast features are present. +# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) + +# ifdef __RTTI_ALL__ +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +# else + +// For all other compilers, we assume RTTI is enabled. +# define GTEST_HAS_RTTI 1 + +# endif // _MSC_VER + +#endif // GTEST_HAS_RTTI + +// It's this header's responsibility to #include <typeinfo> when RTTI +// is enabled. 
+#if GTEST_HAS_RTTI +# include <typeinfo> +#endif + +// Determines whether Google Test can use the pthreads library. +#ifndef GTEST_HAS_PTHREAD +// The user didn't tell us explicitly, so we assume pthreads support is +// available on Linux and Mac. +// +// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 +// to your compiler flags. +# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \ + || GTEST_OS_QNX) +#endif // GTEST_HAS_PTHREAD + +#if GTEST_HAS_PTHREAD +// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is +// true. +# include <pthread.h> // NOLINT + +// For timespec and nanosleep, used below. +# include <time.h> // NOLINT +#endif + +// Determines whether Google Test can use tr1/tuple. You can define +// this macro to 0 to prevent Google Test from using tuple (any +// feature depending on tuple with be disabled in this mode). +#ifndef GTEST_HAS_TR1_TUPLE +# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) +// STLport, provided with the Android NDK, has neither <tr1/tuple> or <tuple>. +# define GTEST_HAS_TR1_TUPLE 0 +# else +// The user didn't tell us not to do it, so we assume it's OK. +# define GTEST_HAS_TR1_TUPLE 1 +# endif +#endif // GTEST_HAS_TR1_TUPLE + +// Determines whether Google Test's own tr1 tuple implementation +// should be used. +#ifndef GTEST_USE_OWN_TR1_TUPLE +// The user didn't tell us, so we need to figure it out. + +// We use our own TR1 tuple if we aren't sure the user has an +// implementation of it already. At this time, libstdc++ 4.0.0+ and +// MSVC 2010 are the only mainstream standard libraries that come +// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler +// pretends to be GCC by defining __GNUC__ and friends, but cannot +// compile GCC's tuple implementation. MSVC 2008 (9.0) provides TR1 +// tuple in a 323 MB Feature Pack download, which we cannot assume the +// user has. QNX's QCC compiler is a modified GCC but it doesn't +// support TR1 tuple. 
libc++ only provides std::tuple, in C++11 mode, +// and it can be used with some compilers that define __GNUC__. +# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ + && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600 +# define GTEST_ENV_HAS_TR1_TUPLE_ 1 +# endif + +// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used +// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6 +// can build with clang but need to use gcc4.2's libstdc++). +# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325) +# define GTEST_ENV_HAS_STD_TUPLE_ 1 +# endif + +# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_ +# define GTEST_USE_OWN_TR1_TUPLE 0 +# else +# define GTEST_USE_OWN_TR1_TUPLE 1 +# endif + +#endif // GTEST_USE_OWN_TR1_TUPLE + +// To avoid conditional compilation everywhere, we make it +// gtest-port.h's responsibility to #include the header implementing +// tr1/tuple. +#if GTEST_HAS_TR1_TUPLE + +# if GTEST_USE_OWN_TR1_TUPLE +// This file was GENERATED by command: +// pump.py gtest-tuple.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2009 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Implements a subset of TR1 tuple needed by Google Test and Google Mock. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ + +#include <utility> // For ::std::pair. + +// The compiler used in Symbian has a bug that prevents us from declaring the +// tuple template as a friend (it complains that tuple is redefined). This +// hack bypasses the bug by declaring the members that should otherwise be +// private as public. +// Sun Studio versions < 12 also have the above bug. +#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: +#else +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ + template <GTEST_10_TYPENAMES_(U)> friend class tuple; \ + private: +#endif + +// GTEST_n_TUPLE_(T) is the type of an n-tuple. 
+#define GTEST_0_TUPLE_(T) tuple<> +#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \ + void, void, void> +#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \ + void, void, void> +#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \ + void, void, void> +#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \ + void, void, void> +#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \ + void, void, void> +#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \ + void, void, void> +#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \ + void, void, void> +#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \ + T##7, void, void> +#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \ + T##7, T##8, void> +#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \ + T##7, T##8, T##9> + +// GTEST_n_TYPENAMES_(T) declares a list of n typenames. 
+#define GTEST_0_TYPENAMES_(T) +#define GTEST_1_TYPENAMES_(T) typename T##0 +#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1 +#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2 +#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3 +#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4 +#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5 +#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6 +#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, typename T##7 +#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, \ + typename T##7, typename T##8 +#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, \ + typename T##7, typename T##8, typename T##9 + +// In theory, defining stuff in the ::std namespace is undefined +// behavior. We can do this as we are playing the role of a standard +// library vendor. +namespace std { +namespace tr1 { + +template <typename T0 = void, typename T1 = void, typename T2 = void, + typename T3 = void, typename T4 = void, typename T5 = void, + typename T6 = void, typename T7 = void, typename T8 = void, + typename T9 = void> +class tuple; + +// Anything in namespace gtest_internal is Google Test's INTERNAL +// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. +namespace gtest_internal { + +// ByRef<T>::type is T if T is a reference; otherwise it's const T&. 
+template <typename T> +struct ByRef { typedef const T& type; }; // NOLINT +template <typename T> +struct ByRef<T&> { typedef T& type; }; // NOLINT + +// A handy wrapper for ByRef. +#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type + +// AddRef<T>::type is T if T is a reference; otherwise it's T&. This +// is the same as tr1::add_reference<T>::type. +template <typename T> +struct AddRef { typedef T& type; }; // NOLINT +template <typename T> +struct AddRef<T&> { typedef T& type; }; // NOLINT + +// A handy wrapper for AddRef. +#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type + +// A helper for implementing get<k>(). +template <int k> class Get; + +// A helper for implementing tuple_element<k, T>. kIndexValid is true +// iff k < the number of fields in tuple type T. +template <bool kIndexValid, int kIndex, class Tuple> +struct TupleElement; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > { + typedef T0 type; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > { + typedef T1 type; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > { + typedef T2 type; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > { + typedef T3 type; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > { + typedef T4 type; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > { + typedef T5 type; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > { + typedef T6 type; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > { + typedef T7 type; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > { + typedef T8 type; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct TupleElement<true, 9, 
GTEST_10_TUPLE_(T) > { + typedef T9 type; +}; + +} // namespace gtest_internal + +template <> +class tuple<> { + public: + tuple() {} + tuple(const tuple& /* t */) {} + tuple& operator=(const tuple& /* t */) { return *this; } +}; + +template <GTEST_1_TYPENAMES_(T)> +class GTEST_1_TUPLE_(T) { + public: + template <int k> friend class gtest_internal::Get; + + tuple() : f0_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} + + tuple(const tuple& t) : f0_(t.f0_) {} + + template <GTEST_1_TYPENAMES_(U)> + tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_1_TYPENAMES_(U)> + tuple& operator=(const GTEST_1_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_1_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { + f0_ = t.f0_; + return *this; + } + + T0 f0_; +}; + +template <GTEST_2_TYPENAMES_(T)> +class GTEST_2_TUPLE_(T) { + public: + template <int k> friend class gtest_internal::Get; + + tuple() : f0_(), f1_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), + f1_(f1) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {} + + template <GTEST_2_TYPENAMES_(U)> + tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} + template <typename U0, typename U1> + tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_2_TYPENAMES_(U)> + tuple& operator=(const GTEST_2_TUPLE_(U)& t) { + return CopyFrom(t); + } + template <typename U0, typename U1> + tuple& operator=(const ::std::pair<U0, U1>& p) { + f0_ = p.first; + f1_ = p.second; + return *this; + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_2_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + return *this; + } + + T0 f0_; + T1 f1_; +}; + +template <GTEST_3_TYPENAMES_(T)> +class GTEST_3_TUPLE_(T) { + 
public: + template <int k> friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + template <GTEST_3_TYPENAMES_(U)> + tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_3_TYPENAMES_(U)> + tuple& operator=(const GTEST_3_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_3_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; +}; + +template <GTEST_4_TYPENAMES_(T)> +class GTEST_4_TUPLE_(T) { + public: + template <int k> friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} + + template <GTEST_4_TYPENAMES_(U)> + tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_4_TYPENAMES_(U)> + tuple& operator=(const GTEST_4_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_4_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; +}; + +template <GTEST_5_TYPENAMES_(T)> +class GTEST_5_TUPLE_(T) { + public: + template <int k> friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, 
GTEST_BY_REF_(T3) f3, + GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_) {} + + template <GTEST_5_TYPENAMES_(U)> + tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_5_TYPENAMES_(U)> + tuple& operator=(const GTEST_5_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_5_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; +}; + +template <GTEST_6_TYPENAMES_(T)> +class GTEST_6_TUPLE_(T) { + public: + template <int k> friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_) {} + + template <GTEST_6_TYPENAMES_(U)> + tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_6_TYPENAMES_(U)> + tuple& operator=(const GTEST_6_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_6_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; +}; + +template <GTEST_7_TYPENAMES_(T)> +class GTEST_7_TUPLE_(T) { + public: + template <int k> friend class 
gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + template <GTEST_7_TYPENAMES_(U)> + tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_7_TYPENAMES_(U)> + tuple& operator=(const GTEST_7_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_7_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; +}; + +template <GTEST_8_TYPENAMES_(T)> +class GTEST_8_TUPLE_(T) { + public: + template <int k> friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, + GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + template <GTEST_8_TYPENAMES_(U)> + tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_8_TYPENAMES_(U)> + tuple& 
operator=(const GTEST_8_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_8_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; +}; + +template <GTEST_9_TYPENAMES_(T)> +class GTEST_9_TUPLE_(T) { + public: + template <int k> friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + template <GTEST_9_TYPENAMES_(U)> + tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_9_TYPENAMES_(U)> + tuple& operator=(const GTEST_9_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_9_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; +}; + +template <GTEST_10_TYPENAMES_(T)> +class tuple { + public: + template <int k> friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), + f9_() 
{} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} + + template <GTEST_10_TYPENAMES_(U)> + tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), + f9_(t.f9_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template <GTEST_10_TYPENAMES_(U)> + tuple& operator=(const GTEST_10_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template <GTEST_10_TYPENAMES_(U)> + tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + f9_ = t.f9_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; + T9 f9_; +}; + +// 6.1.3.2 Tuple creation functions. + +// Known limitations: we don't support passing an +// std::tr1::reference_wrapper<T> to make_tuple(). And we don't +// implement tie(). 
+ +inline tuple<> make_tuple() { return tuple<>(); } + +template <GTEST_1_TYPENAMES_(T)> +inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { + return GTEST_1_TUPLE_(T)(f0); +} + +template <GTEST_2_TYPENAMES_(T)> +inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { + return GTEST_2_TUPLE_(T)(f0, f1); +} + +template <GTEST_3_TYPENAMES_(T)> +inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { + return GTEST_3_TUPLE_(T)(f0, f1, f2); +} + +template <GTEST_4_TYPENAMES_(T)> +inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3) { + return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); +} + +template <GTEST_5_TYPENAMES_(T)> +inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4) { + return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); +} + +template <GTEST_6_TYPENAMES_(T)> +inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5) { + return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); +} + +template <GTEST_7_TYPENAMES_(T)> +inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6) { + return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); +} + +template <GTEST_8_TYPENAMES_(T)> +inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { + return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); +} + +template <GTEST_9_TYPENAMES_(T)> +inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, + const T8& f8) { + return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); +} + +template <GTEST_10_TYPENAMES_(T)> +inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const 
T6& f6, const T7& f7, + const T8& f8, const T9& f9) { + return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); +} + +// 6.1.3.3 Tuple helper classes. + +template <typename Tuple> struct tuple_size; + +template <GTEST_0_TYPENAMES_(T)> +struct tuple_size<GTEST_0_TUPLE_(T) > { + static const int value = 0; +}; + +template <GTEST_1_TYPENAMES_(T)> +struct tuple_size<GTEST_1_TUPLE_(T) > { + static const int value = 1; +}; + +template <GTEST_2_TYPENAMES_(T)> +struct tuple_size<GTEST_2_TUPLE_(T) > { + static const int value = 2; +}; + +template <GTEST_3_TYPENAMES_(T)> +struct tuple_size<GTEST_3_TUPLE_(T) > { + static const int value = 3; +}; + +template <GTEST_4_TYPENAMES_(T)> +struct tuple_size<GTEST_4_TUPLE_(T) > { + static const int value = 4; +}; + +template <GTEST_5_TYPENAMES_(T)> +struct tuple_size<GTEST_5_TUPLE_(T) > { + static const int value = 5; +}; + +template <GTEST_6_TYPENAMES_(T)> +struct tuple_size<GTEST_6_TUPLE_(T) > { + static const int value = 6; +}; + +template <GTEST_7_TYPENAMES_(T)> +struct tuple_size<GTEST_7_TUPLE_(T) > { + static const int value = 7; +}; + +template <GTEST_8_TYPENAMES_(T)> +struct tuple_size<GTEST_8_TUPLE_(T) > { + static const int value = 8; +}; + +template <GTEST_9_TYPENAMES_(T)> +struct tuple_size<GTEST_9_TUPLE_(T) > { + static const int value = 9; +}; + +template <GTEST_10_TYPENAMES_(T)> +struct tuple_size<GTEST_10_TUPLE_(T) > { + static const int value = 10; +}; + +template <int k, class Tuple> +struct tuple_element { + typedef typename gtest_internal::TupleElement< + k < (tuple_size<Tuple>::value), k, Tuple>::type type; +}; + +#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type + +// 6.1.3.4 Element access. 
+ +namespace gtest_internal { + +template <> +class Get<0> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + Field(Tuple& t) { return t.f0_; } // NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + ConstField(const Tuple& t) { return t.f0_; } +}; + +template <> +class Get<1> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + Field(Tuple& t) { return t.f1_; } // NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + ConstField(const Tuple& t) { return t.f1_; } +}; + +template <> +class Get<2> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + Field(Tuple& t) { return t.f2_; } // NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + ConstField(const Tuple& t) { return t.f2_; } +}; + +template <> +class Get<3> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + Field(Tuple& t) { return t.f3_; } // NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + ConstField(const Tuple& t) { return t.f3_; } +}; + +template <> +class Get<4> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + Field(Tuple& t) { return t.f4_; } // NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + ConstField(const Tuple& t) { return t.f4_; } +}; + +template <> +class Get<5> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + Field(Tuple& t) { return t.f5_; } // NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + ConstField(const Tuple& t) { return t.f5_; } +}; + +template <> +class Get<6> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + Field(Tuple& t) { return t.f6_; } // 
NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + ConstField(const Tuple& t) { return t.f6_; } +}; + +template <> +class Get<7> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + Field(Tuple& t) { return t.f7_; } // NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + ConstField(const Tuple& t) { return t.f7_; } +}; + +template <> +class Get<8> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + Field(Tuple& t) { return t.f8_; } // NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + ConstField(const Tuple& t) { return t.f8_; } +}; + +template <> +class Get<9> { + public: + template <class Tuple> + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + Field(Tuple& t) { return t.f9_; } // NOLINT + + template <class Tuple> + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + ConstField(const Tuple& t) { return t.f9_; } +}; + +} // namespace gtest_internal + +template <int k, GTEST_10_TYPENAMES_(T)> +GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get<k>::Field(t); +} + +template <int k, GTEST_10_TYPENAMES_(T)> +GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(const GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get<k>::ConstField(t); +} + +// 6.1.3.5 Relational operators + +// We only implement == and !=, as we don't have a need for the rest yet. + +namespace gtest_internal { + +// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the +// first k fields of t1 equals the first k fields of t2. +// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if +// k1 != k2. 
+template <int kSize1, int kSize2> +struct SameSizeTuplePrefixComparator; + +template <> +struct SameSizeTuplePrefixComparator<0, 0> { + template <class Tuple1, class Tuple2> + static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { + return true; + } +}; + +template <int k> +struct SameSizeTuplePrefixComparator<k, k> { + template <class Tuple1, class Tuple2> + static bool Eq(const Tuple1& t1, const Tuple2& t2) { + return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) && + ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2); + } +}; + +} // namespace gtest_internal + +template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)> +inline bool operator==(const GTEST_10_TUPLE_(T)& t, + const GTEST_10_TUPLE_(U)& u) { + return gtest_internal::SameSizeTuplePrefixComparator< + tuple_size<GTEST_10_TUPLE_(T) >::value, + tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u); +} + +template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)> +inline bool operator!=(const GTEST_10_TUPLE_(T)& t, + const GTEST_10_TUPLE_(U)& u) { return !(t == u); } + +// 6.1.4 Pairs. +// Unimplemented. 
+ +} // namespace tr1 +} // namespace std + +#undef GTEST_0_TUPLE_ +#undef GTEST_1_TUPLE_ +#undef GTEST_2_TUPLE_ +#undef GTEST_3_TUPLE_ +#undef GTEST_4_TUPLE_ +#undef GTEST_5_TUPLE_ +#undef GTEST_6_TUPLE_ +#undef GTEST_7_TUPLE_ +#undef GTEST_8_TUPLE_ +#undef GTEST_9_TUPLE_ +#undef GTEST_10_TUPLE_ + +#undef GTEST_0_TYPENAMES_ +#undef GTEST_1_TYPENAMES_ +#undef GTEST_2_TYPENAMES_ +#undef GTEST_3_TYPENAMES_ +#undef GTEST_4_TYPENAMES_ +#undef GTEST_5_TYPENAMES_ +#undef GTEST_6_TYPENAMES_ +#undef GTEST_7_TYPENAMES_ +#undef GTEST_8_TYPENAMES_ +#undef GTEST_9_TYPENAMES_ +#undef GTEST_10_TYPENAMES_ + +#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ +#undef GTEST_BY_REF_ +#undef GTEST_ADD_REF_ +#undef GTEST_TUPLE_ELEMENT_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ +# elif GTEST_ENV_HAS_STD_TUPLE_ +# include <tuple> +// C++11 puts its tuple into the ::std namespace rather than +// ::std::tr1. gtest expects tuple to live in ::std::tr1, so put it there. +// This causes undefined behavior, but supported compilers react in +// the way we intend. +namespace std { +namespace tr1 { +using ::std::get; +using ::std::make_tuple; +using ::std::tuple; +using ::std::tuple_element; +using ::std::tuple_size; +} +} + +# elif GTEST_OS_SYMBIAN + +// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to +// use STLport's tuple implementation, which unfortunately doesn't +// work as the copy of STLport distributed with Symbian is incomplete. +// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to +// use its own tuple implementation. +# ifdef BOOST_HAS_TR1_TUPLE +# undef BOOST_HAS_TR1_TUPLE +# endif // BOOST_HAS_TR1_TUPLE + +// This prevents <boost/tr1/detail/config.hpp>, which defines +// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>. +# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED +# include <tuple> + +# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000) +// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header. 
This does +// not conform to the TR1 spec, which requires the header to be <tuple>. + +# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 +// Until version 4.3.2, gcc has a bug that causes <tr1/functional>, +// which is #included by <tr1/tuple>, to not compile when RTTI is +// disabled. _TR1_FUNCTIONAL is the header guard for +// <tr1/functional>. Hence the following #define is a hack to prevent +// <tr1/functional> from being included. +# define _TR1_FUNCTIONAL 1 +# include <tr1/tuple> +# undef _TR1_FUNCTIONAL // Allows the user to #include + // <tr1/functional> if he chooses to. +# else +# include <tr1/tuple> // NOLINT +# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 + +# else +// If the compiler is not GCC 4.0+, we assume the user is using a +// spec-conforming TR1 implementation. +# include <tuple> // NOLINT +# endif // GTEST_USE_OWN_TR1_TUPLE + +#endif // GTEST_HAS_TR1_TUPLE + +// Determines whether clone(2) is supported. +// Usually it will only be available on Linux, excluding +// Linux on the Itanium architecture. +// Also see http://linux.die.net/man/2/clone. +#ifndef GTEST_HAS_CLONE +// The user didn't tell us, so we need to figure it out. + +# if GTEST_OS_LINUX && !defined(__ia64__) +# if GTEST_OS_LINUX_ANDROID +// On Android, clone() is only available on ARM starting with Gingerbread. +# if defined(__arm__) && __ANDROID_API__ >= 9 +# define GTEST_HAS_CLONE 1 +# else +# define GTEST_HAS_CLONE 0 +# endif +# else +# define GTEST_HAS_CLONE 1 +# endif +# else +# define GTEST_HAS_CLONE 0 +# endif // GTEST_OS_LINUX && !defined(__ia64__) + +#endif // GTEST_HAS_CLONE + +// Determines whether to support stream redirection. This is used to test +// output correctness and to implement death tests. +#ifndef GTEST_HAS_STREAM_REDIRECTION +// By default, we assume that stream redirection is supported on all +// platforms except known mobile ones. 
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN +# define GTEST_HAS_STREAM_REDIRECTION 0 +# else +# define GTEST_HAS_STREAM_REDIRECTION 1 +# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Determines whether to support death tests. +// Google Test does not support death tests for VC 7.1 and earlier as +// abort() in a VC 7.1 application compiled as GUI in debug config +// pops up a dialog window that cannot be suppressed programmatically. +#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || GTEST_OS_IOS_SIMULATOR || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ + GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \ + GTEST_OS_OPENBSD || GTEST_OS_QNX) +# define GTEST_HAS_DEATH_TEST 1 +# include <vector> // NOLINT +#endif + +// We don't support MSVC 7.1 with exceptions disabled now. Therefore +// all the compilers we care about are adequate for supporting +// value-parameterized tests. +#define GTEST_HAS_PARAM_TEST 1 + +// Determines whether to support type-driven tests. + +// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0, +// Sun Pro CC, IBM Visual Age, and HP aCC support. +#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \ + defined(__IBMCPP__) || defined(__HP_aCC) +# define GTEST_HAS_TYPED_TEST 1 +# define GTEST_HAS_TYPED_TEST_P 1 +#endif + +// Determines whether to support Combine(). This only makes sense when +// value-parameterized tests are enabled. The implementation doesn't +// work on Sun Studio since it doesn't understand templated conversion +// operators. +#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) +# define GTEST_HAS_COMBINE 1 +#endif + +// Determines whether the system compiler uses UTF-16 for encoding wide strings. 
+#define GTEST_WIDE_STRING_USES_UTF16_ \ + (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX) + +// Determines whether test results can be streamed to a socket. +#if GTEST_OS_LINUX +# define GTEST_CAN_STREAM_RESULTS_ 1 +#endif + +// Defines some utility macros. + +// The GNU compiler emits a warning if nested "if" statements are followed by +// an "else" statement and braces are not used to explicitly disambiguate the +// "else" binding. This leads to problems with code like: +// +// if (gate) +// ASSERT_*(condition) << "Some message"; +// +// The "switch (0) case 0:" idiom is used to suppress this. +#ifdef __INTEL_COMPILER +# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#else +# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT +#endif + +// Use this annotation at the end of a struct/class definition to +// prevent the compiler from optimizing away instances that are never +// used. This is useful when all interesting logic happens inside the +// c'tor and / or d'tor. Example: +// +// struct Foo { +// Foo() { ... } +// } GTEST_ATTRIBUTE_UNUSED_; +// +// Also use it after a variable or parameter declaration to tell the +// compiler the variable/parameter does not have to be used. +#if defined(__GNUC__) && !defined(COMPILER_ICC) +# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +#else +# define GTEST_ATTRIBUTE_UNUSED_ +#endif + +// A macro to disallow operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_ASSIGN_(type)\ + void operator=(type const &) + +// A macro to disallow copy constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ + type(type const &);\ + GTEST_DISALLOW_ASSIGN_(type) + +// Tell the compiler to warn about unused return values for functions declared +// with this macro. 
The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; +#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) +# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) +#else +# define GTEST_MUST_USE_RESULT_ +#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC + +// Determine whether the compiler supports Microsoft's Structured Exception +// Handling. This is supported by several Windows compilers but generally +// does not exist on any other system. +#ifndef GTEST_HAS_SEH +// The user didn't tell us, so we need to figure it out. + +# if defined(_MSC_VER) || defined(__BORLANDC__) +// These two compilers are known to support SEH. +# define GTEST_HAS_SEH 1 +# else +// Assume no SEH. +# define GTEST_HAS_SEH 0 +# endif + +#endif // GTEST_HAS_SEH + +#ifdef _MSC_VER + +# if GTEST_LINKED_AS_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllimport) +# elif GTEST_CREATE_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllexport) +# endif + +#endif // _MSC_VER + +#ifndef GTEST_API_ +# define GTEST_API_ +#endif + +#ifdef __GNUC__ +// Ask the compiler to never inline a given function. +# define GTEST_NO_INLINE_ __attribute__((noinline)) +#else +# define GTEST_NO_INLINE_ +#endif + +// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. +#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) +# define GTEST_HAS_CXXABI_H_ 1 +#else +# define GTEST_HAS_CXXABI_H_ 0 +#endif + +namespace testing { + +class Message; + +namespace internal { + +// A secret type that Google Test users don't know about. It has no +// definition on purpose. Therefore it's impossible to create a +// Secret object, which is what we want. +class Secret; + +// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time +// expression is true. 
For example, you could use it to verify the +// size of a static array: +// +// GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, +// content_type_names_incorrect_size); +// +// or to make sure a struct is smaller than a certain size: +// +// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large); +// +// The second argument to the macro is the name of the variable. If +// the expression is false, most compilers will issue a warning/error +// containing the name of the variable. + +template <bool> +struct CompileAssert { +}; + +#define GTEST_COMPILE_ASSERT_(expr, msg) \ + typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \ + msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_ + +// Implementation details of GTEST_COMPILE_ASSERT_: +// +// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1 +// elements (and thus is invalid) when the expression is false. +// +// - The simpler definition +// +// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1] +// +// does not work, as gcc supports variable-length arrays whose sizes +// are determined at run-time (this is gcc's extension and not part +// of the C++ standard). As a result, gcc fails to reject the +// following code with the simple definition: +// +// int foo; +// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is +// // not a compile-time constant. +// +// - By using the type CompileAssert<(bool(expr))>, we ensures that +// expr is a compile-time constant. (Template arguments must be +// determined at compile-time.) +// +// - The outter parentheses in CompileAssert<(bool(expr))> are necessary +// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written +// +// CompileAssert<bool(expr)> +// +// instead, these compilers will refuse to compile +// +// GTEST_COMPILE_ASSERT_(5 > 0, some_message); +// +// (They seem to think the ">" in "5 > 0" marks the end of the +// template argument list.) 
+// +// - The array size is (bool(expr) ? 1 : -1), instead of simply +// +// ((expr) ? 1 : -1). +// +// This is to avoid running into a bug in MS VC 7.1, which +// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. + +// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h. +// +// This template is declared, but intentionally undefined. +template <typename T1, typename T2> +struct StaticAssertTypeEqHelper; + +template <typename T> +struct StaticAssertTypeEqHelper<T, T> {}; + +#if GTEST_HAS_GLOBAL_STRING +typedef ::string string; +#else +typedef ::std::string string; +#endif // GTEST_HAS_GLOBAL_STRING + +#if GTEST_HAS_GLOBAL_WSTRING +typedef ::wstring wstring; +#elif GTEST_HAS_STD_WSTRING +typedef ::std::wstring wstring; +#endif // GTEST_HAS_GLOBAL_WSTRING + +// A helper for suppressing warnings on constant condition. It just +// returns 'condition'. +GTEST_API_ bool IsTrue(bool condition); + +// Defines scoped_ptr. + +// This implementation of scoped_ptr is PARTIAL - it only contains +// enough stuff to satisfy Google Test's need. +template <typename T> +class scoped_ptr { + public: + typedef T element_type; + + explicit scoped_ptr(T* p = NULL) : ptr_(p) {} + ~scoped_ptr() { reset(); } + + T& operator*() const { return *ptr_; } + T* operator->() const { return ptr_; } + T* get() const { return ptr_; } + + T* release() { + T* const ptr = ptr_; + ptr_ = NULL; + return ptr; + } + + void reset(T* p = NULL) { + if (p != ptr_) { + if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type. + delete ptr_; + } + ptr_ = p; + } + } + + private: + T* ptr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr); +}; + +// Defines RE. + +// A simple C++ wrapper for <regex.h>. It uses the POSIX Extended +// Regular Expression syntax. +class GTEST_API_ RE { + public: + // A copy constructor is required by the Standard to initialize object + // references from r-values. + RE(const RE& other) { Init(other.pattern()); } + + // Constructs an RE from a string. 
+ RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT + +#if GTEST_HAS_GLOBAL_STRING + + RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT + +#endif // GTEST_HAS_GLOBAL_STRING + + RE(const char* regex) { Init(regex); } // NOLINT + ~RE(); + + // Returns the string representation of the regex. + const char* pattern() const { return pattern_; } + + // FullMatch(str, re) returns true iff regular expression re matches + // the entire str. + // PartialMatch(str, re) returns true iff regular expression re + // matches a substring of str (including str itself). + // + // TODO(wan@google.com): make FullMatch() and PartialMatch() work + // when str contains NUL characters. + static bool FullMatch(const ::std::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::std::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + +#if GTEST_HAS_GLOBAL_STRING + + static bool FullMatch(const ::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + +#endif // GTEST_HAS_GLOBAL_STRING + + static bool FullMatch(const char* str, const RE& re); + static bool PartialMatch(const char* str, const RE& re); + + private: + void Init(const char* regex); + + // We use a const char* instead of an std::string, as Google Test used to be + // used where std::string is not available. TODO(wan@google.com): change to + // std::string. + const char* pattern_; + bool is_valid_; + +#if GTEST_USES_POSIX_RE + + regex_t full_regex_; // For FullMatch(). + regex_t partial_regex_; // For PartialMatch(). + +#else // GTEST_USES_SIMPLE_RE + + const char* full_pattern_; // For FullMatch(); + +#endif + + GTEST_DISALLOW_ASSIGN_(RE); +}; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. 
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line); + +// Defines logging utilities: +// GTEST_LOG_(severity) - logs messages at the specified severity level. The +// message itself is streamed into the macro. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. + +enum GTestLogSeverity { + GTEST_INFO, + GTEST_WARNING, + GTEST_ERROR, + GTEST_FATAL +}; + +// Formats log entry severity, provides a stream object for streaming the +// log message, and terminates the message with a newline when going out of +// scope. +class GTEST_API_ GTestLog { + public: + GTestLog(GTestLogSeverity severity, const char* file, int line); + + // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. + ~GTestLog(); + + ::std::ostream& GetStream() { return ::std::cerr; } + + private: + const GTestLogSeverity severity_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); +}; + +#define GTEST_LOG_(severity) \ + ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ + __FILE__, __LINE__).GetStream() + +inline void LogToStderr() {} +inline void FlushInfoLog() { fflush(NULL); } + +// INTERNAL IMPLEMENTATION - DO NOT USE. +// +// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition +// is not satisfied. +// Synopsys: +// GTEST_CHECK_(boolean_condition); +// or +// GTEST_CHECK_(boolean_condition) << "Additional message"; +// +// This checks the condition and if the condition is not satisfied +// it prints message about the condition violation, including the +// condition itself, plus additional message streamed into it, if any, +// and then it aborts the program. 
It aborts the program irrespective of +// whether it is built in the debug mode or not. +#define GTEST_CHECK_(condition) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::IsTrue(condition)) \ + ; \ + else \ + GTEST_LOG_(FATAL) << "Condition " #condition " failed. " + +// An all-mode assert to verify that the given POSIX-style function +// call returns 0 (indicating success). Known limitation: this +// doesn't expand to a balanced 'if' statement, so enclose the macro +// in {} if you need to use it as the only statement in an 'if' +// branch. +#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ + if (const int gtest_error = (posix_call)) \ + GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ + << gtest_error + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Use ImplicitCast_ as a safe version of static_cast for upcasting in +// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a +// const Foo*). When you use ImplicitCast_, the compiler checks that +// the cast is safe. Such explicit ImplicitCast_s are necessary in +// surprisingly many situations where C++ demands an exact type match +// instead of an argument type convertible to a target type. +// +// The syntax for using ImplicitCast_ is the same as for static_cast: +// +// ImplicitCast_<ToType>(expr) +// +// ImplicitCast_ would have been part of the C++ standard library, +// but the proposal was submitted too late. It will probably make +// its way into the language in the future. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., implicit_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template<typename To> +inline To ImplicitCast_(To x) { return x; } + +// When you upcast (that is, cast a pointer from type Foo to type +// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts +// always succeed. 
When you downcast (that is, cast a pointer from +// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because +// how do you know the pointer is really of type SubclassOfFoo? It +// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, +// when you downcast, you should use this macro. In debug mode, we +// use dynamic_cast<> to double-check the downcast is legal (we die +// if it's not). In normal mode, we do the efficient static_cast<> +// instead. Thus, it's important to test in debug mode to make sure +// the cast is legal! +// This is the only place in the code we should use dynamic_cast<>. +// In particular, you SHOULDN'T be using dynamic_cast<> in order to +// do RTTI (eg code like this: +// if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo); +// if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo); +// You should design the code some other way not to need this. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., down_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template<typename To, typename From> // use like this: DownCast_<T*>(foo); +inline To DownCast_(From* f) { // so we only accept pointers + // Ensures that To is a sub-type of From *. This test is here only + // for compile-time type checking, and has no overhead in an + // optimized build at run-time, as it will be optimized away + // completely. + if (false) { + const To to = NULL; + ::testing::internal::ImplicitCast_<From*>(to); + } + +#if GTEST_HAS_RTTI + // RTTI: debug mode only! + GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL); +#endif + return static_cast<To>(f); +} + +// Downcasts the pointer of type Base to Derived. +// Derived must be a subclass of Base. The parameter MUST +// point to a class of type Derived, not any subclass of it. +// When RTTI is available, the function performs a runtime +// check to enforce this. 
+template <class Derived, class Base> +Derived* CheckedDowncastToActualType(Base* base) { +#if GTEST_HAS_RTTI + GTEST_CHECK_(typeid(*base) == typeid(Derived)); + return dynamic_cast<Derived*>(base); // NOLINT +#else + return static_cast<Derived*>(base); // Poor man's downcast. +#endif +} + +#if GTEST_HAS_STREAM_REDIRECTION + +// Defines the stderr capturer: +// CaptureStdout - starts capturing stdout. +// GetCapturedStdout - stops capturing stdout and returns the captured string. +// CaptureStderr - starts capturing stderr. +// GetCapturedStderr - stops capturing stderr and returns the captured string. +// +GTEST_API_ void CaptureStdout(); +GTEST_API_ std::string GetCapturedStdout(); +GTEST_API_ void CaptureStderr(); +GTEST_API_ std::string GetCapturedStderr(); + +#endif // GTEST_HAS_STREAM_REDIRECTION + + +#if GTEST_HAS_DEATH_TEST + +const ::std::vector<testing::internal::string>& GetInjectableArgvs(); +void SetInjectableArgvs(const ::std::vector<testing::internal::string>* + new_argvs); + +// A copy of all command line arguments. Set by InitGoogleTest(). +extern ::std::vector<testing::internal::string> g_argvs; + +#endif // GTEST_HAS_DEATH_TEST + +// Defines synchronization primitives. + +#if GTEST_HAS_PTHREAD + +// Sleeps for (roughly) n milli-seconds. This function is only for +// testing Google Test's own constructs. Don't use it in user tests, +// either directly or indirectly. +inline void SleepMilliseconds(int n) { + const timespec time = { + 0, // 0 seconds. + n * 1000L * 1000L, // And n ms. + }; + nanosleep(&time, NULL); +} + +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. 
+class Notification { + public: + Notification() : notified_(false) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + } + ~Notification() { + pthread_mutex_destroy(&mutex_); + } + + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + pthread_mutex_lock(&mutex_); + notified_ = true; + pthread_mutex_unlock(&mutex_); + } + + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + for (;;) { + pthread_mutex_lock(&mutex_); + const bool notified = notified_; + pthread_mutex_unlock(&mutex_); + if (notified) + break; + SleepMilliseconds(10); + } + } + + private: + pthread_mutex_t mutex_; + bool notified_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; + +// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. +// Consequently, it cannot select a correct instantiation of ThreadWithParam +// in order to call its Run(). Introducing ThreadWithParamBase as a +// non-templated base class for ThreadWithParam allows us to bypass this +// problem. +class ThreadWithParamBase { + public: + virtual ~ThreadWithParamBase() {} + virtual void Run() = 0; +}; + +// pthread_create() accepts a pointer to a function type with the C linkage. +// According to the Standard (7.5/1), function types with different linkages +// are different even if they are otherwise identical. Some compilers (for +// example, SunStudio) treat them as different types. Since class methods +// cannot be defined with C-linkage we need to define a free C-function to +// pass into pthread_create(). +extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { + static_cast<ThreadWithParamBase*>(thread)->Run(); + return NULL; +} + +// Helper class for testing Google Test's multi-threading constructs. +// To use it, write: +// +// void ThreadFunc(int param) { /* Do things with param */ } +// Notification thread_can_start; +// ... 
+// // The thread_can_start parameter is optional; you can supply NULL. +// ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start); +// thread_can_start.Notify(); +// +// These classes are only for testing Google Test's own constructs. Do +// not use them in user tests, either directly or indirectly. +template <typename T> +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void (*UserThreadFunc)(T); + + ThreadWithParam( + UserThreadFunc func, T param, Notification* thread_can_start) + : func_(func), + param_(param), + thread_can_start_(thread_can_start), + finished_(false) { + ThreadWithParamBase* const base = this; + // The thread can be created only after all fields except thread_ + // have been initialized. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base)); + } + ~ThreadWithParam() { Join(); } + + void Join() { + if (!finished_) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0)); + finished_ = true; + } + } + + virtual void Run() { + if (thread_can_start_ != NULL) + thread_can_start_->WaitForNotification(); + func_(param_); + } + + private: + const UserThreadFunc func_; // User-supplied thread function. + const T param_; // User-supplied parameter to the thread function. + // When non-NULL, used to block execution until the controller thread + // notifies. + Notification* const thread_can_start_; + bool finished_; // true iff we know that the thread function has finished. + pthread_t thread_; // The native thread object. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; + +// MutexBase and Mutex implement mutex on pthreads-based platforms. They +// are used in conjunction with class MutexLock: +// +// Mutex mutex; +// ... +// MutexLock lock(&mutex); // Acquires the mutex and releases it at the end +// // of the current scope. +// +// MutexBase implements behavior for both statically and dynamically +// allocated mutexes. Do not use MutexBase directly. 
Instead, write +// the following to define a static mutex: +// +// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); +// +// You can forward declare a static mutex like this: +// +// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); +// +// To create a dynamic mutex, just define an object of type Mutex. +class MutexBase { + public: + // Acquires this mutex. + void Lock() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); + owner_ = pthread_self(); + has_owner_ = true; + } + + // Releases this mutex. + void Unlock() { + // Since the lock is being released the owner_ field should no longer be + // considered valid. We don't protect writing to has_owner_ here, as it's + // the caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + has_owner_ = false; + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); + } + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld() const { + GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) + << "The current thread is not holding the mutex @" << this; + } + + // A static mutex may be used before main() is entered. It may even + // be used before the dynamic initialization stage. Therefore we + // must be able to initialize a static mutex object at link time. + // This means MutexBase has to be a POD and its member variables + // have to be public. + public: + pthread_mutex_t mutex_; // The underlying pthread mutex. + // has_owner_ indicates whether the owner_ field below contains a valid thread + // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All + // accesses to the owner_ field should be protected by a check of this field. + // An alternative might be to memset() owner_ to all zeros, but there's no + // guarantee that a zero'd pthread_t is necessarily invalid or even different + // from pthread_self(). + bool has_owner_; + pthread_t owner_; // The thread holding the mutex. 
+}; + +// Forward-declares a static mutex. +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex + +// Defines and statically (i.e. at link time) initializes a static mutex. +// The initialization list here does not explicitly initialize each field, +// instead relying on default initialization for the unspecified fields. In +// particular, the owner_ field (a pthread_t) is not explicitly initialized. +// This allows initialization to work whether pthread_t is a scalar or struct. +// The flag -Wmissing-field-initializers must not be specified for this to work. +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false } + +// The Mutex class can only be used for mutexes created at runtime. It +// shares its API with MutexBase otherwise. +class Mutex : public MutexBase { + public: + Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + has_owner_ = false; + } + ~Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +// We cannot name this class MutexLock as the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(MutexBase* mutex) + : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + MutexBase* const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Helpers for ThreadLocal. + +// pthread_key_create() requires DeleteThreadLocalValue() to have +// C-linkage. Therefore it cannot be templatized to access +// ThreadLocal<T>. Hence the need for class +// ThreadLocalValueHolderBase. 
+class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Called by pthread to delete thread-local data stored by +// pthread_setspecific(). +extern "C" inline void DeleteThreadLocalValue(void* value_holder) { + delete static_cast<ThreadLocalValueHolderBase*>(value_holder); +} + +// Implements thread-local storage on pthreads-based systems. +// +// // Thread 1 +// ThreadLocal<int> tl(100); // 100 is the default value for each thread. +// +// // Thread 2 +// tl.set(150); // Changes the value for thread 2 only. +// EXPECT_EQ(150, tl.get()); +// +// // Thread 1 +// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. +// tl.set(200); +// EXPECT_EQ(200, tl.get()); +// +// The template type argument T must have a public copy constructor. +// In addition, the default ThreadLocal constructor requires T to have +// a public default constructor. +// +// An object managed for a thread by a ThreadLocal instance is deleted +// when the thread exits. Or, if the ThreadLocal instance dies in +// that thread, when the ThreadLocal dies. It's the user's +// responsibility to ensure that all other threads using a ThreadLocal +// have exited when it dies, or the per-thread objects for those +// threads will not be deleted. +// +// Google Test only uses global ThreadLocal objects. That means they +// will die after main() has returned. Therefore, no per-thread +// object managed by Google Test will be leaked as long as all threads +// using Google Test have exited when main() returns. +template <typename T> +class ThreadLocal { + public: + ThreadLocal() : key_(CreateKey()), + default_() {} + explicit ThreadLocal(const T& value) : key_(CreateKey()), + default_(value) {} + + ~ThreadLocal() { + // Destroys the managed object for the current thread, if any. + DeleteThreadLocalValue(pthread_getspecific(key_)); + + // Releases resources associated with the key. This will *not* + // delete managed objects for other threads. 
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); + } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of type T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + static pthread_key_t CreateKey() { + pthread_key_t key; + // When a thread exits, DeleteThreadLocalValue() will be called on + // the object managed for that thread. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_key_create(&key, &DeleteThreadLocalValue)); + return key; + } + + T* GetOrCreateValue() const { + ThreadLocalValueHolderBase* const holder = + static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_)); + if (holder != NULL) { + return CheckedDowncastToActualType<ValueHolder>(holder)->pointer(); + } + + ValueHolder* const new_holder = new ValueHolder(default_); + ThreadLocalValueHolderBase* const holder_base = new_holder; + GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); + return new_holder->pointer(); + } + + // A key pthreads uses for looking up per-thread values. + const pthread_key_t key_; + const T default_; // The default value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +# define GTEST_IS_THREADSAFE 1 + +#else // GTEST_HAS_PTHREAD + +// A dummy implementation of synchronization primitives (mutex, lock, +// and thread-local variable). Necessary for compiling Google Test where +// mutex is not supported - using Google Test in multiple threads is not +// supported on such platforms. 
+ +class Mutex { + public: + Mutex() {} + void Lock() {} + void Unlock() {} + void AssertHeld() const {} +}; + +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex + +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex*) {} // NOLINT +}; + +typedef GTestMutexLock MutexLock; + +template <typename T> +class ThreadLocal { + public: + ThreadLocal() : value_() {} + explicit ThreadLocal(const T& value) : value_(value) {} + T* pointer() { return &value_; } + const T* pointer() const { return &value_; } + const T& get() const { return value_; } + void set(const T& value) { value_ = value; } + private: + T value_; +}; + +// The above synchronization primitives have dummy implementations. +// Therefore Google Test is not thread-safe. +# define GTEST_IS_THREADSAFE 0 + +#endif // GTEST_HAS_PTHREAD + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +GTEST_API_ size_t GetThreadCount(); + +// Passing non-POD classes through ellipsis (...) crashes the ARM +// compiler and generates a warning in Sun Studio. The Nokia Symbian +// and the IBM XL C/C++ compiler try to instantiate a copy constructor +// for objects passed through ellipsis (...), failing for uncopyable +// objects. We define this to ensure that only POD is passed through +// ellipsis on these systems. +#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) +// We lose support for NULL detection where the compiler doesn't like +// passing non-POD classes through ellipsis (...). +# define GTEST_ELLIPSIS_NEEDS_POD_ 1 +#else +# define GTEST_CAN_COMPARE_NULL 1 +#endif + +// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between +// const T& and const T* in a function template. These compilers +// _can_ decide between class template specializations for T and T*, +// so a tr1::type_traits-like is_pointer works. 
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__) +# define GTEST_NEEDS_IS_POINTER_ 1 +#endif + +template <bool bool_value> +struct bool_constant { + typedef bool_constant<bool_value> type; + static const bool value = bool_value; +}; +template <bool bool_value> const bool bool_constant<bool_value>::value; + +typedef bool_constant<false> false_type; +typedef bool_constant<true> true_type; + +template <typename T> +struct is_pointer : public false_type {}; + +template <typename T> +struct is_pointer<T*> : public true_type {}; + +template <typename Iterator> +struct IteratorTraits { + typedef typename Iterator::value_type value_type; +}; + +template <typename T> +struct IteratorTraits<T*> { + typedef T value_type; +}; + +template <typename T> +struct IteratorTraits<const T*> { + typedef T value_type; +}; + +#if GTEST_OS_WINDOWS +# define GTEST_PATH_SEP_ "\\" +# define GTEST_HAS_ALT_PATH_SEP_ 1 +// The biggest signed integer type the compiler supports. +typedef __int64 BiggestInt; +#else +# define GTEST_PATH_SEP_ "/" +# define GTEST_HAS_ALT_PATH_SEP_ 0 +typedef long long BiggestInt; // NOLINT +#endif // GTEST_OS_WINDOWS + +// Utilities for char. + +// isspace(int ch) and friends accept an unsigned char or EOF. char +// may be signed, depending on the compiler (or compiler flags). +// Therefore we need to cast a char to unsigned char before calling +// isspace(), etc. 
+ +inline bool IsAlpha(char ch) { + return isalpha(static_cast<unsigned char>(ch)) != 0; +} +inline bool IsAlNum(char ch) { + return isalnum(static_cast<unsigned char>(ch)) != 0; +} +inline bool IsDigit(char ch) { + return isdigit(static_cast<unsigned char>(ch)) != 0; +} +inline bool IsLower(char ch) { + return islower(static_cast<unsigned char>(ch)) != 0; +} +inline bool IsSpace(char ch) { + return isspace(static_cast<unsigned char>(ch)) != 0; +} +inline bool IsUpper(char ch) { + return isupper(static_cast<unsigned char>(ch)) != 0; +} +inline bool IsXDigit(char ch) { + return isxdigit(static_cast<unsigned char>(ch)) != 0; +} +inline bool IsXDigit(wchar_t ch) { + const unsigned char low_byte = static_cast<unsigned char>(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} + +inline char ToLower(char ch) { + return static_cast<char>(tolower(static_cast<unsigned char>(ch))); +} +inline char ToUpper(char ch) { + return static_cast<char>(toupper(static_cast<unsigned char>(ch))); +} + +// The testing::internal::posix namespace holds wrappers for common +// POSIX functions. These wrappers hide the differences between +// Windows/MSVC and POSIX systems. Since some compilers define these +// standard functions as macros, the wrapper cannot have the same name +// as the wrapped function. + +namespace posix { + +// Functions with a different name on Windows. 
+ +#if GTEST_OS_WINDOWS + +typedef struct _stat StatStruct; + +# ifdef __BORLANDC__ +inline int IsATTY(int fd) { return isatty(fd); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +# else // !__BORLANDC__ +# if GTEST_OS_WINDOWS_MOBILE +inline int IsATTY(int /* fd */) { return 0; } +# else +inline int IsATTY(int fd) { return _isatty(fd); } +# endif // GTEST_OS_WINDOWS_MOBILE +inline int StrCaseCmp(const char* s1, const char* s2) { + return _stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return _strdup(src); } +# endif // __BORLANDC__ + +# if GTEST_OS_WINDOWS_MOBILE +inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); } +// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this +// time and thus not defined there. +# else +inline int FileNo(FILE* file) { return _fileno(file); } +inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } +inline int RmDir(const char* dir) { return _rmdir(dir); } +inline bool IsDir(const StatStruct& st) { + return (_S_IFDIR & st.st_mode) != 0; +} +# endif // GTEST_OS_WINDOWS_MOBILE + +#else + +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + +#endif // GTEST_OS_WINDOWS + +// Functions deprecated by MSVC 8.0. + +#ifdef _MSC_VER +// Temporarily disable warning 4996 (deprecated function). 
+# pragma warning(push) +# pragma warning(disable:4996) +#endif + +inline const char* StrNCpy(char* dest, const char* src, size_t n) { + return strncpy(dest, src, n); +} + +// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and +// StrError() aren't needed on Windows CE at this time and thus not +// defined there. + +#if !GTEST_OS_WINDOWS_MOBILE +inline int ChDir(const char* dir) { return chdir(dir); } +#endif +inline FILE* FOpen(const char* path, const char* mode) { + return fopen(path, mode); +} +#if !GTEST_OS_WINDOWS_MOBILE +inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { + return freopen(path, mode, stream); +} +inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } +#endif +inline int FClose(FILE* fp) { return fclose(fp); } +#if !GTEST_OS_WINDOWS_MOBILE +inline int Read(int fd, void* buf, unsigned int count) { + return static_cast<int>(read(fd, buf, count)); +} +inline int Write(int fd, const void* buf, unsigned int count) { + return static_cast<int>(write(fd, buf, count)); +} +inline int Close(int fd) { return close(fd); } +inline const char* StrError(int errnum) { return strerror(errnum); } +#endif +inline const char* GetEnv(const char* name) { +#if GTEST_OS_WINDOWS_MOBILE + // We are on Windows CE, which has no environment variables. + return NULL; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char* const env = getenv(name); + return (env != NULL && env[0] != '\0') ? env : NULL; +#else + return getenv(name); +#endif +} + +#ifdef _MSC_VER +# pragma warning(pop) // Restores the warning state. +#endif + +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE has no C library. The abort() function is used in +// several places in Google Test. This implementation provides a reasonable +// imitation of standard behaviour. 
+void Abort(); +#else +inline void Abort() { abort(); } +#endif // GTEST_OS_WINDOWS_MOBILE + +} // namespace posix + +// MSVC "deprecates" snprintf and issues warnings wherever it is used. In +// order to avoid these warnings, we need to use _snprintf or _snprintf_s on +// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate +// function in order to achieve that. We use macro definition here because +// snprintf is a variadic function. +#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE +// MSVC 2005 and above support variadic macros. +# define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#elif defined(_MSC_VER) +// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't +// complain about _snprintf. +# define GTEST_SNPRINTF_ _snprintf +#else +# define GTEST_SNPRINTF_ snprintf +#endif + +// The maximum number a BiggestInt can represent. This definition +// works no matter BiggestInt is represented in one's complement or +// two's complement. +// +// We cannot rely on numeric_limits in STL, as __int64 and long long +// are not part of standard C++ and numeric_limits doesn't need to be +// defined for them. +const BiggestInt kMaxBiggestInt = + ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1)); + +// This template class serves as a compile-time function from size to +// type. It maps a size in bytes to a primitive type with that +// size. e.g. +// +// TypeWithSize<4>::UInt +// +// is typedef-ed to be unsigned int (unsigned integer made up of 4 +// bytes). +// +// Such functionality should belong to STL, but I cannot find it +// there. +// +// Google Test uses this class in the implementation of floating-point +// comparison. +// +// For now it only handles UInt (unsigned int) as that's all Google Test +// needs. Other types can be easily added in the future if need +// arises. 
+template <size_t size> +class TypeWithSize { + public: + // This prevents the user from using TypeWithSize<N> with incorrect + // values of N. + typedef void UInt; +}; + +// The specialization for size 4. +template <> +class TypeWithSize<4> { + public: + // unsigned int has size 4 in both gcc and MSVC. + // + // As base/basictypes.h doesn't compile on Windows, we cannot use + // uint32, uint64, and etc here. + typedef int Int; + typedef unsigned int UInt; +}; + +// The specialization for size 8. +template <> +class TypeWithSize<8> { + public: +#if GTEST_OS_WINDOWS + typedef __int64 Int; + typedef unsigned __int64 UInt; +#else + typedef long long Int; // NOLINT + typedef unsigned long long UInt; // NOLINT +#endif // GTEST_OS_WINDOWS +}; + +// Integer types of known sizes. +typedef TypeWithSize<4>::Int Int32; +typedef TypeWithSize<4>::UInt UInt32; +typedef TypeWithSize<8>::Int Int64; +typedef TypeWithSize<8>::UInt UInt64; +typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. + +// Utilities for command line flags and environment variables. + +// Macro for referencing flags. +#define GTEST_FLAG(name) FLAGS_gtest_##name + +// Macros for declaring flags. +#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) +#define GTEST_DECLARE_int32_(name) \ + GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) +#define GTEST_DECLARE_string_(name) \ + GTEST_API_ extern ::std::string GTEST_FLAG(name) + +// Macros for defining flags. 
+#define GTEST_DEFINE_bool_(name, default_val, doc) \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) + +// Thread annotations +#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +#define GTEST_LOCK_EXCLUDED_(locks) + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +// TODO(chandlerc): Find a better way to refactor flag and environment parsing +// out of both gtest-port.cc and gtest.cc to avoid exporting this utility +// function. +bool ParseInt32(const Message& src_text, const char* str, Int32* value); + +// Parses a bool/Int32/string from the environment variable +// corresponding to the given Google Test flag. +bool BoolFromGTestEnv(const char* flag, bool default_val); +GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); +const char* StringFromGTestEnv(const char* flag, const char* default_val); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +#if GTEST_OS_LINUX +# include <stdlib.h> +# include <sys/types.h> +# include <sys/wait.h> +# include <unistd.h> +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +# include <stdexcept> +#endif + +#include <ctype.h> +#include <float.h> +#include <string.h> +#include <iomanip> +#include <limits> +#include <set> + +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the Message class. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! + +#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ + +#include <limits> + + +// Ensures that there is at least one operator<< in the global namespace. 
+// See Message& operator<<(...) below for why. +void operator<<(const testing::internal::Secret&, int); + +namespace testing { + +// The Message class works like an ostream repeater. +// +// Typical usage: +// +// 1. You stream a bunch of values to a Message object. +// It will remember the text in a stringstream. +// 2. Then you stream the Message object to an ostream. +// This causes the text in the Message to be streamed +// to the ostream. +// +// For example; +// +// testing::Message foo; +// foo << 1 << " != " << 2; +// std::cout << foo; +// +// will print "1 != 2". +// +// Message is not intended to be inherited from. In particular, its +// destructor is not virtual. +// +// Note that stringstream behaves differently in gcc and in MSVC. You +// can stream a NULL char pointer to it in the former, but not in the +// latter (it causes an access violation if you do). The Message +// class hides this difference by treating a NULL char pointer as +// "(null)". +class GTEST_API_ Message { + private: + // The type of basic IO manipulators (endl, ends, and flush) for + // narrow streams. + typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); + + public: + // Constructs an empty Message. + Message(); + + // Copy constructor. + Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT + *ss_ << msg.GetString(); + } + + // Constructs a Message from a C-string. + explicit Message(const char* str) : ss_(new ::std::stringstream) { + *ss_ << str; + } + +#if GTEST_OS_SYMBIAN + // Streams a value (either a pointer or not) to this object. + template <typename T> + inline Message& operator <<(const T& value) { + StreamHelper(typename internal::is_pointer<T>::type(), value); + return *this; + } +#else + // Streams a non-pointer value to this object. + template <typename T> + inline Message& operator <<(const T& val) { + // Some libraries overload << for STL containers. These + // overloads are defined in the global namespace instead of ::std. 
+ // + // C++'s symbol lookup rule (i.e. Koenig lookup) says that these + // overloads are visible in either the std namespace or the global + // namespace, but not other namespaces, including the testing + // namespace which Google Test's Message class is in. + // + // To allow STL containers (and other types that has a << operator + // defined in the global namespace) to be used in Google Test + // assertions, testing::Message must access the custom << operator + // from the global namespace. With this using declaration, + // overloads of << defined in the global namespace and those + // visible via Koenig lookup are both exposed in this function. + using ::operator <<; + *ss_ << val; + return *this; + } + + // Streams a pointer value to this object. + // + // This function is an overload of the previous one. When you + // stream a pointer to a Message, this definition will be used as it + // is more specialized. (The C++ Standard, section + // [temp.func.order].) If you stream a non-pointer, then the + // previous definition will be used. + // + // The reason for this overload is that streaming a NULL pointer to + // ostream is undefined behavior. Depending on the compiler, you + // may get "0", "(nil)", "(null)", or an access violation. To + // ensure consistent result across compilers, we always treat NULL + // as "(null)". + template <typename T> + inline Message& operator <<(T* const& pointer) { // NOLINT + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + return *this; + } +#endif // GTEST_OS_SYMBIAN + + // Since the basic IO manipulators are overloaded for both narrow + // and wide streams, we have to provide this specialized definition + // of operator <<, even though its body is the same as the + // templatized version above. Without this definition, streaming + // endl or other basic IO manipulators to Message will confuse the + // compiler. 
+ Message& operator <<(BasicNarrowIoManip val) { + *ss_ << val; + return *this; + } + + // Instead of 1/0, we want to see true/false for bool values. + Message& operator <<(bool b) { + return *this << (b ? "true" : "false"); + } + + // These two overloads allow streaming a wide C string to a Message + // using the UTF-8 encoding. + Message& operator <<(const wchar_t* wide_c_str); + Message& operator <<(wchar_t* wide_c_str); + +#if GTEST_HAS_STD_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::std::wstring& wstr); +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::wstring& wstr); +#endif // GTEST_HAS_GLOBAL_WSTRING + + // Gets the text streamed to this object so far as an std::string. + // Each '\0' character in the buffer is replaced with "\\0". + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + std::string GetString() const; + + private: + +#if GTEST_OS_SYMBIAN + // These are needed as the Nokia Symbian Compiler cannot decide between + // const T& and const T* in a function template. The Nokia compiler _can_ + // decide between class template specializations for T and T*, so a + // tr1::type_traits-like is_pointer works, and we can overload on that. + template <typename T> + inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) { + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + } + template <typename T> + inline void StreamHelper(internal::false_type /*is_pointer*/, + const T& value) { + // See the comments in Message& operator <<(const T&) above for why + // we need this using statement. 
+ using ::operator <<; + *ss_ << value; + } +#endif // GTEST_OS_SYMBIAN + + // We'll hold the text streamed to this object here. + const internal::scoped_ptr< ::std::stringstream> ss_; + + // We declare (but don't implement) this to prevent the compiler + // from implementing the assignment operator. + void operator=(const Message&); +}; + +// Streams a Message to an ostream. +inline std::ostream& operator <<(std::ostream& os, const Message& sb) { + return os << sb.GetString(); +} + +namespace internal { + +// Converts a streamable value to an std::string. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". +template <typename T> +std::string StreamableToString(const T& streamable) { + return (Message() << streamable).GetString(); +} + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file declares the String class and functions used internally by +// Google Test. They are subject to change without notice. They should not used +// by code external to Google Test. +// +// This header file is #included by <gtest/internal/gtest-internal.h>. +// It should not be #included by other files. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +#ifdef __BORLANDC__ +// string.h is not guaranteed to provide strcpy on C++ Builder. +# include <mem.h> +#endif + +#include <string.h> +#include <string> + + +namespace testing { +namespace internal { + +// String - an abstract class holding static string utilities. +class GTEST_API_ String { + public: + // Static utility methods + + // Clones a 0-terminated C string, allocating memory using new. The + // caller is responsible for deleting the return value using + // delete[]. Returns the cloned string, or NULL if the input is + // NULL. 
+ // + // This is different from strdup() in string.h, which allocates + // memory using malloc(). + static const char* CloneCString(const char* c_str); + +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be + // able to pass strings to Win32 APIs on CE we need to convert them + // to 'Unicode', UTF-16. + + // Creates a UTF-16 wide string from the given ANSI string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the wide string, or NULL if the + // input is NULL. + // + // The wide string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static LPCWSTR AnsiToUtf16(const char* c_str); + + // Creates an ANSI string from the given wide string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the ANSI string, or NULL if the + // input is NULL. + // + // The returned string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static const char* Utf16ToAnsi(LPCWSTR utf16_str); +#endif + + // Compares two C strings. Returns true iff they have the same content. + // + // Unlike strcmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CStringEquals(const char* lhs, const char* rhs); + + // Converts a wide C string to a String using the UTF-8 encoding. + // NULL will be converted to "(null)". If an error occurred during + // the conversion, "(failed to convert from wide string)" is + // returned. + static std::string ShowWideCString(const wchar_t* wide_c_str); + + // Compares two wide C strings. Returns true iff they have the same + // content. + // + // Unlike wcscmp(), this function can handle NULL argument(s). 
A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); + + // Compares two C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike strcasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CaseInsensitiveCStringEquals(const char* lhs, + const char* rhs); + + // Compares two wide C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. + static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs); + + // Returns true iff the given string ends with the given suffix, ignoring + // case. Any string is considered to end with an empty suffix. + static bool EndsWithCaseInsensitive( + const std::string& str, const std::string& suffix); + + // Formats an int value as "%02d". + static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + + // Formats an int value as "%X". + static std::string FormatHexInt(int value); + + // Formats a byte as "%02X". + static std::string FormatByte(unsigned char value); + + private: + String(); // Not meant to be instantiated. +}; // class String + +// Gets the content of the stringstream's buffer as an std::string. 
Each '\0' +// character in the buffer is replaced with "\\0". +GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: keith.ray@gmail.com (Keith Ray) +// +// Google Test filepath utilities +// +// This header file declares classes and functions used internally by +// Google Test. 
They are subject to change without notice. +// +// This file is #included in <gtest/internal/gtest-internal.h>. +// Do not include this header file separately! + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ + + +namespace testing { +namespace internal { + +// FilePath - a class for file and directory pathname manipulation which +// handles platform-specific conventions (like the pathname separator). +// Used for helper functions for naming files in a directory for xml output. +// Except for Set methods, all methods are const or static, which provides an +// "immutable value object" -- useful for peace of mind. +// A FilePath with a value ending in a path separator ("like/this/") represents +// a directory, otherwise it is assumed to represent a file. In either case, +// it may or may not represent an actual file or directory in the file system. +// Names are NOT checked for syntax correctness -- no checking for illegal +// characters, malformed paths, etc. + +class GTEST_API_ FilePath { + public: + FilePath() : pathname_("") { } + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } + + explicit FilePath(const std::string& pathname) : pathname_(pathname) { + Normalize(); + } + + FilePath& operator=(const FilePath& rhs) { + Set(rhs); + return *this; + } + + void Set(const FilePath& rhs) { + pathname_ = rhs.pathname_; + } + + const std::string& string() const { return pathname_; } + const char* c_str() const { return pathname_.c_str(); } + + // Returns the current working directory, or "" if unsuccessful. + static FilePath GetCurrentDir(); + + // Given directory = "dir", base_name = "test", number = 0, + // extension = "xml", returns "dir/test.xml". If number is greater + // than zero (e.g., 12), returns "dir/test_12.xml". + // On Windows platform, uses \ as the separator rather than /. 
+ static FilePath MakeFileName(const FilePath& directory, + const FilePath& base_name, + int number, + const char* extension); + + // Given directory = "dir", relative_path = "test.xml", + // returns "dir/test.xml". + // On Windows, uses \ as the separator rather than /. + static FilePath ConcatPaths(const FilePath& directory, + const FilePath& relative_path); + + // Returns a pathname for a file that does not currently exist. The pathname + // will be directory/base_name.extension or + // directory/base_name_<number>.extension if directory/base_name.extension + // already exists. The number will be incremented until a pathname is found + // that does not already exist. + // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. + // There could be a race condition if two or more processes are calling this + // function at the same time -- they could both pick the same filename. + static FilePath GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension); + + // Returns true iff the path is "". + bool IsEmpty() const { return pathname_.empty(); } + + // If input name has a trailing separator character, removes it and returns + // the name, otherwise return the name string unmodified. + // On Windows platform, uses \ as the separator, other platforms use /. + FilePath RemoveTrailingPathSeparator() const; + + // Returns a copy of the FilePath with the directory part removed. + // Example: FilePath("path/to/file").RemoveDirectoryName() returns + // FilePath("file"). If there is no directory part ("just_a_file"), it returns + // the FilePath unmodified. If there is no file part ("just_a_dir/") it + // returns an empty FilePath (""). + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveDirectoryName() const; + + // RemoveFileName returns the directory path with the filename removed. + // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". 
+ // If the FilePath is "a_file" or "/a_file", RemoveFileName returns + // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does + // not have a file, like "just/a/dir/", it returns the FilePath unmodified. + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveFileName() const; + + // Returns a copy of the FilePath with the case-insensitive extension removed. + // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns + // FilePath("dir/file"). If a case-insensitive extension is not + // found, returns a copy of the original FilePath. + FilePath RemoveExtension(const char* extension) const; + + // Creates directories so that path exists. Returns true if successful or if + // the directories already exist; returns false if unable to create + // directories for any reason. Will also return false if the FilePath does + // not represent a directory (that is, it doesn't end with a path separator). + bool CreateDirectoriesRecursively() const; + + // Create the directory so that path exists. Returns true if successful or + // if the directory already exists; returns false if unable to create the + // directory for any reason, including if the parent directory does not + // exist. Not named "CreateDirectory" because that's a macro on Windows. + bool CreateFolder() const; + + // Returns true if FilePath describes something in the file-system, + // either a file, directory, or whatever, and that something exists. + bool FileOrDirectoryExists() const; + + // Returns true if pathname describes a directory in the file-system + // that exists. + bool DirectoryExists() const; + + // Returns true if FilePath ends with a path separator, which indicates that + // it is intended to represent a directory. Returns false otherwise. + // This does NOT check that a directory (or file) actually exists. + bool IsDirectory() const; + + // Returns true if pathname describes a root directory. 
(Windows has one + // root directory per disk drive.) + bool IsRootDirectory() const; + + // Returns true if pathname describes an absolute path. + bool IsAbsolutePath() const; + + private: + // Replaces multiple consecutive separators with a single separator. + // For example, "bar///foo" becomes "bar/foo". Does not eliminate other + // redundancies that might be in a pathname involving "." or "..". + // + // A pathname with multiple consecutive separators may occur either through + // user error or as a result of some scripts or APIs that generate a pathname + // with a trailing separator. On other platforms the same API or script + // may NOT generate a pathname with a trailing "/". Then elsewhere that + // pathname may have another "/" and pathname components added to it, + // without checking for the separator already being there. + // The script language and operating system may allow paths like "foo//bar" + // but some of the functions in FilePath will not handle that correctly. In + // particular, RemoveTrailingPathSeparator() only removes one separator, and + // it is called in CreateDirectoriesRecursively() assuming that it will change + // a pathname from directory syntax (trailing separator) to filename syntax. + // + // On Windows this method also replaces the alternate path separator '/' with + // the primary path separator '\\', so that for example "bar\\/\\foo" becomes + // "bar\\foo". + + void Normalize(); + + // Returns a pointer to the last occurrence of a valid path separator in + // the FilePath. On Windows, for example, both '/' and '\' are valid path + // separators. Returns NULL if no path separator was found. + const char* FindLastPathSeparator() const; + + std::string pathname_; +}; // class FilePath + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +// This file was GENERATED by command: +// pump.py gtest-type-util.h.pump +// DO NOT EDIT BY HAND!!! 
+ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Type utilities needed for implementing typed and type-parameterized +// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently we support at most 50 types in a list, and at most 50 +// type-parameterized tests in one type-parameterized test case. +// Please contact googletestframework@googlegroups.com if you need +// more. 
+ +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +# if GTEST_HAS_CXXABI_H_ +# include <cxxabi.h> +# elif defined(__HP_aCC) +# include <acxx_demangle.h> +# endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// GetTypeName<T>() returns a human-readable name of type T. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. +template <typename T> +std::string GetTypeName() { +# if GTEST_HAS_RTTI + + const char* const name = typeid(T).name(); +# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) + int status = 0; + // gcc's implementation of typeid(T).name() mangles the type name, + // so we have to demangle it. +# if GTEST_HAS_CXXABI_H_ + using abi::__cxa_demangle; +# endif // GTEST_HAS_CXXABI_H_ + char* const readable_name = __cxa_demangle(name, 0, 0, &status); + const std::string name_str(status == 0 ? readable_name : name); + free(readable_name); + return name_str; +# else + return name; +# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC + +# else + + return "<type>"; + +# endif // GTEST_HAS_RTTI +} + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// AssertyTypeEq<T1, T2>::type is defined iff T1 and T2 are the same +// type. This can be used as a compile-time assertion to ensure that +// two types are equal. + +template <typename T1, typename T2> +struct AssertTypeEq; + +template <typename T> +struct AssertTypeEq<T, T> { + typedef bool type; +}; + +// A unique type used as the default value for the arguments of class +// template Types. This allows us to simulate variadic templates +// (e.g. Types<int>, Type<int, double>, and etc), which C++ doesn't +// support directly. +struct None {}; + +// The following family of struct and struct templates are used to +// represent type lists. 
In particular, TypesN<T1, T2, ..., TN> +// represents a type list with N types (T1, T2, ..., and TN) in it. +// Except for Types0, every struct in the family has two member types: +// Head for the first type in the list, and Tail for the rest of the +// list. + +// The empty type list. +struct Types0 {}; + +// Type lists of length 1, 2, 3, and so on. + +template <typename T1> +struct Types1 { + typedef T1 Head; + typedef Types0 Tail; +}; +template <typename T1, typename T2> +struct Types2 { + typedef T1 Head; + typedef Types1<T2> Tail; +}; + +template <typename T1, typename T2, typename T3> +struct Types3 { + typedef T1 Head; + typedef Types2<T2, T3> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4> +struct Types4 { + typedef T1 Head; + typedef Types3<T2, T3, T4> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5> +struct Types5 { + typedef T1 Head; + typedef Types4<T2, T3, T4, T5> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6> +struct Types6 { + typedef T1 Head; + typedef Types5<T2, T3, T4, T5, T6> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7> +struct Types7 { + typedef T1 Head; + typedef Types6<T2, T3, T4, T5, T6, T7> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8> +struct Types8 { + typedef T1 Head; + typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9> +struct Types9 { + typedef T1 Head; + typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10> +struct Types10 { + typedef T1 Head; + typedef Types9<T2, T3, T4, T5, T6, T7, T8, 
T9, T10> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11> +struct Types11 { + typedef T1 Head; + typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12> +struct Types12 { + typedef T1 Head; + typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13> +struct Types13 { + typedef T1 Head; + typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14> +struct Types14 { + typedef T1 Head; + typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15> +struct Types15 { + typedef T1 Head; + typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16> +struct Types16 { + typedef T1 Head; + typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename 
T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17> +struct Types17 { + typedef T1 Head; + typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18> +struct Types18 { + typedef T1 Head; + typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19> +struct Types19 { + typedef T1 Head; + typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20> +struct Types20 { + typedef T1 Head; + typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21> +struct Types21 { + typedef T1 Head; + typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, 
T13, T14, T15, + T16, T17, T18, T19, T20, T21> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22> +struct Types22 { + typedef T1 Head; + typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23> +struct Types23 { + typedef T1 Head; + typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24> +struct Types24 { + typedef T1 Head; + typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25> +struct Types25 { + typedef T1 Head; + typedef Types24<T2, T3, T4, T5, 
T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26> +struct Types26 { + typedef T1 Head; + typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27> +struct Types27 { + typedef T1 Head; + typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28> +struct Types28 { + typedef T1 Head; + typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + 
typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29> +struct Types29 { + typedef T1 Head; + typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30> +struct Types30 { + typedef T1 Head; + typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31> +struct Types31 { + typedef T1 Head; + typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + 
typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32> +struct Types32 { + typedef T1 Head; + typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33> +struct Types33 { + typedef T1 Head; + typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34> +struct Types34 { + typedef T1 Head; + typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, 
typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35> +struct Types35 { + typedef T1 Head; + typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36> +struct Types36 { + typedef T1 Head; + typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37> +struct Types37 { + typedef T1 Head; + typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, 
T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38> +struct Types38 { + typedef T1 Head; + typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39> +struct Types39 { + typedef T1 Head; + typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, 
typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40> +struct Types40 { + typedef T1 Head; + typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41> +struct Types41 { + typedef T1 Head; + typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename 
T38, typename T39, typename T40, + typename T41, typename T42> +struct Types42 { + typedef T1 Head; + typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43> +struct Types43 { + typedef T1 Head; + typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, + T43> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44> +struct Types44 { + typedef T1 Head; + typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, 
T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45> +struct Types45 { + typedef T1 Head; + typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46> +struct Types46 { + typedef T1 Head; + typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, 
+ T44, T45, T46> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47> +struct Types47 { + typedef T1 Head; + typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45, T46, T47> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48> +struct Types48 { + typedef T1 Head; + typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45, T46, T47, T48> Tail; +}; + 
+template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48, typename T49> +struct Types49 { + typedef T1 Head; + typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45, T46, T47, T48, T49> Tail; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48, typename T49, typename T50> +struct Types50 { + typedef T1 Head; + typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, 
T45, T46, T47, T48, T49, T50> Tail; +}; + + +} // namespace internal + +// We don't want to require the users to write TypesN<...> directly, +// as that would require them to count the length. Types<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Types<int> +// will appear as Types<int, None, None, ..., None> in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Types<T1, ..., TN>, and Google Test will translate +// that to TypesN<T1, ..., TN> internally to make error messages +// readable. The translation is done by the 'type' member of the +// Types template. +template <typename T1 = internal::None, typename T2 = internal::None, + typename T3 = internal::None, typename T4 = internal::None, + typename T5 = internal::None, typename T6 = internal::None, + typename T7 = internal::None, typename T8 = internal::None, + typename T9 = internal::None, typename T10 = internal::None, + typename T11 = internal::None, typename T12 = internal::None, + typename T13 = internal::None, typename T14 = internal::None, + typename T15 = internal::None, typename T16 = internal::None, + typename T17 = internal::None, typename T18 = internal::None, + typename T19 = internal::None, typename T20 = internal::None, + typename T21 = internal::None, typename T22 = internal::None, + typename T23 = internal::None, typename T24 = internal::None, + typename T25 = internal::None, typename T26 = internal::None, + typename T27 = internal::None, typename T28 = internal::None, + typename T29 = internal::None, typename T30 = internal::None, + typename T31 = internal::None, typename T32 = internal::None, + typename T33 = internal::None, typename T34 = internal::None, + typename T35 = internal::None, typename T36 = internal::None, + typename T37 = internal::None, typename T38 = 
internal::None, + typename T39 = internal::None, typename T40 = internal::None, + typename T41 = internal::None, typename T42 = internal::None, + typename T43 = internal::None, typename T44 = internal::None, + typename T45 = internal::None, typename T46 = internal::None, + typename T47 = internal::None, typename T48 = internal::None, + typename T49 = internal::None, typename T50 = internal::None> +struct Types { + typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type; +}; + +template <> +struct Types<internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types0 type; +}; +template <typename T1> +struct Types<T1, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, 
internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types1<T1> type; +}; +template <typename T1, typename T2> +struct Types<T1, T2, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types2<T1, T2> type; +}; +template <typename T1, typename T2, typename T3> +struct Types<T1, T2, T3, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, 
internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types3<T1, T2, T3> type; +}; +template <typename T1, typename T2, typename T3, typename T4> +struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types4<T1, T2, T3, T4> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5> +struct Types<T1, T2, T3, T4, T5, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, 
internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types5<T1, T2, T3, T4, T5> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6> +struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types6<T1, T2, T3, T4, T5, T6> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7> +struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type; +}; +template <typename T1, typename 
T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10> +struct Types<T1, T2, T3, T4, 
T5, T6, T7, T8, T9, T10, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None, + internal::None, 
internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + internal::None, internal::None, internal::None, internal::None, + 
internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, internal::None, internal::None, internal::None, internal::None, + 
internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, internal::None, internal::None, 
internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, 
T15, + T16, T17, T18, T19, T20, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, 
typename T22> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, 
+ typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, 
typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, 
T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + 
internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, internal::None, 
internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, 
typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, 
typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types36<T1, 
T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, 
T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, 
typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, 
T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, 
T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None, internal::None> { + typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42, T43> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, + internal::None, internal::None, internal::None, internal::None, + internal::None, internal::None> { + typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42, T43, T44> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, 
typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45, + internal::None, internal::None, internal::None, internal::None, + internal::None> { + typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42, T43, T44, T45> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45, + T46, internal::None, internal::None, internal::None, internal::None> { + typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, 
T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42, T43, T44, T45, T46> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45, + T46, T47, internal::None, internal::None, internal::None> { + typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42, T43, T44, T45, T46, T47> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, 
typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45, + T46, T47, T48, internal::None, internal::None> { + typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42, T43, T44, T45, T46, T47, T48> type; +}; +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48, typename T49> +struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, + T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30, + T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45, + T46, T47, T48, T49, internal::None> { + typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42, T43, T44, T45, T46, T47, T48, T49> type; +}; + +namespace internal { + +# define 
GTEST_TEMPLATE_ template <typename T> class + +// The template "selector" struct TemplateSel<Tmpl> is used to +// represent Tmpl, which must be a class template with one type +// parameter, as a type. TemplateSel<Tmpl>::Bind<T>::type is defined +// as the type Tmpl<T>. This allows us to actually instantiate the +// template "selected" by TemplateSel<Tmpl>. +// +// This trick is necessary for simulating typedef for class templates, +// which C++ doesn't support directly. +template <GTEST_TEMPLATE_ Tmpl> +struct TemplateSel { + template <typename T> + struct Bind { + typedef Tmpl<T> type; + }; +}; + +# define GTEST_BIND_(TmplSel, T) \ + TmplSel::template Bind<T>::type + +// A unique struct template used as the default value for the +// arguments of class template Templates. This allows us to simulate +// variadic templates (e.g. Templates<int>, Templates<int, double>, +// and etc), which C++ doesn't support directly. +template <typename T> +struct NoneT {}; + +// The following family of struct and struct templates are used to +// represent template lists. In particular, TemplatesN<T1, T2, ..., +// TN> represents a list of N templates (T1, T2, ..., and TN). Except +// for Templates0, every struct in the family has two member types: +// Head for the selector of the first template in the list, and Tail +// for the rest of the list. + +// The empty template list. +struct Templates0 {}; + +// Template lists of length 1, 2, 3, and so on. 
+ +template <GTEST_TEMPLATE_ T1> +struct Templates1 { + typedef TemplateSel<T1> Head; + typedef Templates0 Tail; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2> +struct Templates2 { + typedef TemplateSel<T1> Head; + typedef Templates1<T2> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3> +struct Templates3 { + typedef TemplateSel<T1> Head; + typedef Templates2<T2, T3> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4> +struct Templates4 { + typedef TemplateSel<T1> Head; + typedef Templates3<T2, T3, T4> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5> +struct Templates5 { + typedef TemplateSel<T1> Head; + typedef Templates4<T2, T3, T4, T5> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6> +struct Templates6 { + typedef TemplateSel<T1> Head; + typedef Templates5<T2, T3, T4, T5, T6> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7> +struct Templates7 { + typedef TemplateSel<T1> Head; + typedef Templates6<T2, T3, T4, T5, T6, T7> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8> +struct Templates8 { + typedef TemplateSel<T1> Head; + typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9> +struct Templates9 { + typedef TemplateSel<T1> Head; + typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, 
GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10> +struct Templates10 { + typedef TemplateSel<T1> Head; + typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11> +struct Templates11 { + typedef TemplateSel<T1> Head; + typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12> +struct Templates12 { + typedef TemplateSel<T1> Head; + typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13> +struct Templates13 { + typedef TemplateSel<T1> Head; + typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14> +struct Templates14 { + typedef TemplateSel<T1> Head; + typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, 
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15> +struct Templates15 { + typedef TemplateSel<T1> Head; + typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16> +struct Templates16 { + typedef TemplateSel<T1> Head; + typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17> +struct Templates17 { + typedef TemplateSel<T1> Head; + typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18> +struct Templates18 { + typedef TemplateSel<T1> Head; + typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18> Tail; +}; + 
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19> +struct Templates19 { + typedef TemplateSel<T1> Head; + typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20> +struct Templates20 { + typedef TemplateSel<T1> Head; + typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21> +struct Templates21 { + typedef TemplateSel<T1> Head; + typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, 
GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22> +struct Templates22 { + typedef TemplateSel<T1> Head; + typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23> +struct Templates23 { + typedef TemplateSel<T1> Head; + typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24> +struct Templates24 { + typedef TemplateSel<T1> Head; + typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + 
GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25> +struct Templates25 { + typedef TemplateSel<T1> Head; + typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26> +struct Templates26 { + typedef TemplateSel<T1> Head; + typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, 
GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27> +struct Templates27 { + typedef TemplateSel<T1> Head; + typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28> +struct Templates28 { + typedef TemplateSel<T1> Head; + typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29> +struct Templates29 { + typedef TemplateSel<T1> Head; + typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, 
GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30> +struct Templates30 { + typedef TemplateSel<T1> Head; + typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31> +struct Templates31 { + typedef TemplateSel<T1> Head; + typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + 
GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32> +struct Templates32 { + typedef TemplateSel<T1> Head; + typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33> +struct Templates33 { + typedef TemplateSel<T1> Head; + typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, 
GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34> +struct Templates34 { + typedef TemplateSel<T1> Head; + typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35> +struct Templates35 { + typedef TemplateSel<T1> Head; + typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, 
GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36> +struct Templates36 { + typedef TemplateSel<T1> Head; + typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37> +struct Templates37 { + typedef TemplateSel<T1> Head; + typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + 
GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38> +struct Templates38 { + typedef TemplateSel<T1> Head; + typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39> +struct Templates39 { + typedef TemplateSel<T1> Head; + typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ 
T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40> +struct Templates40 { + typedef TemplateSel<T1> Head; + typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41> +struct 
Templates41 { + typedef TemplateSel<T1> Head; + typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42> +struct Templates42 { + typedef TemplateSel<T1> Head; + typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + 
GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43> +struct Templates43 { + typedef TemplateSel<T1> Head; + typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, + T43> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44> +struct Templates44 { + typedef TemplateSel<T1> Head; + typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, + T43, T44> Tail; +}; + +template <GTEST_TEMPLATE_ T1, 
GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45> +struct Templates45 { + typedef TemplateSel<T1> Head; + typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, + T43, T44, T45> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, 
GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45, + GTEST_TEMPLATE_ T46> +struct Templates46 { + typedef TemplateSel<T1> Head; + typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, + T43, T44, T45, T46> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45, + GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47> +struct Templates47 { + typedef TemplateSel<T1> Head; + typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, + T43, T44, T45, T46, T47> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, 
GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45, + GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48> +struct Templates48 { + typedef TemplateSel<T1> Head; + typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, + T43, T44, T45, T46, T47, T48> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + 
GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45, + GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48, + GTEST_TEMPLATE_ T49> +struct Templates49 { + typedef TemplateSel<T1> Head; + typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, + T43, T44, T45, T46, T47, T48, T49> Tail; +}; + +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45, + GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48, + GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50> +struct Templates50 { + typedef TemplateSel<T1> Head; + typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, 
T36, T37, T38, T39, T40, T41, T42, + T43, T44, T45, T46, T47, T48, T49, T50> Tail; +}; + + +// We don't want to require the users to write TemplatesN<...> directly, +// as that would require them to count the length. Templates<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Templates<list> +// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Templates<T1, ..., TN>, and Google Test will translate +// that to TemplatesN<T1, ..., TN> internally to make error messages +// readable. The translation is done by the 'type' member of the +// Templates template. +template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT, + GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT, + GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT, + GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT, + GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT, + GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT, + GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT, + GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT, + GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT, + GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT, + GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT, + GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT, + GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT, + GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT, + GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT, + GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT, + GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT, + GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT, + GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT, + 
GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT, + GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT, + GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT, + GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT, + GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT, + GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT> +struct Templates { + typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42, T43, T44, T45, T46, T47, T48, T49, T50> type; +}; + +template <> +struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT> { + typedef Templates0 type; +}; +template <GTEST_TEMPLATE_ T1> +struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT> { + typedef Templates1<T1> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2> +struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT> { + typedef Templates2<T1, T2> type; +}; +template 
<GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3> +struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates3<T1, T2, T3> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4> +struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates4<T1, T2, T3, T4> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5> +struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates5<T1, T2, T3, T4, T5> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6> +struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { 
+ typedef Templates6<T1, T2, T3, T4, T5, T6> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7> +struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, 
GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, 
GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates15<T1, T2, T3, 
T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + 
GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT> { + typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT> { + typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, 
NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT> { + typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT> { + typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT> { + typedef 
Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT> { + typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT> { + typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, 
T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT> { + typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT> { + typedef Templates26<T1, T2, 
T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT> { + typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, 
NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT> { + typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT> { + typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, 
GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, 
GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, 
T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, 
GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, 
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38> 
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + 
GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, 
GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, 
T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42, T43> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, 
GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, + NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42, T43, T44> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ 
T44, GTEST_TEMPLATE_ T45> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, + T45, NoneT, NoneT, NoneT, NoneT, NoneT> { + typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42, T43, T44, T45> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45, + GTEST_TEMPLATE_ T46> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, + T45, T46, NoneT, NoneT, NoneT, NoneT> { + typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, 
T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42, T43, T44, T45, T46> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45, + GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, + T45, T46, T47, NoneT, NoneT, NoneT> { + typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42, T43, T44, T45, T46, T47> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, 
GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45, + GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, + T45, T46, T47, T48, NoneT, NoneT> { + typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42, T43, T44, T45, T46, T47, T48> type; +}; +template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3, + GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6, + GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9, + GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12, + GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15, + GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18, + GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21, + GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24, + GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27, + GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30, + GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33, + GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, 
GTEST_TEMPLATE_ T36, + GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39, + GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42, + GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45, + GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48, + GTEST_TEMPLATE_ T49> +struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, + T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, + T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, + T45, T46, T47, T48, T49, NoneT> { + typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42, T43, T44, T45, T46, T47, T48, T49> type; +}; + +// The TypeList template makes it possible to use either a single type +// or a Types<...> list in TYPED_TEST_CASE() and +// INSTANTIATE_TYPED_TEST_CASE_P(). 
+ +template <typename T> +struct TypeList { + typedef Types1<T> type; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48, typename T49, typename T50> +struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45, T46, T47, T48, T49, T50> > { + typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type; +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +// Due to C++ preprocessor weirdness, we need double indirection to +// concatenate two tokens when one of them is __LINE__. Writing +// +// foo ## __LINE__ +// +// will result in the token foo__LINE__, instead of foo followed by +// the current line number. 
For more details, see +// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 +#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) +#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar + +class ProtocolMessage; +namespace proto2 { class Message; } + +namespace testing { + +// Forward declarations. + +class AssertionResult; // Result of an assertion. +class Message; // Represents a failure message. +class Test; // Represents a test. +class TestInfo; // Information about a test. +class TestPartResult; // Result of a test part. +class UnitTest; // A collection of test cases. + +template <typename T> +::std::string PrintToString(const T& value); + +namespace internal { + +struct TraceInfo; // Information about a trace point. +class ScopedTrace; // Implements scoped trace. +class TestInfoImpl; // Opaque implementation of TestInfo +class UnitTestImpl; // Opaque implementation of UnitTest + +// How many times InitGoogleTest() has been called. +GTEST_API_ extern int g_init_gtest_count; + +// The text used in failure messages to indicate the start of the +// stack trace. +GTEST_API_ extern const char kStackTraceMarker[]; + +// Two overloaded helpers for checking at compile time whether an +// expression is a null pointer literal (i.e. NULL or any 0-valued +// compile-time integral constant). Their return values have +// different sizes, so we can use sizeof() to test which version is +// picked by the compiler. These helpers have no implementations, as +// we only need their signatures. +// +// Given IsNullLiteralHelper(x), the compiler will pick the first +// version if x can be implicitly converted to Secret*, and pick the +// second version otherwise. Since Secret is a secret and incomplete +// type, the only expression a user can write that has type Secret* is +// a null pointer literal. Therefore, we know that x is a null +// pointer literal if and only if the first version is picked by the +// compiler. 
+char IsNullLiteralHelper(Secret* p); +char (&IsNullLiteralHelper(...))[2]; // NOLINT + +// A compile-time bool constant that is true if and only if x is a +// null pointer literal (i.e. NULL or any 0-valued compile-time +// integral constant). +#ifdef GTEST_ELLIPSIS_NEEDS_POD_ +// We lose support for NULL detection where the compiler doesn't like +// passing non-POD classes through ellipsis (...). +# define GTEST_IS_NULL_LITERAL_(x) false +#else +# define GTEST_IS_NULL_LITERAL_(x) \ + (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1) +#endif // GTEST_ELLIPSIS_NEEDS_POD_ + +// Appends the user-supplied message to the Google-Test-generated message. +GTEST_API_ std::string AppendUserMessage( + const std::string& gtest_msg, const Message& user_msg); + +#if GTEST_HAS_EXCEPTIONS + +// This exception is thrown by (and only by) a failed Google Test +// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions +// are enabled). We derive it from std::runtime_error, which is for +// errors presumably detectable only at run time. Since +// std::runtime_error inherits from std::exception, many testing +// frameworks know how to extract and print the message inside it. +class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error { + public: + explicit GoogleTestFailureException(const TestPartResult& failure); +}; + +#endif // GTEST_HAS_EXCEPTIONS + +// A helper class for creating scoped traces in user programs. +class GTEST_API_ ScopedTrace { + public: + // The c'tor pushes the given source file location and message onto + // a trace stack maintained by Google Test. + ScopedTrace(const char* file, int line, const Message& message); + + // The d'tor pops the info pushed by the c'tor. + // + // Note that the d'tor is not virtual in order to be efficient. + // Don't inherit from ScopedTrace! 
+ ~ScopedTrace(); + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); +} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its + // c'tor and d'tor. Therefore it doesn't + // need to be used otherwise. + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// expected_expression: "foo" +// actual_expression: "bar" +// expected_value: "5" +// actual_value: "6" +// +// The ignoring_case parameter is true iff the assertion is a +// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will +// be inserted into the message. +GTEST_API_ AssertionResult EqFailure(const char* expected_expression, + const char* actual_expression, + const std::string& expected_value, + const std::string& actual_value, + bool ignoring_case); + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +GTEST_API_ std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, + const char* expression_text, + const char* actual_predicate_value, + const char* expected_predicate_value); + +// This template class represents an IEEE floating-point number +// (either single-precision or double-precision, depending on the +// template parameters). +// +// The purpose of this class is to do more sophisticated number +// comparison. (Due to round-off error, etc, it's very unlikely that +// two floating-points will be equal exactly. Hence a naive +// comparison by the == operation often doesn't work.) +// +// Format of IEEE floating-point: +// +// The most-significant bit being the leftmost, an IEEE +// floating-point looks like +// +// sign_bit exponent_bits fraction_bits +// +// Here, sign_bit is a single bit that designates the sign of the +// number. 
+// +// For float, there are 8 exponent bits and 23 fraction bits. +// +// For double, there are 11 exponent bits and 52 fraction bits. +// +// More details can be found at +// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +template <typename RawType> +class FloatingPoint { + public: + // Defines the unsigned integer type that has the same size as the + // floating point number. + typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits; + + // Constants. + + // # of bits in a number. + static const size_t kBitCount = 8*sizeof(RawType); + + // # of fraction bits in a number. + static const size_t kFractionBitCount = + std::numeric_limits<RawType>::digits - 1; + + // # of exponent bits in a number. + static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; + + // The mask for the sign bit. + static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1); + + // The mask for the fraction bits. + static const Bits kFractionBitMask = + ~static_cast<Bits>(0) >> (kExponentBitCount + 1); + + // The mask for the exponent bits. + static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); + + // How many ULP's (Units in the Last Place) we want to tolerate when + // comparing two numbers. The larger the value, the more error we + // allow. A 0 value means that two numbers must be exactly the same + // to be considered equal. + // + // The maximum error of a single floating-point operation is 0.5 + // units in the last place. On Intel CPU's, all floating-point + // calculations are done with 80-bit precision, while double has 64 + // bits. Therefore, 4 should be enough for ordinary use. 
+ // + // See the following article for more details on ULP: + // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ + static const size_t kMaxUlps = 4; + + // Constructs a FloatingPoint from a raw floating-point number. + // + // On an Intel CPU, passing a non-normalized NAN (Not a Number) + // around may change its bits, although the new value is guaranteed + // to be also a NAN. Therefore, don't expect this constructor to + // preserve the bits in x when x is a NAN. + explicit FloatingPoint(const RawType& x) { u_.value_ = x; } + + // Static methods + + // Reinterprets a bit pattern as a floating-point number. + // + // This function is needed to test the AlmostEquals() method. + static RawType ReinterpretBits(const Bits bits) { + FloatingPoint fp(0); + fp.u_.bits_ = bits; + return fp.u_.value_; + } + + // Returns the floating-point number that represent positive infinity. + static RawType Infinity() { + return ReinterpretBits(kExponentBitMask); + } + + // Returns the maximum representable finite floating-point number. + static RawType Max(); + + // Non-static methods + + // Returns the bits that represents this number. + const Bits &bits() const { return u_.bits_; } + + // Returns the exponent bits of this number. + Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } + + // Returns the fraction bits of this number. + Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } + + // Returns the sign bit of this number. + Bits sign_bit() const { return kSignBitMask & u_.bits_; } + + // Returns true iff this is NAN (not a number). + bool is_nan() const { + // It's a NAN if the exponent bits are all ones and the fraction + // bits are not entirely zeros. + return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); + } + + // Returns true iff this number is at most kMaxUlps ULP's away from + // rhs. In particular, this function: + // + // - returns false if either number is (or both are) NAN. 
+ // - treats really large numbers as almost equal to infinity. + // - thinks +0.0 and -0.0 are 0 DLP's apart. + bool AlmostEquals(const FloatingPoint& rhs) const { + // The IEEE standard says that any comparison operation involving + // a NAN must return false. + if (is_nan() || rhs.is_nan()) return false; + + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) + <= kMaxUlps; + } + + private: + // The data type used to store the actual floating-point number. + union FloatingPointUnion { + RawType value_; // The raw floating-point number. + Bits bits_; // The bits that represent the number. + }; + + // Converts an integer from the sign-and-magnitude representation to + // the biased representation. More precisely, let N be 2 to the + // power of (kBitCount - 1), an integer x is represented by the + // unsigned number x + N. + // + // For instance, + // + // -N + 1 (the most negative number representable using + // sign-and-magnitude) is represented by 1; + // 0 is represented by N; and + // N - 1 (the biggest number representable using + // sign-and-magnitude) is represented by 2N - 1. + // + // Read http://en.wikipedia.org/wiki/Signed_number_representations + // for more details on signed number representations. + static Bits SignAndMagnitudeToBiased(const Bits &sam) { + if (kSignBitMask & sam) { + // sam represents a negative number. + return ~sam + 1; + } else { + // sam represents a positive number. + return kSignBitMask | sam; + } + } + + // Given two numbers in the sign-and-magnitude representation, + // returns the distance between them as an unsigned number. + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, + const Bits &sam2) { + const Bits biased1 = SignAndMagnitudeToBiased(sam1); + const Bits biased2 = SignAndMagnitudeToBiased(sam2); + return (biased1 >= biased2) ? 
(biased1 - biased2) : (biased2 - biased1); + } + + FloatingPointUnion u_; +}; + +// We cannot use std::numeric_limits<T>::max() as it clashes with the max() +// macro defined by <windows.h>. +template <> +inline float FloatingPoint<float>::Max() { return FLT_MAX; } +template <> +inline double FloatingPoint<double>::Max() { return DBL_MAX; } + +// Typedefs the instances of the FloatingPoint template class that we +// care to use. +typedef FloatingPoint<float> Float; +typedef FloatingPoint<double> Double; + +// In order to catch the mistake of putting tests that use different +// test fixture classes in the same test case, we need to assign +// unique IDs to fixture classes and compare them. The TypeId type is +// used to hold such IDs. The user should treat TypeId as an opaque +// type: the only operation allowed on TypeId values is to compare +// them for equality using the == operator. +typedef const void* TypeId; + +template <typename T> +class TypeIdHelper { + public: + // dummy_ must not have a const type. Otherwise an overly eager + // compiler (e.g. MSVC 7.1 & 8.0) may try to merge + // TypeIdHelper<T>::dummy_ for different Ts as an "optimization". + static bool dummy_; +}; + +template <typename T> +bool TypeIdHelper<T>::dummy_ = false; + +// GetTypeId<T>() returns the ID of type T. Different values will be +// returned for different types. Calling the function twice with the +// same type argument is guaranteed to return the same ID. +template <typename T> +TypeId GetTypeId() { + // The compiler is required to allocate a different + // TypeIdHelper<T>::dummy_ variable for each T used to instantiate + // the template. Therefore, the address of dummy_ is guaranteed to + // be unique. + return &(TypeIdHelper<T>::dummy_); +} + +// Returns the type ID of ::testing::Test. 
Always call this instead +// of GetTypeId< ::testing::Test>() to get the type ID of +// ::testing::Test, as the latter may give the wrong result due to a +// suspected linker bug when compiling Google Test as a Mac OS X +// framework. +GTEST_API_ TypeId GetTestTypeId(); + +// Defines the abstract factory interface that creates instances +// of a Test object. +class TestFactoryBase { + public: + virtual ~TestFactoryBase() {} + + // Creates a test instance to run. The instance is both created and destroyed + // within TestInfoImpl::Run() + virtual Test* CreateTest() = 0; + + protected: + TestFactoryBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); +}; + +// This class provides implementation of TeastFactoryBase interface. +// It is used in TEST and TEST_F macros. +template <class TestClass> +class TestFactoryImpl : public TestFactoryBase { + public: + virtual Test* CreateTest() { return new TestClass; } +}; + +#if GTEST_OS_WINDOWS + +// Predicate-formatters for implementing the HRESULT checking macros +// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} +// We pass a long instead of HRESULT to avoid causing an +// include dependency for the HRESULT type. +GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, + long hr); // NOLINT +GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, + long hr); // NOLINT + +#endif // GTEST_OS_WINDOWS + +// Types of SetUpTestCase() and TearDownTestCase() functions. +typedef void (*SetUpTestCaseFunc)(); +typedef void (*TearDownTestCaseFunc)(); + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_case_name: name of the test case +// name: name of the test +// type_param the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param text representation of the test's value parameter, +// or NULL if this is not a type-parameterized test. 
+// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +GTEST_API_ TestInfo* MakeAndRegisterTestInfo( + const char* test_case_name, + const char* name, + const char* type_param, + const char* value_param, + TypeId fixture_class_id, + SetUpTestCaseFunc set_up_tc, + TearDownTestCaseFunc tear_down_tc, + TestFactoryBase* factory); + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// State of the definition of a type-parameterized test case. +class GTEST_API_ TypedTestCasePState { + public: + TypedTestCasePState() : registered_(false) {} + + // Adds the given test name to defined_test_names_ and return true + // if the test case hasn't been registered; otherwise aborts the + // program. + bool AddTestName(const char* file, int line, const char* case_name, + const char* test_name) { + if (registered_) { + fprintf(stderr, "%s Test %s must be defined before " + "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n", + FormatFileLocation(file, line).c_str(), test_name, case_name); + fflush(stderr); + posix::Abort(); + } + defined_test_names_.insert(test_name); + return true; + } + + // Verifies that registered_tests match the test names in + // defined_test_names_; returns registered_tests if successful, or + // aborts the program otherwise. 
+ const char* VerifyRegisteredTestNames( + const char* file, int line, const char* registered_tests); + + private: + bool registered_; + ::std::set<const char*> defined_test_names_; +}; + +// Skips to the first non-space char after the first comma in 'str'; +// returns NULL if no comma is found in 'str'. +inline const char* SkipComma(const char* str) { + const char* comma = strchr(str, ','); + if (comma == NULL) { + return NULL; + } + while (IsSpace(*(++comma))) {} + return comma; +} + +// Returns the prefix of 'str' before the first comma in it; returns +// the entire string if it contains no comma. +inline std::string GetPrefixUntilComma(const char* str) { + const char* comma = strchr(str, ','); + return comma == NULL ? str : std::string(str, comma); +} + +// TypeParameterizedTest<Fixture, TestSel, Types>::Register() +// registers a list of type-parameterized tests with Google Test. The +// return value is insignificant - we just need to return something +// such that we can call this function in a namespace scope. +// +// Implementation note: The GTEST_TEMPLATE_ macro declares a template +// template parameter. It's defined in gtest-type-util.h. +template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types> +class TypeParameterizedTest { + public: + // 'index' is the index of the test in the type list 'Types' + // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase, + // Types). Valid values for 'index' are [0, N - 1] where N is the + // length of Types. + static bool Register(const char* prefix, const char* case_name, + const char* test_names, int index) { + typedef typename Types::Head Type; + typedef Fixture<Type> FixtureClass; + typedef typename GTEST_BIND_(TestSel, Type) TestClass; + + // First, registers the first type-parameterized test in the type + // list. + MakeAndRegisterTestInfo( + (std::string(prefix) + (prefix[0] == '\0' ? 
"" : "/") + case_name + "/" + + StreamableToString(index)).c_str(), + GetPrefixUntilComma(test_names).c_str(), + GetTypeName<Type>().c_str(), + NULL, // No value parameter. + GetTypeId<FixtureClass>(), + TestClass::SetUpTestCase, + TestClass::TearDownTestCase, + new TestFactoryImpl<TestClass>); + + // Next, recurses (at compile time) with the tail of the type list. + return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail> + ::Register(prefix, case_name, test_names, index + 1); + } +}; + +// The base case for the compile time recursion. +template <GTEST_TEMPLATE_ Fixture, class TestSel> +class TypeParameterizedTest<Fixture, TestSel, Types0> { + public: + static bool Register(const char* /*prefix*/, const char* /*case_name*/, + const char* /*test_names*/, int /*index*/) { + return true; + } +}; + +// TypeParameterizedTestCase<Fixture, Tests, Types>::Register() +// registers *all combinations* of 'Tests' and 'Types' with Google +// Test. The return value is insignificant - we just need to return +// something such that we can call this function in a namespace scope. +template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types> +class TypeParameterizedTestCase { + public: + static bool Register(const char* prefix, const char* case_name, + const char* test_names) { + typedef typename Tests::Head Head; + + // First, register the first test in 'Test' for each type in 'Types'. + TypeParameterizedTest<Fixture, Head, Types>::Register( + prefix, case_name, test_names, 0); + + // Next, recurses (at compile time) with the tail of the test list. + return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types> + ::Register(prefix, case_name, SkipComma(test_names)); + } +}; + +// The base case for the compile time recursion. 
+template <GTEST_TEMPLATE_ Fixture, typename Types> +class TypeParameterizedTestCase<Fixture, Templates0, Types> { + public: + static bool Register(const char* /*prefix*/, const char* /*case_name*/, + const char* /*test_names*/) { + return true; + } +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop( + UnitTest* unit_test, int skip_count); + +// Helpers for suppressing warnings on unreachable code or constant +// condition. + +// Always returns true. +GTEST_API_ bool AlwaysTrue(); + +// Always returns false. +inline bool AlwaysFalse() { return !AlwaysTrue(); } + +// Helper for suppressing false warning from Clang on a const char* +// variable declared in a conditional expression always being NULL in +// the else branch. +struct GTEST_API_ ConstCharPtr { + ConstCharPtr(const char* str) : value(str) {} + operator bool() const { return true; } + const char* value; +}; + +// A simple Linear Congruential Generator for generating random +// numbers with a uniform distribution. Unlike rand() and srand(), it +// doesn't use global state (and therefore can't interfere with user +// code). Unlike rand_r(), it's portable. An LCG isn't very random, +// but it's good enough for our purposes. 
+class GTEST_API_ Random { + public: + static const UInt32 kMaxRange = 1u << 31; + + explicit Random(UInt32 seed) : state_(seed) {} + + void Reseed(UInt32 seed) { state_ = seed; } + + // Generates a random number from [0, range). Crashes if 'range' is + // 0 or greater than kMaxRange. + UInt32 Generate(UInt32 range); + + private: + UInt32 state_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); +}; + +// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a +// compiler error iff T1 and T2 are different types. +template <typename T1, typename T2> +struct CompileAssertTypesEqual; + +template <typename T> +struct CompileAssertTypesEqual<T, T> { +}; + +// Removes the reference from a type if it is a reference type, +// otherwise leaves it unchanged. This is the same as +// tr1::remove_reference, which is not widely available yet. +template <typename T> +struct RemoveReference { typedef T type; }; // NOLINT +template <typename T> +struct RemoveReference<T&> { typedef T type; }; // NOLINT + +// A handy wrapper around RemoveReference that works when the argument +// T depends on template parameters. +#define GTEST_REMOVE_REFERENCE_(T) \ + typename ::testing::internal::RemoveReference<T>::type + +// Removes const from a type if it is a const type, otherwise leaves +// it unchanged. This is the same as tr1::remove_const, which is not +// widely available yet. +template <typename T> +struct RemoveConst { typedef T type; }; // NOLINT +template <typename T> +struct RemoveConst<const T> { typedef T type; }; // NOLINT + +// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above +// definition to fail to remove the const in 'const int[3]' and 'const +// char[3][4]'. The following specialization works around the bug. 
+template <typename T, size_t N> +struct RemoveConst<const T[N]> { + typedef typename RemoveConst<T>::type type[N]; +}; + +#if defined(_MSC_VER) && _MSC_VER < 1400 +// This is the only specialization that allows VC++ 7.1 to remove const in +// 'const int[3] and 'const int[3][4]'. However, it causes trouble with GCC +// and thus needs to be conditionally compiled. +template <typename T, size_t N> +struct RemoveConst<T[N]> { + typedef typename RemoveConst<T>::type type[N]; +}; +#endif + +// A handy wrapper around RemoveConst that works when the argument +// T depends on template parameters. +#define GTEST_REMOVE_CONST_(T) \ + typename ::testing::internal::RemoveConst<T>::type + +// Turns const U&, U&, const U, and U all into U. +#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ + GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T)) + +// Adds reference to a type if it is not a reference type, +// otherwise leaves it unchanged. This is the same as +// tr1::add_reference, which is not widely available yet. +template <typename T> +struct AddReference { typedef T& type; }; // NOLINT +template <typename T> +struct AddReference<T&> { typedef T& type; }; // NOLINT + +// A handy wrapper around AddReference that works when the argument T +// depends on template parameters. +#define GTEST_ADD_REFERENCE_(T) \ + typename ::testing::internal::AddReference<T>::type + +// Adds a reference to const on top of T as necessary. For example, +// it transforms +// +// char ==> const char& +// const char ==> const char& +// char& ==> const char& +// const char& ==> const char& +// +// The argument T must depend on some template parameters. +#define GTEST_REFERENCE_TO_CONST_(T) \ + GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T)) + +// ImplicitlyConvertible<From, To>::value is a compile-time bool +// constant that's true iff type From can be implicitly converted to +// type To. 
+template <typename From, typename To> +class ImplicitlyConvertible { + private: + // We need the following helper functions only for their types. + // They have no implementations. + + // MakeFrom() is an expression whose type is From. We cannot simply + // use From(), as the type From may not have a public default + // constructor. + static From MakeFrom(); + + // These two functions are overloaded. Given an expression + // Helper(x), the compiler will pick the first version if x can be + // implicitly converted to type To; otherwise it will pick the + // second version. + // + // The first version returns a value of size 1, and the second + // version returns a value of size 2. Therefore, by checking the + // size of Helper(x), which can be done at compile time, we can tell + // which version of Helper() is used, and hence whether x can be + // implicitly converted to type To. + static char Helper(To); + static char (&Helper(...))[2]; // NOLINT + + // We have to put the 'public' section after the 'private' section, + // or MSVC refuses to compile the code. + public: + // MSVC warns about implicitly converting from double to int for + // possible loss of data, so we need to temporarily disable the + // warning. +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4244) // Temporarily disables warning 4244. + + static const bool value = + sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; +# pragma warning(pop) // Restores the warning state. +#elif defined(__BORLANDC__) + // C++Builder cannot use member overload resolution during template + // instantiation. The simplest workaround is to use its C++0x type traits + // functions (C++Builder 2009 and above only). 
+ static const bool value = __is_convertible(From, To); +#else + static const bool value = + sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; +#endif // _MSC_VER +}; +template <typename From, typename To> +const bool ImplicitlyConvertible<From, To>::value; + +// IsAProtocolMessage<T>::value is a compile-time bool constant that's +// true iff T is type ProtocolMessage, proto2::Message, or a subclass +// of those. +template <typename T> +struct IsAProtocolMessage + : public bool_constant< + ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value || + ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> { +}; + +// When the compiler sees expression IsContainerTest<C>(0), if C is an +// STL-style container class, the first overload of IsContainerTest +// will be viable (since both C::iterator* and C::const_iterator* are +// valid types and NULL can be implicitly converted to them). It will +// be picked over the second overload as 'int' is a perfect match for +// the type of argument 0. If C::iterator or C::const_iterator is not +// a valid type, the first overload is not viable, and the second +// overload will be picked. Therefore, we can determine whether C is +// a container class by checking the type of IsContainerTest<C>(0). +// The value of the expression is insignificant. +// +// Note that we look for both C::iterator and C::const_iterator. The +// reason is that C++ injects the name of a class as a member of the +// class itself (e.g. you can refer to class iterator as either +// 'iterator' or 'iterator::iterator'). If we look for C::iterator +// only, for example, we would mistakenly think that a class named +// iterator is an STL container. +// +// Also note that the simpler approach of overloading +// IsContainerTest(typename C::const_iterator*) and +// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. 
+typedef int IsContainer; +template <class C> +IsContainer IsContainerTest(int /* dummy */, + typename C::iterator* /* it */ = NULL, + typename C::const_iterator* /* const_it */ = NULL) { + return 0; +} + +typedef char IsNotContainer; +template <class C> +IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } + +// EnableIf<condition>::type is void when 'Cond' is true, and +// undefined when 'Cond' is false. To use SFINAE to make a function +// overload only apply when a particular expression is true, add +// "typename EnableIf<expression>::type* = 0" as the last parameter. +template<bool> struct EnableIf; +template<> struct EnableIf<true> { typedef void type; }; // NOLINT + +// Utilities for native arrays. + +// ArrayEq() compares two k-dimensional native arrays using the +// elements' operator==, where k can be any integer >= 0. When k is +// 0, ArrayEq() degenerates into comparing a single pair of values. + +template <typename T, typename U> +bool ArrayEq(const T* lhs, size_t size, const U* rhs); + +// This generic version is used when k is 0. +template <typename T, typename U> +inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } + +// This overload is used when k >= 1. +template <typename T, typename U, size_t N> +inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { + return internal::ArrayEq(lhs, N, rhs); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous ArrayEq() function, arrays with different sizes would +// lead to different copies of the template code. +template <typename T, typename U> +bool ArrayEq(const T* lhs, size_t size, const U* rhs) { + for (size_t i = 0; i != size; i++) { + if (!internal::ArrayEq(lhs[i], rhs[i])) + return false; + } + return true; +} + +// Finds the first element in the iterator range [begin, end) that +// equals elem. Element may be a native array type itself. 
+template <typename Iter, typename Element> +Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { + for (Iter it = begin; it != end; ++it) { + if (internal::ArrayEq(*it, elem)) + return it; + } + return end; +} + +// CopyArray() copies a k-dimensional native array using the elements' +// operator=, where k can be any integer >= 0. When k is 0, +// CopyArray() degenerates into copying a single value. + +template <typename T, typename U> +void CopyArray(const T* from, size_t size, U* to); + +// This generic version is used when k is 0. +template <typename T, typename U> +inline void CopyArray(const T& from, U* to) { *to = from; } + +// This overload is used when k >= 1. +template <typename T, typename U, size_t N> +inline void CopyArray(const T(&from)[N], U(*to)[N]) { + internal::CopyArray(from, N, *to); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous CopyArray() function, arrays with different sizes +// would lead to different copies of the template code. +template <typename T, typename U> +void CopyArray(const T* from, size_t size, U* to) { + for (size_t i = 0; i != size; i++) { + internal::CopyArray(from[i], to + i); + } +} + +// The relation between an NativeArray object (see below) and the +// native array it represents. +enum RelationToSource { + kReference, // The NativeArray references the native array. + kCopy // The NativeArray makes a copy of the native array and + // owns the copy. +}; + +// Adapts a native array to a read-only STL-style container. Instead +// of the complete STL container concept, this adaptor only implements +// members useful for Google Mock's container matchers. New members +// should be added as needed. To simplify the implementation, we only +// support Element being a raw type (i.e. having no top-level const or +// reference modifier). It's the client's responsibility to satisfy +// this requirement. 
Element can be an array type itself (hence +// multi-dimensional arrays are supported). +template <typename Element> +class NativeArray { + public: + // STL-style container typedefs. + typedef Element value_type; + typedef Element* iterator; + typedef const Element* const_iterator; + + // Constructs from a native array. + NativeArray(const Element* array, size_t count, RelationToSource relation) { + Init(array, count, relation); + } + + // Copy constructor. + NativeArray(const NativeArray& rhs) { + Init(rhs.array_, rhs.size_, rhs.relation_to_source_); + } + + ~NativeArray() { + // Ensures that the user doesn't instantiate NativeArray with a + // const or reference type. + static_cast<void>(StaticAssertTypeEqHelper<Element, + GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>()); + if (relation_to_source_ == kCopy) + delete[] array_; + } + + // STL-style container methods. + size_t size() const { return size_; } + const_iterator begin() const { return array_; } + const_iterator end() const { return array_ + size_; } + bool operator==(const NativeArray& rhs) const { + return size() == rhs.size() && + ArrayEq(begin(), size(), rhs.begin()); + } + + private: + // Initializes this object; makes a copy of the input array if + // 'relation' is kCopy. 
+ void Init(const Element* array, size_t a_size, RelationToSource relation) { + if (relation == kReference) { + array_ = array; + } else { + Element* const copy = new Element[a_size]; + CopyArray(array, a_size, copy); + array_ = copy; + } + size_ = a_size; + relation_to_source_ = relation; + } + + const Element* array_; + size_t size_; + RelationToSource relation_to_source_; + + GTEST_DISALLOW_ASSIGN_(NativeArray); +}; + +} // namespace internal +} // namespace testing + +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) \ + = ::testing::Message() + +#define GTEST_MESSAGE_(message, result_type) \ + GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) + +#define GTEST_FATAL_FAILURE_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) + +#define GTEST_NONFATAL_FAILURE_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) + +#define GTEST_SUCCESS_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) + +// Suppresses MSVC warnings 4072 (unreachable code) for the code following +// statement if it returns or throws (or doesn't return or throw in some +// situations). +#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ + if (::testing::internal::AlwaysTrue()) { statement; } + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::ConstCharPtr gtest_msg = "") { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (expected_exception const&) { \ + gtest_caught_expected = true; \ + } \ + catch (...) 
{ \ + gtest_msg.value = \ + "Expected: " #statement " throws an exception of type " \ + #expected_exception ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = \ + "Expected: " #statement " throws an exception of type " \ + #expected_exception ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ + fail(gtest_msg.value) + +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (...) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ + fail("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: it throws.") + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ + fail("Expected: " #statement " throws an exception.\n" \ + " Actual: it doesn't.") + + +// Implements Boolean test assertions such as EXPECT_TRUE. expression can be +// either a boolean expression or an AssertionResult. text is a textual +// represenation of expression as it was passed into the EXPECT_TRUE. 
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage(\ + gtest_ar_, text, #actual, #expected).c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ + fail("Expected: " #statement " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") + +// Expands to the name of the class that implements the given test. +#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + test_case_name##_##test_name##_Test + +// Helper macro for defining tests. 
+#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ +class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ + public:\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ + private:\ + virtual void TestBody();\ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ +};\ +\ +::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ + ::test_info_ =\ + ::testing::internal::MakeAndRegisterTestInfo(\ + #test_case_name, #test_name, NULL, NULL, \ + (parent_id), \ + parent_class::SetUpTestCase, \ + parent_class::TearDownTestCase, \ + new ::testing::internal::TestFactoryImpl<\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ +void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the public API for death tests. It is +// #included by gtest.h so a user doesn't need to include this +// directly. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ + +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines internal utilities needed for implementing +// death tests. They are subject to change without notice. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ + + +#include <stdio.h> + +namespace testing { +namespace internal { + +GTEST_DECLARE_string_(internal_run_death_test); + +// Names of the flags (needed for parsing Google Test flags). +const char kDeathTestStyleFlag[] = "death_test_style"; +const char kDeathTestUseFork[] = "death_test_use_fork"; +const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; + +#if GTEST_HAS_DEATH_TEST + +// DeathTest is a class that hides much of the complexity of the +// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method +// returns a concrete class that depends on the prevailing death test +// style, as defined by the --gtest_death_test_style and/or +// --gtest_internal_run_death_test flags. 
+ +// In describing the results of death tests, these terms are used with +// the corresponding definitions: +// +// exit status: The integer exit information in the format specified +// by wait(2) +// exit code: The integer code passed to exit(3), _exit(2), or +// returned from main() +class GTEST_API_ DeathTest { + public: + // Create returns false if there was an error determining the + // appropriate action to take for the current death test; for example, + // if the gtest_death_test_style flag is set to an invalid value. + // The LastMessage method will return a more detailed message in that + // case. Otherwise, the DeathTest pointer pointed to by the "test" + // argument is set. If the death test should be skipped, the pointer + // is set to NULL; otherwise, it is set to the address of a new concrete + // DeathTest object that controls the execution of the current test. + static bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test); + DeathTest(); + virtual ~DeathTest() { } + + // A helper class that aborts a death test when it's deleted. + class ReturnSentinel { + public: + explicit ReturnSentinel(DeathTest* test) : test_(test) { } + ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } + private: + DeathTest* const test_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); + } GTEST_ATTRIBUTE_UNUSED_; + + // An enumeration of possible roles that may be taken when a death + // test is encountered. EXECUTE means that the death test logic should + // be executed immediately. OVERSEE means that the program should prepare + // the appropriate environment for a child process to execute the death + // test, then wait for it to complete. + enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; + + // An enumeration of the three reasons that a test might be aborted. + enum AbortReason { + TEST_ENCOUNTERED_RETURN_STATEMENT, + TEST_THREW_EXCEPTION, + TEST_DID_NOT_DIE + }; + + // Assumes one of the above roles. 
+ virtual TestRole AssumeRole() = 0; + + // Waits for the death test to finish and returns its status. + virtual int Wait() = 0; + + // Returns true if the death test passed; that is, the test process + // exited during the test, its exit status matches a user-supplied + // predicate, and its stderr output matches a user-supplied regular + // expression. + // The user-supplied predicate may be a macro expression rather + // than a function pointer or functor, or else Wait and Passed could + // be combined. + virtual bool Passed(bool exit_status_ok) = 0; + + // Signals that the death test did not die as expected. + virtual void Abort(AbortReason reason) = 0; + + // Returns a human-readable outcome message regarding the outcome of + // the last death test. + static const char* LastMessage(); + + static void set_last_death_test_message(const std::string& message); + + private: + // A string containing a description of the outcome of the last death test. + static std::string last_death_test_message_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); +}; + +// Factory interface for death tests. May be mocked out for testing. +class DeathTestFactory { + public: + virtual ~DeathTestFactory() { } + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test) = 0; +}; + +// A concrete DeathTestFactory implementation for normal use. +class DefaultDeathTestFactory : public DeathTestFactory { + public: + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test); +}; + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +GTEST_API_ bool ExitedUnsuccessfully(int exit_status); + +// Traps C++ exceptions escaping statement and reports them as test +// failures. Note that trapping SEH exceptions is not implemented here. 
+# if GTEST_HAS_EXCEPTIONS +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf(\ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ + ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ + gtest_exception.what()); \ + fflush(stderr); \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } catch (...) { \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } + +# else +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) + +# endif + +// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, +// ASSERT_EXIT*, and EXPECT_EXIT*. +# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + const ::testing::internal::RE& gtest_regex = (regex); \ + ::testing::internal::DeathTest* gtest_dt; \ + if (!::testing::internal::DeathTest::Create(#statement, >est_regex, \ + __FILE__, __LINE__, >est_dt)) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + if (gtest_dt != NULL) { \ + ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \ + gtest_dt_ptr(gtest_dt); \ + switch (gtest_dt->AssumeRole()) { \ + case ::testing::internal::DeathTest::OVERSEE_TEST: \ + if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + break; \ + case ::testing::internal::DeathTest::EXECUTE_TEST: { \ + ::testing::internal::DeathTest::ReturnSentinel \ + gtest_sentinel(gtest_dt); \ + GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ + gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ + break; \ + } \ + default: \ + break; \ 
+ } \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \ + fail(::testing::internal::DeathTest::LastMessage()) +// The symbol "fail" here expands to something into which a message +// can be streamed. + +// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in +// NDEBUG mode. In this case we need the statements to be executed, the regex is +// ignored, and the macro must accept a streamed message even though the message +// is never printed. +# define GTEST_EXECUTE_STATEMENT_(statement, regex) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } else \ + ::testing::Message() + +// A class representing the parsed contents of the +// --gtest_internal_run_death_test flag, as it existed when +// RUN_ALL_TESTS was called. +class InternalRunDeathTestFlag { + public: + InternalRunDeathTestFlag(const std::string& a_file, + int a_line, + int an_index, + int a_write_fd) + : file_(a_file), line_(a_line), index_(an_index), + write_fd_(a_write_fd) {} + + ~InternalRunDeathTestFlag() { + if (write_fd_ >= 0) + posix::Close(write_fd_); + } + + const std::string& file() const { return file_; } + int line() const { return line_; } + int index() const { return index_; } + int write_fd() const { return write_fd_; } + + private: + std::string file_; + int line_; + int index_; + int write_fd_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); +}; + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); + +#else // GTEST_HAS_DEATH_TEST + +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. 
Those macros must compile on such systems +// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on +// systems that support death tests. This allows one to write such a macro +// on a system that does not support death tests and be sure that it will +// compile on a death-test supporting system. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter iff EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. 
+# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) \ + << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ + +namespace testing { + +// This flag controls the style of death tests. Valid values are "threadsafe", +// meaning that the death test child process will re-execute the test binary +// from the start, running only a single death test, or "fast", +// meaning that the child process will execute the test logic immediately +// after forking. +GTEST_DECLARE_string_(death_test_style); + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +GTEST_API_ bool InDeathTestChild(); + +} // namespace internal + +// The following macros are useful for writing death tests. + +// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is +// executed: +// +// 1. It generates a warning if there is more than one active +// thread. This is because it's safe to fork() or clone() only +// when there is a single thread. +// +// 2. 
The parent process clone()s a sub-process and runs the death +// test in it; the sub-process exits with code 0 at the end of the +// death test, if it hasn't exited already. +// +// 3. The parent process waits for the sub-process to terminate. +// +// 4. The parent process checks the exit code and error message of +// the sub-process. +// +// Examples: +// +// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number"); +// for (int i = 0; i < 5; i++) { +// EXPECT_DEATH(server.ProcessRequest(i), +// "Invalid request .* in ProcessRequest()") +// << "Failed to die on request " << i; +// } +// +// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting"); +// +// bool KilledBySIGHUP(int exit_code) { +// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP; +// } +// +// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); +// +// On the regular expressions used in death tests: +// +// On POSIX-compliant systems (*nix), we use the <regex.h> library, +// which uses the POSIX extended regex syntax. +// +// On other platforms (e.g. Windows), we only support a simple regex +// syntax implemented as part of Google Test. This limited +// implementation should be enough most of the time when writing +// death tests; though it lacks many features you can find in PCRE +// or POSIX extended regex syntax. For example, we don't support +// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and +// repetition count ("x{5,7}"), among others. +// +// Below is the syntax that we do support. We chose it to be a +// subset of both PCRE and POSIX extended regex, so it's easy to +// learn wherever you come from. In the following: 'A' denotes a +// literal character, period (.), or a single \\ escape sequence; +// 'x' and 'y' denote regular expressions; 'm' and 'n' are for +// natural numbers. 
+// +// c matches any literal character c +// \\d matches any decimal digit +// \\D matches any character that's not a decimal digit +// \\f matches \f +// \\n matches \n +// \\r matches \r +// \\s matches any ASCII whitespace, including \n +// \\S matches any character that's not a whitespace +// \\t matches \t +// \\v matches \v +// \\w matches any letter, _, or decimal digit +// \\W matches any character that \\w doesn't match +// \\c matches any literal character c, which must be a punctuation +// . matches any single character except \n +// A? matches 0 or 1 occurrences of A +// A* matches 0 or many occurrences of A +// A+ matches 1 or many occurrences of A +// ^ matches the beginning of a string (not that of each line) +// $ matches the end of a string (not that of each line) +// xy matches x followed by y +// +// If you accidentally use PCRE or POSIX extended regex features +// not implemented by us, you will get a run-time failure. In that +// case, please try to rewrite your regular expression within the +// above syntax. +// +// This implementation is *not* meant to be as highly tuned or robust +// as a compiled regex library, but should perform well enough for a +// death test, which already incurs significant overhead by launching +// a child process. +// +// Known caveats: +// +// A "threadsafe" style death test obtains the path to the test +// program from argv[0] and re-executes it in the sub-process. For +// simplicity, the current implementation doesn't search the PATH +// when launching the sub-process. This means that the user must +// invoke the test program via a path that contains at least one +// path separator (e.g. path/to/foo_test and +// /absolute/path/to/bar_test are fine, but foo_test is not). This +// is rarely a problem as people usually don't put the test binary +// directory in PATH. +// +// TODO(wan@google.com): make thread-safe death tests search the PATH. 
+ +// Asserts that a given statement causes the program to exit, with an +// integer exit status that satisfies predicate, and emitting error output +// that matches regex. +# define ASSERT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) + +// Like ASSERT_EXIT, but continues on to successive tests in the +// test case, if any: +# define EXPECT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) + +// Asserts that a given statement causes the program to exit, either by +// explicitly exiting with a nonzero exit code or being killed by a +// signal, and emitting error output that matches regex. +# define ASSERT_DEATH(statement, regex) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Like ASSERT_DEATH, but continues on to successive tests in the +// test case, if any: +# define EXPECT_DEATH(statement, regex) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: + +// Tests that an exit code describes a normal exit with a given exit code. +class GTEST_API_ ExitedWithCode { + public: + explicit ExitedWithCode(int exit_code); + bool operator()(int exit_status) const; + private: + // No implementation - assignment is unsupported. + void operator=(const ExitedWithCode& other); + + const int exit_code_; +}; + +# if !GTEST_OS_WINDOWS +// Tests that an exit code describes an exit due to termination by a +// given signal. +class GTEST_API_ KilledBySignal { + public: + explicit KilledBySignal(int signum); + bool operator()(int exit_status) const; + private: + const int signum_; +}; +# endif // !GTEST_OS_WINDOWS + +// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. 
+// The death testing framework causes this to have interesting semantics, +// since the sideeffects of the call are only visible in opt mode, and not +// in debug mode. +// +// In practice, this can be used to test functions that utilize the +// LOG(DFATAL) macro using the following style: +// +// int DieInDebugOr12(int* sideeffect) { +// if (sideeffect) { +// *sideeffect = 12; +// } +// LOG(DFATAL) << "death"; +// return 12; +// } +// +// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { +// int sideeffect = 0; +// // Only asserts in dbg. +// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); +// +// #ifdef NDEBUG +// // opt-mode has sideeffect visible. +// EXPECT_EQ(12, sideeffect); +// #else +// // dbg-mode no visible sideeffect. +// EXPECT_EQ(0, sideeffect); +// #endif +// } +// +// This will assert that DieInDebugReturn12InOpt() crashes in debug +// mode, usually due to a DCHECK or LOG(DFATAL), but returns the +// appropriate fallback value (12 in this case) in opt mode. If you +// need to test that a function has appropriate side-effects in opt +// mode, include assertions against the side-effects. A general +// pattern for this is: +// +// EXPECT_DEBUG_DEATH({ +// // Side-effects here will have an effect after this statement in +// // opt mode, but none in debug mode. 
+// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); +// }, "death"); +// +# ifdef NDEBUG + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +# else + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + EXPECT_DEATH(statement, regex) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + ASSERT_DEATH(statement, regex) + +# endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // GTEST_HAS_DEATH_TEST + +// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and +// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if +// death tests are supported; otherwise they just issue a warning. This is +// useful when you are combining death test assertions with normal test +// assertions in one test. +#if GTEST_HAS_DEATH_TEST +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) +#else +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) +#endif + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +// This file was GENERATED by command: +// pump.py gtest-param-test.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: vladl@google.com (Vlad Losev) +// +// Macros and functions for implementing parameterized tests +// in Google C++ Testing Framework (Google Test) +// +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ + + +// Value-parameterized tests allow you to test your code with different +// parameters without writing multiple copies of the same test. +// +// Here is how you use value-parameterized tests: + +#if 0 + +// To write value-parameterized tests, first you should define a fixture +// class. 
It is usually derived from testing::TestWithParam<T> (see below for +// another inheritance scheme that's sometimes useful in more complicated +// class hierarchies), where the type of your parameter values. +// TestWithParam<T> is itself derived from testing::Test. T can be any +// copyable type. If it's a raw pointer, you are responsible for managing the +// lifespan of the pointed values. + +class FooTest : public ::testing::TestWithParam<const char*> { + // You can implement all the usual class fixture members here. +}; + +// Then, use the TEST_P macro to define as many parameterized tests +// for this fixture as you want. The _P suffix is for "parameterized" +// or "pattern", whichever you prefer to think. + +TEST_P(FooTest, DoesBlah) { + // Inside a test, access the test parameter with the GetParam() method + // of the TestWithParam<T> class: + EXPECT_TRUE(foo.Blah(GetParam())); + ... +} + +TEST_P(FooTest, HasBlahBlah) { + ... +} + +// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test +// case with any set of parameters you want. Google Test defines a number +// of functions for generating test parameters. They return what we call +// (surprise!) parameter generators. Here is a summary of them, which +// are all in the testing namespace: +// +// +// Range(begin, end [, step]) - Yields values {begin, begin+step, +// begin+step+step, ...}. The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. 
+// +// The following statement will instantiate tests from the FooTest test case +// each with parameter values "meeny", "miny", and "moe". + +INSTANTIATE_TEST_CASE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more then once) the first argument to the +// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the +// actual test case name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. +// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests +// in the given test case, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_CASE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. 
+// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. +// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface<T>, where T is the type of the parameter +// values. Inheriting from TestWithParam<T> satisfies that requirement because +// TestWithParam<T> inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + + +#if !GTEST_OS_SYMBIAN +# include <utility> +#endif + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. +// Copyright 2008 Google Inc. +// All Rights Reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ + +#include <iterator> +#include <utility> +#include <vector> + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. 
+// Copyright 2003 Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: Dan Egnor (egnor@google.com) +// +// A "smart" pointer type with reference tracking. Every pointer to a +// particular object is kept on a circular linked list. When the last pointer +// to an object is destroyed or reassigned, the object is deleted. +// +// Used properly, this deletes the object when the last reference goes away. 
+// There are several caveats: +// - Like all reference counting schemes, cycles lead to leaks. +// - Each smart pointer is actually two pointers (8 bytes instead of 4). +// - Every time a pointer is assigned, the entire list of pointers to that +// object is traversed. This class is therefore NOT SUITABLE when there +// will often be more than two or three pointers to a particular object. +// - References are only tracked as long as linked_ptr<> objects are copied. +// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS +// will happen (double deletion). +// +// A good use of this class is storing object references in STL containers. +// You can safely put linked_ptr<> in a vector<>. +// Other uses may not be as good. +// +// Note: If you use an incomplete type with linked_ptr<>, the class +// *containing* linked_ptr<> must have a constructor and destructor (even +// if they do nothing!). +// +// Bill Gibbons suggested we use something like this. +// +// Thread Safety: +// Unlike other linked_ptr implementations, in this implementation +// a linked_ptr object is thread-safe in the sense that: +// - it's safe to copy linked_ptr objects concurrently, +// - it's safe to copy *from* a linked_ptr and read its underlying +// raw pointer (e.g. via get()) concurrently, and +// - it's safe to write to two linked_ptrs that point to the same +// shared object concurrently. +// TODO(wan@google.com): rename this to safe_linked_ptr to avoid +// confusion with normal linked_ptr. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ + +#include <stdlib.h> +#include <assert.h> + + +namespace testing { +namespace internal { + +// Protects copying of all linked_ptr objects. +GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex); + +// This is used internally by all instances of linked_ptr<>. 
It needs to be +// a non-template class because different types of linked_ptr<> can refer to +// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)). +// So, it needs to be possible for different types of linked_ptr to participate +// in the same circular linked list, so we need a single class type here. +// +// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr<T>. +class linked_ptr_internal { + public: + // Create a new circle that includes only this instance. + void join_new() { + next_ = this; + } + + // Many linked_ptr operations may change p.link_ for some linked_ptr + // variable p in the same circle as this object. Therefore we need + // to prevent two such operations from occurring concurrently. + // + // Note that different types of linked_ptr objects can coexist in a + // circle (e.g. linked_ptr<Base>, linked_ptr<Derived1>, and + // linked_ptr<Derived2>). Therefore we must use a single mutex to + // protect all linked_ptr objects. This can create serious + // contention in production code, but is acceptable in a testing + // framework. + + // Join an existing circle. + void join(linked_ptr_internal const* ptr) + GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { + MutexLock lock(&g_linked_ptr_mutex); + + linked_ptr_internal const* p = ptr; + while (p->next_ != ptr) p = p->next_; + p->next_ = this; + next_ = ptr; + } + + // Leave whatever circle we're part of. Returns true if we were the + // last member of the circle. Once this is done, you can join() another. + bool depart() + GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { + MutexLock lock(&g_linked_ptr_mutex); + + if (next_ == this) return true; + linked_ptr_internal const* p = next_; + while (p->next_ != this) p = p->next_; + p->next_ = next_; + return false; + } + + private: + mutable linked_ptr_internal const* next_; +}; + +template <typename T> +class linked_ptr { + public: + typedef T element_type; + + // Take over ownership of a raw pointer. 
This should happen as soon as + // possible after the object is created. + explicit linked_ptr(T* ptr = NULL) { capture(ptr); } + ~linked_ptr() { depart(); } + + // Copy an existing linked_ptr<>, adding ourselves to the list of references. + template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); } + linked_ptr(linked_ptr const& ptr) { // NOLINT + assert(&ptr != this); + copy(&ptr); + } + + // Assignment releases the old value and acquires the new. + template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) { + depart(); + copy(&ptr); + return *this; + } + + linked_ptr& operator=(linked_ptr const& ptr) { + if (&ptr != this) { + depart(); + copy(&ptr); + } + return *this; + } + + // Smart pointer members. + void reset(T* ptr = NULL) { + depart(); + capture(ptr); + } + T* get() const { return value_; } + T* operator->() const { return value_; } + T& operator*() const { return *value_; } + + bool operator==(T* p) const { return value_ == p; } + bool operator!=(T* p) const { return value_ != p; } + template <typename U> + bool operator==(linked_ptr<U> const& ptr) const { + return value_ == ptr.get(); + } + template <typename U> + bool operator!=(linked_ptr<U> const& ptr) const { + return value_ != ptr.get(); + } + + private: + template <typename U> + friend class linked_ptr; + + T* value_; + linked_ptr_internal link_; + + void depart() { + if (link_.depart()) delete value_; + } + + void capture(T* ptr) { + value_ = ptr; + link_.join_new(); + } + + template <typename U> void copy(linked_ptr<U> const* ptr) { + value_ = ptr->get(); + if (value_) + link_.join(&ptr->link_); + else + link_.join_new(); + } +}; + +template<typename T> inline +bool operator==(T* ptr, const linked_ptr<T>& x) { + return ptr == x.get(); +} + +template<typename T> inline +bool operator!=(T* ptr, const linked_ptr<T>& x) { + return ptr != x.get(); +} + +// A function to convert T* into linked_ptr<T> +// Doing e.g. 
make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation +// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg)) +template <typename T> +linked_ptr<T> make_linked_ptr(T* ptr) { + return linked_ptr<T>(ptr); +} + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Author: wan@google.com (Zhanyong Wan) + +// Google Test - The Google C++ Testing Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr); +// +// A user can teach this function how to print a class type T by +// defining either operator<<() or PrintTo() in the namespace that +// defines T. More specifically, the FIRST defined function in the +// following list will be used (assuming T is defined in namespace +// foo): +// +// 1. foo::PrintTo(const T&, ostream*) +// 2. operator<<(ostream&, const T&) defined in either foo or the +// global namespace. +// +// If none of the above is defined, it will print the debug string of +// the value if it is a protocol buffer, or print the raw bytes in the +// value otherwise. +// +// To aid debugging: when T is a reference type, the address of the +// value is also printed; when T is a (const) char pointer, both the +// pointer value and the NUL-terminated string it points to are +// printed. +// +// We also provide some convenient wrappers: +// +// // Prints a value to a string. For a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// std::string ::testing::PrintToString(const T& value); +// +// // Prints a value tersely: for a reference type, the referenced +// // value (but not the address) is printed; for a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); +// +// // Prints value using the type inferred by the compiler. The difference +// // from UniversalTersePrint() is that this function prints both the +// // pointer and the NUL-terminated string for a (const or not) char pointer. 
+// void ::testing::internal::UniversalPrint(const T& value, ostream*); +// +// // Prints the fields of a tuple tersely to a string vector, one +// // element for each field. Tuple support must be enabled in +// // gtest-port.h. +// std::vector<string> UniversalTersePrintTupleFieldsToStrings( +// const Tuple& value); +// +// Known limitation: +// +// The print primitives print the elements of an STL-style container +// using the compiler-inferred type of *iter where iter is a +// const_iterator of the container. When const_iterator is an input +// iterator but not a forward iterator, this inferred type may not +// match value_type, and the print output may be incorrect. In +// practice, this is rarely a problem as for most containers +// const_iterator is a forward iterator. We'll fix this if there's an +// actual need for it. Note that this fix cannot rely on value_type +// being defined as many user-defined container types don't have +// value_type. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#include <ostream> // NOLINT +#include <sstream> +#include <string> +#include <utility> +#include <vector> + +namespace testing { + +// Definitions in the 'internal' and 'internal2' name spaces are +// subject to change without notice. DO NOT USE THEM IN USER CODE! +namespace internal2 { + +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, + size_t count, + ::std::ostream* os); + +// For selecting which printer to use when a given type has neither << +// nor PrintTo(). +enum TypeKind { + kProtobuf, // a protobuf type + kConvertibleToInteger, // a type implicitly convertible to BiggestInt + // (e.g. 
a named or unnamed enum type) + kOtherType // anything else +}; + +// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called +// by the universal printer to print a value of type T when neither +// operator<< nor PrintTo() is defined for T, where kTypeKind is the +// "kind" of T as defined by enum TypeKind. +template <typename T, TypeKind kTypeKind> +class TypeWithoutFormatter { + public: + // This default version is called when kTypeKind is kOtherType. + static void PrintValue(const T& value, ::std::ostream* os) { + PrintBytesInObjectTo(reinterpret_cast<const unsigned char*>(&value), + sizeof(value), os); + } +}; + +// We print a protobuf using its ShortDebugString() when the string +// doesn't exceed this many characters; otherwise we print it using +// DebugString() for better readability. +const size_t kProtobufOneLinerMaxLength = 50; + +template <typename T> +class TypeWithoutFormatter<T, kProtobuf> { + public: + static void PrintValue(const T& value, ::std::ostream* os) { + const ::testing::internal::string short_str = value.ShortDebugString(); + const ::testing::internal::string pretty_str = + short_str.length() <= kProtobufOneLinerMaxLength ? + short_str : ("\n" + value.DebugString()); + *os << ("<" + pretty_str + ">"); + } +}; + +template <typename T> +class TypeWithoutFormatter<T, kConvertibleToInteger> { + public: + // Since T has no << operator or PrintTo() but can be implicitly + // converted to BiggestInt, we print it as a BiggestInt. + // + // Most likely T is an enum type (either named or unnamed), in which + // case printing it as an integer is the desired behavior. In case + // T is not an enum, printing it as an integer is the best we can do + // given that it has no user-defined printer. + static void PrintValue(const T& value, ::std::ostream* os) { + const internal::BiggestInt kBigInt = value; + *os << kBigInt; + } +}; + +// Prints the given value to the given ostream. 
If the value is a +// protocol message, its debug string is printed; if it's an enum or +// of a type implicitly convertible to BiggestInt, it's printed as an +// integer; otherwise the bytes in the value are printed. This is +// what UniversalPrinter<T>::Print() does when it knows nothing about +// type T and T has neither << operator nor PrintTo(). +// +// A user can override this behavior for a class type Foo by defining +// a << operator in the namespace where Foo is defined. +// +// We put this operator in namespace 'internal2' instead of 'internal' +// to simplify the implementation, as much code in 'internal' needs to +// use << in STL, which would conflict with our own << were it defined +// in 'internal'. +// +// Note that this operator<< takes a generic std::basic_ostream<Char, +// CharTraits> type instead of the more restricted std::ostream. If +// we define it to take an std::ostream instead, we'll get an +// "ambiguous overloads" compiler error when trying to print a type +// Foo that supports streaming to std::basic_ostream<Char, +// CharTraits>, as the compiler cannot tell whether +// operator<<(std::ostream&, const T&) or +// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more +// specific. +template <typename Char, typename CharTraits, typename T> +::std::basic_ostream<Char, CharTraits>& operator<<( + ::std::basic_ostream<Char, CharTraits>& os, const T& x) { + TypeWithoutFormatter<T, + (internal::IsAProtocolMessage<T>::value ? kProtobuf : + internal::ImplicitlyConvertible<const T&, internal::BiggestInt>::value ? + kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); + return os; +} + +} // namespace internal2 +} // namespace testing + +// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up +// magic needed for implementing UniversalPrinter won't work. +namespace testing_internal { + +// Used to print a value that is not an STL-style container when the +// user doesn't define PrintTo() for it. 
+template <typename T> +void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { + // With the following statement, during unqualified name lookup, + // testing::internal2::operator<< appears as if it was declared in + // the nearest enclosing namespace that contains both + // ::testing_internal and ::testing::internal2, i.e. the global + // namespace. For more details, refer to the C++ Standard section + // 7.3.4-1 [namespace.udir]. This allows us to fall back onto + // testing::internal2::operator<< in case T doesn't come with a << + // operator. + // + // We cannot write 'using ::testing::internal2::operator<<;', which + // gcc 3.3 fails to compile due to a compiler bug. + using namespace ::testing::internal2; // NOLINT + + // Assuming T is defined in namespace foo, in the next statement, + // the compiler will consider all of: + // + // 1. foo::operator<< (thanks to Koenig look-up), + // 2. ::operator<< (as the current namespace is enclosed in ::), + // 3. testing::internal2::operator<< (thanks to the using statement above). + // + // The operator<< whose type matches T best will be picked. + // + // We deliberately allow #2 to be a candidate, as sometimes it's + // impossible to define #1 (e.g. when foo is ::std, defining + // anything in it is undefined behavior unless you are a compiler + // vendor.). + *os << value; +} + +} // namespace testing_internal + +namespace testing { +namespace internal { + +// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given +// value to the given ostream. The caller must ensure that +// 'ostream_ptr' is not NULL, or the behavior is undefined. +// +// We define UniversalPrinter as a class template (as opposed to a +// function template), as we need to partially specialize it for +// reference types, which cannot be done with function templates. 
+template <typename T> +class UniversalPrinter; + +template <typename T> +void UniversalPrint(const T& value, ::std::ostream* os); + +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +template <typename C> +void DefaultPrintTo(IsContainer /* dummy */, + false_type /* is not a pointer */, + const C& container, ::std::ostream* os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (typename C::const_iterator it = container.begin(); + it != container.end(); ++it, ++count) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(*it, os) here as PrintTo() doesn't + // handle *it being a native array. + internal::UniversalPrint(*it, os); + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; +} + +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. Therefore they will be printed as raw +// bytes.) +template <typename T> +void DefaultPrintTo(IsNotContainer /* dummy */, + true_type /* is a pointer */, + T* p, ::std::ostream* os) { + if (p == NULL) { + *os << "NULL"; + } else { + // C++ doesn't allow casting from a function pointer to any object + // pointer. + // + // IsTrue() silences warnings: "Condition is always true", + // "unreachable code". + if (IsTrue(ImplicitlyConvertible<T*, const void*>::value)) { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). 
We want to print p as a const + // void*. However, we cannot cast it to const void* directly, + // even using reinterpret_cast, as earlier versions of gcc + // (e.g. 3.4.5) cannot compile the cast when p is a function + // pointer. Casting to UInt64 first solves the problem. + *os << reinterpret_cast<const void*>( + reinterpret_cast<internal::UInt64>(p)); + } + } +} + +// Used to print a non-container, non-pointer value when the user +// doesn't define PrintTo() for it. +template <typename T> +void DefaultPrintTo(IsNotContainer /* dummy */, + false_type /* is not a pointer */, + const T& value, ::std::ostream* os) { + ::testing_internal::DefaultPrintNonContainerTo(value, os); +} + +// Prints the given value using the << operator if it has one; +// otherwise prints the bytes in it. This is what +// UniversalPrinter<T>::Print() does when PrintTo() is not specialized +// or overloaded for type T. +// +// A user can override this behavior for a class type Foo by defining +// an overload of PrintTo() in the namespace where Foo is defined. We +// give the user this option as sometimes defining a << operator for +// Foo is not desirable (e.g. the coding style may prevent doing it, +// or there is already a << operator but it doesn't do what the user +// wants). +template <typename T> +void PrintTo(const T& value, ::std::ostream* os) { + // DefaultPrintTo() is overloaded. The type of its first two + // arguments determine which version will be picked. If T is an + // STL-style container, the version for container will be called; if + // T is a pointer, the pointer version will be called; otherwise the + // generic version will be called. + // + // Note that we check for container types here, prior to we check + // for protocol message types in our operator<<. The rationale is: + // + // For protocol messages, we want to give people a chance to + // override Google Mock's format by defining a PrintTo() or + // operator<<. 
For STL containers, other formats can be + // incompatible with Google Mock's format for the container + // elements; therefore we check for container types here to ensure + // that our format is used. + // + // The second argument of DefaultPrintTo() is needed to bypass a bug + // in Symbian's C++ compiler that prevents it from picking the right + // overload between: + // + // PrintTo(const T& x, ...); + // PrintTo(T* x, ...); + DefaultPrintTo(IsContainerTest<T>(0), is_pointer<T>(), value, os); +} + +// The following list of PrintTo() overloads tells +// UniversalPrinter<T>::Print() how to print standard types (built-in +// types, strings, plain arrays, and pointers). + +// Overloads for various char types. +GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); +GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); +inline void PrintTo(char c, ::std::ostream* os) { + // When printing a plain char, we always treat it as unsigned. This + // way, the output won't be affected by whether the compiler thinks + // char is signed or not. + PrintTo(static_cast<unsigned char>(c), os); +} + +// Overloads for other simple built-in types. +inline void PrintTo(bool x, ::std::ostream* os) { + *os << (x ? "true" : "false"); +} + +// Overload for wchar_t type. +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its decimal code (except for L'\0'). +// The L'\0' char is printed as "L'\\0'". The decimal code is printed +// as signed integer when wchar_t is implemented by the compiler +// as a signed type and is printed as an unsigned integer when wchar_t +// is implemented as an unsigned type. +GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); + +// Overloads for C strings. 
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); +inline void PrintTo(char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_<const char*>(s), os); +} + +// signed/unsigned char is often used for representing binary data, so +// we print pointers to it as void* to be safe. +inline void PrintTo(const signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_<const void*>(s), os); +} +inline void PrintTo(signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_<const void*>(s), os); +} +inline void PrintTo(const unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_<const void*>(s), os); +} +inline void PrintTo(unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_<const void*>(s), os); +} + +// MSVC can be configured to define wchar_t as a typedef of unsigned +// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native +// type. When wchar_t is a typedef, defining an overload for const +// wchar_t* would cause unsigned short* be printed as a wide string, +// possibly causing invalid memory accesses. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Overloads for wide C strings +GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); +inline void PrintTo(wchar_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_<const wchar_t*>(s), os); +} +#endif + +// Overload for C arrays. Multi-dimensional arrays are printed +// properly. + +// Prints the given number of elements in an array, without printing +// the curly braces. +template <typename T> +void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { + UniversalPrint(a[0], os); + for (size_t i = 1; i != count; i++) { + *os << ", "; + UniversalPrint(a[i], os); + } +} + +// Overloads for ::string and ::std::string. 
+#if GTEST_HAS_GLOBAL_STRING +GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os); +inline void PrintTo(const ::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} +#endif // GTEST_HAS_GLOBAL_STRING + +GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); +inline void PrintTo(const ::std::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} + +// Overloads for ::wstring and ::std::wstring. +#if GTEST_HAS_GLOBAL_WSTRING +GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os); +inline void PrintTo(const ::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +#if GTEST_HAS_STD_WSTRING +GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); +inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_TR1_TUPLE +// Overload for ::std::tr1::tuple. Needed for printing function arguments, +// which are packed as tuples. + +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. +template <typename T> +void PrintTupleTo(const T& t, ::std::ostream* os); + +// Overloaded PrintTo() for tuples of various arities. We support +// tuples of up-to 10 fields. The following implementation works +// regardless of whether tr1::tuple is implemented using the +// non-standard variadic template feature or not. 
+ +inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1> +void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1, typename T2> +void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1, typename T2, typename T3> +void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1, typename T2, typename T3, typename T4> +void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5> +void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6> +void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7> +void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8> +void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9> +void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10> +void PrintTo( + const 
::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} +#endif // GTEST_HAS_TR1_TUPLE + +// Overload for std::pair. +template <typename T1, typename T2> +void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) { + *os << '('; + // We cannot use UniversalPrint(value.first, os) here, as T1 may be + // a reference type. The same for printing value.second. + UniversalPrinter<T1>::Print(value.first, os); + *os << ", "; + UniversalPrinter<T2>::Print(value.second, os); + *os << ')'; +} + +// Implements printing a non-reference type T by letting the compiler +// pick the right overload of PrintTo() for T. +template <typename T> +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4180) // Temporarily disables warning 4180. +#endif // _MSC_VER + + // Note: we deliberately don't call this PrintTo(), as that name + // conflicts with ::testing::internal::PrintTo in the body of the + // function. + static void Print(const T& value, ::std::ostream* os) { + // By default, ::testing::internal::PrintTo() is used for printing + // the value. + // + // Thanks to Koenig look-up, if T is a class and has its own + // PrintTo() function defined in its namespace, that function will + // be visible here. Since it is more specific than the generic ones + // in ::testing::internal, it will be picked by the compiler in the + // following statement - exactly what we want. + PrintTo(value, os); + } + +#ifdef _MSC_VER +# pragma warning(pop) // Restores the warning state. +#endif // _MSC_VER +}; + +// UniversalPrintArray(begin, len, os) prints an array of 'len' +// elements, starting at address 'begin'. 
+template <typename T> +void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { + if (len == 0) { + *os << "{}"; + } else { + *os << "{ "; + const size_t kThreshold = 18; + const size_t kChunkSize = 8; + // If the array has more than kThreshold elements, we'll have to + // omit some details by printing only the first and the last + // kChunkSize elements. + // TODO(wan@google.com): let the user control the threshold using a flag. + if (len <= kThreshold) { + PrintRawArrayTo(begin, len, os); + } else { + PrintRawArrayTo(begin, kChunkSize, os); + *os << ", ..., "; + PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); + } + *os << " }"; + } +} +// This overload prints a (const) char array compactly. +GTEST_API_ void UniversalPrintArray( + const char* begin, size_t len, ::std::ostream* os); + +// This overload prints a (const) wchar_t array compactly. +GTEST_API_ void UniversalPrintArray( + const wchar_t* begin, size_t len, ::std::ostream* os); + +// Implements printing an array type T[N]. +template <typename T, size_t N> +class UniversalPrinter<T[N]> { + public: + // Prints the given array, omitting some elements when there are too + // many. + static void Print(const T (&a)[N], ::std::ostream* os) { + UniversalPrintArray(a, N, os); + } +}; + +// Implements printing a reference type T&. +template <typename T> +class UniversalPrinter<T&> { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4180) // Temporarily disables warning 4180. +#endif // _MSC_VER + + static void Print(const T& value, ::std::ostream* os) { + // Prints the address of the value. We use reinterpret_cast here + // as static_cast doesn't compile when T is a function type. + *os << "@" << reinterpret_cast<const void*>(&value) << " "; + + // Then prints the value itself. 
+ UniversalPrint(value, os); + } + +#ifdef _MSC_VER +# pragma warning(pop) // Restores the warning state. +#endif // _MSC_VER +}; + +// Prints a value tersely: for a reference type, the referenced value +// (but not the address) is printed; for a (const) char pointer, the +// NUL-terminated string (but not the pointer) is printed. + +template <typename T> +class UniversalTersePrinter { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template <typename T> +class UniversalTersePrinter<T&> { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template <typename T, size_t N> +class UniversalTersePrinter<T[N]> { + public: + static void Print(const T (&value)[N], ::std::ostream* os) { + UniversalPrinter<T[N]>::Print(value, os); + } +}; +template <> +class UniversalTersePrinter<const char*> { + public: + static void Print(const char* str, ::std::ostream* os) { + if (str == NULL) { + *os << "NULL"; + } else { + UniversalPrint(string(str), os); + } + } +}; +template <> +class UniversalTersePrinter<char*> { + public: + static void Print(char* str, ::std::ostream* os) { + UniversalTersePrinter<const char*>::Print(str, os); + } +}; + +#if GTEST_HAS_STD_WSTRING +template <> +class UniversalTersePrinter<const wchar_t*> { + public: + static void Print(const wchar_t* str, ::std::ostream* os) { + if (str == NULL) { + *os << "NULL"; + } else { + UniversalPrint(::std::wstring(str), os); + } + } +}; +#endif + +template <> +class UniversalTersePrinter<wchar_t*> { + public: + static void Print(wchar_t* str, ::std::ostream* os) { + UniversalTersePrinter<const wchar_t*>::Print(str, os); + } +}; + +template <typename T> +void UniversalTersePrint(const T& value, ::std::ostream* os) { + UniversalTersePrinter<T>::Print(value, os); +} + +// Prints a value using the type inferred by the compiler. 
The +// difference between this and UniversalTersePrint() is that for a +// (const) char pointer, this prints both the pointer and the +// NUL-terminated string. +template <typename T> +void UniversalPrint(const T& value, ::std::ostream* os) { + // A workaround for the bug in VC++ 7.1 that prevents us from instantiating + // UniversalPrinter with T directly. + typedef T T1; + UniversalPrinter<T1>::Print(value, os); +} + +#if GTEST_HAS_TR1_TUPLE +typedef ::std::vector<string> Strings; + +// This helper template allows PrintTo() for tuples and +// UniversalTersePrintTupleFieldsToStrings() to be defined by +// induction on the number of tuple fields. The idea is that +// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N +// fields in tuple t, and can be defined in terms of +// TuplePrefixPrinter<N - 1>. + +// The inductive case. +template <size_t N> +struct TuplePrefixPrinter { + // Prints the first N fields of a tuple. + template <typename Tuple> + static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { + TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os); + *os << ", "; + UniversalPrinter<typename ::std::tr1::tuple_element<N - 1, Tuple>::type> + ::Print(::std::tr1::get<N - 1>(t), os); + } + + // Tersely prints the first N fields of a tuple to a string vector, + // one element for each field. + template <typename Tuple> + static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { + TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings); + ::std::stringstream ss; + UniversalTersePrint(::std::tr1::get<N - 1>(t), &ss); + strings->push_back(ss.str()); + } +}; + +// Base cases. 
+template <> +struct TuplePrefixPrinter<0> { + template <typename Tuple> + static void PrintPrefixTo(const Tuple&, ::std::ostream*) {} + + template <typename Tuple> + static void TersePrintPrefixToStrings(const Tuple&, Strings*) {} +}; +// We have to specialize the entire TuplePrefixPrinter<> class +// template here, even though the definition of +// TersePrintPrefixToStrings() is the same as the generic version, as +// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't +// support specializing a method template of a class template. +template <> +struct TuplePrefixPrinter<1> { + template <typename Tuple> + static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { + UniversalPrinter<typename ::std::tr1::tuple_element<0, Tuple>::type>:: + Print(::std::tr1::get<0>(t), os); + } + + template <typename Tuple> + static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { + ::std::stringstream ss; + UniversalTersePrint(::std::tr1::get<0>(t), &ss); + strings->push_back(ss.str()); + } +}; + +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. +template <typename T> +void PrintTupleTo(const T& t, ::std::ostream* os) { + *os << "("; + TuplePrefixPrinter< ::std::tr1::tuple_size<T>::value>:: + PrintPrefixTo(t, os); + *os << ")"; +} + +// Prints the fields of a tuple tersely to a string vector, one +// element for each field. See the comment before +// UniversalTersePrint() for how we define "tersely". 
+template <typename Tuple> +Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { + Strings result; + TuplePrefixPrinter< ::std::tr1::tuple_size<Tuple>::value>:: + TersePrintPrefixToStrings(value, &result); + return result; +} +#endif // GTEST_HAS_TR1_TUPLE + +} // namespace internal + +template <typename T> +::std::string PrintToString(const T& value) { + ::std::stringstream ss; + internal::UniversalTersePrinter<T>::Print(value, &ss); + return ss.str(); +} + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#if GTEST_HAS_PARAM_TEST + +namespace testing { +namespace internal { + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Outputs a message explaining invalid registration of different +// fixture class for the same test case. This may happen when +// TEST_P macro is used to define two tests with the same name +// but in different namespaces. +GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, + const char* file, int line); + +template <typename> class ParamGeneratorInterface; +template <typename> class ParamGenerator; + +// Interface for iterating over elements provided by an implementation +// of ParamGeneratorInterface<T>. +template <typename T> +class ParamIteratorInterface { + public: + virtual ~ParamIteratorInterface() {} + // A pointer to the base generator instance. + // Used only for the purposes of iterator comparison + // to make sure that two iterators belong to the same generator. + virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0; + // Advances iterator to point to the next element + // provided by the generator. The caller is responsible + // for not calling Advance() on an iterator equal to + // BaseGenerator()->End(). + virtual void Advance() = 0; + // Clones the iterator object. Used for implementing copy semantics + // of ParamIterator<T>. 
+ virtual ParamIteratorInterface* Clone() const = 0; + // Dereferences the current iterator and provides (read-only) access + // to the pointed value. It is the caller's responsibility not to call + // Current() on an iterator equal to BaseGenerator()->End(). + // Used for implementing ParamGenerator<T>::operator*(). + virtual const T* Current() const = 0; + // Determines whether the given iterator and other point to the same + // element in the sequence generated by the generator. + // Used for implementing ParamGenerator<T>::operator==(). + virtual bool Equals(const ParamIteratorInterface& other) const = 0; +}; + +// Class iterating over elements provided by an implementation of +// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T> +// and implements the const forward iterator concept. +template <typename T> +class ParamIterator { + public: + typedef T value_type; + typedef const T& reference; + typedef ptrdiff_t difference_type; + + // ParamIterator assumes ownership of the impl_ pointer. + ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} + ParamIterator& operator=(const ParamIterator& other) { + if (this != &other) + impl_.reset(other.impl_->Clone()); + return *this; + } + + const T& operator*() const { return *impl_->Current(); } + const T* operator->() const { return impl_->Current(); } + // Prefix version of operator++. + ParamIterator& operator++() { + impl_->Advance(); + return *this; + } + // Postfix version of operator++. 
+ ParamIterator operator++(int /*unused*/) { + ParamIteratorInterface<T>* clone = impl_->Clone(); + impl_->Advance(); + return ParamIterator(clone); + } + bool operator==(const ParamIterator& other) const { + return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); + } + bool operator!=(const ParamIterator& other) const { + return !(*this == other); + } + + private: + friend class ParamGenerator<T>; + explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {} + scoped_ptr<ParamIteratorInterface<T> > impl_; +}; + +// ParamGeneratorInterface<T> is the binary interface to access generators +// defined in other translation units. +template <typename T> +class ParamGeneratorInterface { + public: + typedef T ParamType; + + virtual ~ParamGeneratorInterface() {} + + // Generator interface definition + virtual ParamIteratorInterface<T>* Begin() const = 0; + virtual ParamIteratorInterface<T>* End() const = 0; +}; + +// Wraps ParamGeneratorInterface<T> and provides general generator syntax +// compatible with the STL Container concept. +// This class implements copy initialization semantics and the contained +// ParamGeneratorInterface<T> instance is shared among all copies +// of the original object. This is possible because that instance is immutable. +template<typename T> +class ParamGenerator { + public: + typedef ParamIterator<T> iterator; + + explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {} + ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} + + ParamGenerator& operator=(const ParamGenerator& other) { + impl_ = other.impl_; + return *this; + } + + iterator begin() const { return iterator(impl_->Begin()); } + iterator end() const { return iterator(impl_->End()); } + + private: + linked_ptr<const ParamGeneratorInterface<T> > impl_; +}; + +// Generates values from a range of two comparable values. 
Can be used to +// generate sequences of user-defined types that implement operator+() and +// operator<(). +// This class is used in the Range() function. +template <typename T, typename IncrementT> +class RangeGenerator : public ParamGeneratorInterface<T> { + public: + RangeGenerator(T begin, T end, IncrementT step) + : begin_(begin), end_(end), + step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} + virtual ~RangeGenerator() {} + + virtual ParamIteratorInterface<T>* Begin() const { + return new Iterator(this, begin_, 0, step_); + } + virtual ParamIteratorInterface<T>* End() const { + return new Iterator(this, end_, end_index_, step_); + } + + private: + class Iterator : public ParamIteratorInterface<T> { + public: + Iterator(const ParamGeneratorInterface<T>* base, T value, int index, + IncrementT step) + : base_(base), value_(value), index_(index), step_(step) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<T>* BaseGenerator() const { + return base_; + } + virtual void Advance() { + value_ = value_ + step_; + index_++; + } + virtual ParamIteratorInterface<T>* Clone() const { + return new Iterator(*this); + } + virtual const T* Current() const { return &value_; } + virtual bool Equals(const ParamIteratorInterface<T>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const int other_index = + CheckedDowncastToActualType<const Iterator>(&other)->index_; + return index_ == other_index; + } + + private: + Iterator(const Iterator& other) + : ParamIteratorInterface<T>(), + base_(other.base_), value_(other.value_), index_(other.index_), + step_(other.step_) {} + + // No implementation - assignment is unsupported. 
+ void operator=(const Iterator& other); + + const ParamGeneratorInterface<T>* const base_; + T value_; + int index_; + const IncrementT step_; + }; // class RangeGenerator::Iterator + + static int CalculateEndIndex(const T& begin, + const T& end, + const IncrementT& step) { + int end_index = 0; + for (T i = begin; i < end; i = i + step) + end_index++; + return end_index; + } + + // No implementation - assignment is unsupported. + void operator=(const RangeGenerator& other); + + const T begin_; + const T end_; + const IncrementT step_; + // The index for the end() iterator. All the elements in the generated + // sequence are indexed (0-based) to aid iterator comparison. + const int end_index_; +}; // class RangeGenerator + + +// Generates values from a pair of STL-style iterators. Used in the +// ValuesIn() function. The elements are copied from the source range +// since the source can be located on the stack, and the generator +// is likely to persist beyond that stack frame. +template <typename T> +class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> { + public: + template <typename ForwardIterator> + ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) + : container_(begin, end) {} + virtual ~ValuesInIteratorRangeGenerator() {} + + virtual ParamIteratorInterface<T>* Begin() const { + return new Iterator(this, container_.begin()); + } + virtual ParamIteratorInterface<T>* End() const { + return new Iterator(this, container_.end()); + } + + private: + typedef typename ::std::vector<T> ContainerType; + + class Iterator : public ParamIteratorInterface<T> { + public: + Iterator(const ParamGeneratorInterface<T>* base, + typename ContainerType::const_iterator iterator) + : base_(base), iterator_(iterator) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<T>* BaseGenerator() const { + return base_; + } + virtual void Advance() { + ++iterator_; + value_.reset(); + } + virtual ParamIteratorInterface<T>* Clone() 
const { + return new Iterator(*this); + } + // We need to use cached value referenced by iterator_ because *iterator_ + // can return a temporary object (and of type other then T), so just + // having "return &*iterator_;" doesn't work. + // value_ is updated here and not in Advance() because Advance() + // can advance iterator_ beyond the end of the range, and we cannot + // detect that fact. The client code, on the other hand, is + // responsible for not calling Current() on an out-of-range iterator. + virtual const T* Current() const { + if (value_.get() == NULL) + value_.reset(new T(*iterator_)); + return value_.get(); + } + virtual bool Equals(const ParamIteratorInterface<T>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + return iterator_ == + CheckedDowncastToActualType<const Iterator>(&other)->iterator_; + } + + private: + Iterator(const Iterator& other) + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. + : ParamIteratorInterface<T>(), + base_(other.base_), + iterator_(other.iterator_) {} + + const ParamGeneratorInterface<T>* const base_; + typename ContainerType::const_iterator iterator_; + // A cached value of *iterator_. We keep it here to allow access by + // pointer in the wrapping iterator's operator->(). + // value_ needs to be mutable to be accessed in Current(). + // Use of scoped_ptr helps manage cached value's lifetime, + // which is bound by the lifespan of the iterator itself. + mutable scoped_ptr<const T> value_; + }; // class ValuesInIteratorRangeGenerator::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const ValuesInIteratorRangeGenerator& other); + + const ContainerType container_; +}; // class ValuesInIteratorRangeGenerator + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Stores a parameter value and later creates tests parameterized with that +// value. +template <class TestClass> +class ParameterizedTestFactory : public TestFactoryBase { + public: + typedef typename TestClass::ParamType ParamType; + explicit ParameterizedTestFactory(ParamType parameter) : + parameter_(parameter) {} + virtual Test* CreateTest() { + TestClass::SetParam(¶meter_); + return new TestClass(); + } + + private: + const ParamType parameter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactoryBase is a base class for meta-factories that create +// test factories for passing into MakeAndRegisterTestInfo function. +template <class ParamType> +class TestMetaFactoryBase { + public: + virtual ~TestMetaFactoryBase() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactory creates test factories for passing into +// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives +// ownership of test factory pointer, same factory object cannot be passed +// into that method twice. But ParameterizedTestCaseInfo is going to call +// it for each Test/Parameter value combination. Thus it needs meta factory +// creator class. 
+template <class TestCase> +class TestMetaFactory + : public TestMetaFactoryBase<typename TestCase::ParamType> { + public: + typedef typename TestCase::ParamType ParamType; + + TestMetaFactory() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { + return new ParameterizedTestFactory<TestCase>(parameter); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestCaseInfoBase is a generic interface +// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase +// accumulates test information provided by TEST_P macro invocations +// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations +// and uses that information to register all resulting test instances +// in RegisterTests method. The ParameterizeTestCaseRegistry class holds +// a collection of pointers to the ParameterizedTestCaseInfo objects +// and calls RegisterTests() on each of them when asked. +class ParameterizedTestCaseInfoBase { + public: + virtual ~ParameterizedTestCaseInfoBase() {} + + // Base part of test case name for display purposes. + virtual const string& GetTestCaseName() const = 0; + // Test case id to verify identity. + virtual TypeId GetTestCaseTypeId() const = 0; + // UnitTest class invokes this method to register tests in this + // test case right before running them in RUN_ALL_TESTS macro. + // This method should not be called more then once on any single + // instance of a ParameterizedTestCaseInfoBase derived class. + virtual void RegisterTests() = 0; + + protected: + ParameterizedTestCaseInfoBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
+// +// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P +// macro invocations for a particular test case and generators +// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that +// test case. It registers tests with all values generated by all +// generators when asked. +template <class TestCase> +class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { + public: + // ParamType and GeneratorCreationFunc are private types but are required + // for declarations of public methods AddTestPattern() and + // AddTestCaseInstantiation(). + typedef typename TestCase::ParamType ParamType; + // A function that returns an instance of appropriate generator type. + typedef ParamGenerator<ParamType>(GeneratorCreationFunc)(); + + explicit ParameterizedTestCaseInfo(const char* name) + : test_case_name_(name) {} + + // Test case base name for display purposes. + virtual const string& GetTestCaseName() const { return test_case_name_; } + // Test case id to verify identity. + virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); } + // TEST_P macro uses AddTestPattern() to record information + // about a single test in a LocalTestInfo structure. + // test_case_name is the base name of the test case (without invocation + // prefix). test_base_name is the name of an individual test without + // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is + // test case base name and DoBar is test base name. + void AddTestPattern(const char* test_case_name, + const char* test_base_name, + TestMetaFactoryBase<ParamType>* meta_factory) { + tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name, + test_base_name, + meta_factory))); + } + // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information + // about a generator. 
+ int AddTestCaseInstantiation(const string& instantiation_name, + GeneratorCreationFunc* func, + const char* /* file */, + int /* line */) { + instantiations_.push_back(::std::make_pair(instantiation_name, func)); + return 0; // Return value used only to run this method in namespace scope. + } + // UnitTest class invokes this method to register tests in this test case + // test cases right before running tests in RUN_ALL_TESTS macro. + // This method should not be called more then once on any single + // instance of a ParameterizedTestCaseInfoBase derived class. + // UnitTest has a guard to prevent from calling this method more then once. + virtual void RegisterTests() { + for (typename TestInfoContainer::iterator test_it = tests_.begin(); + test_it != tests_.end(); ++test_it) { + linked_ptr<TestInfo> test_info = *test_it; + for (typename InstantiationContainer::iterator gen_it = + instantiations_.begin(); gen_it != instantiations_.end(); + ++gen_it) { + const string& instantiation_name = gen_it->first; + ParamGenerator<ParamType> generator((*gen_it->second)()); + + string test_case_name; + if ( !instantiation_name.empty() ) + test_case_name = instantiation_name + "/"; + test_case_name += test_info->test_case_base_name; + + int i = 0; + for (typename ParamGenerator<ParamType>::iterator param_it = + generator.begin(); + param_it != generator.end(); ++param_it, ++i) { + Message test_name_stream; + test_name_stream << test_info->test_base_name << "/" << i; + MakeAndRegisterTestInfo( + test_case_name.c_str(), + test_name_stream.GetString().c_str(), + NULL, // No type parameter. + PrintToString(*param_it).c_str(), + GetTestCaseTypeId(), + TestCase::SetUpTestCase, + TestCase::TearDownTestCase, + test_info->test_meta_factory->CreateTestFactory(*param_it)); + } // for param_it + } // for gen_it + } // for test_it + } // RegisterTests + + private: + // LocalTestInfo structure keeps information about a single test registered + // with TEST_P macro. 
+ struct TestInfo { + TestInfo(const char* a_test_case_base_name, + const char* a_test_base_name, + TestMetaFactoryBase<ParamType>* a_test_meta_factory) : + test_case_base_name(a_test_case_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory) {} + + const string test_case_base_name; + const string test_base_name; + const scoped_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory; + }; + typedef ::std::vector<linked_ptr<TestInfo> > TestInfoContainer; + // Keeps pairs of <Instantiation name, Sequence generator creation function> + // received from INSTANTIATE_TEST_CASE_P macros. + typedef ::std::vector<std::pair<string, GeneratorCreationFunc*> > + InstantiationContainer; + + const string test_case_name_; + TestInfoContainer tests_; + InstantiationContainer instantiations_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); +}; // class ParameterizedTestCaseInfo + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase +// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P +// macros use it to locate their corresponding ParameterizedTestCaseInfo +// descriptors. +class ParameterizedTestCaseRegistry { + public: + ParameterizedTestCaseRegistry() {} + ~ParameterizedTestCaseRegistry() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + delete *it; + } + } + + // Looks up or creates and returns a structure containing information about + // tests and instantiations of a particular test case. 
+ template <class TestCase> + ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder( + const char* test_case_name, + const char* file, + int line) { + ParameterizedTestCaseInfo<TestCase>* typed_test_info = NULL; + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + if ((*it)->GetTestCaseName() == test_case_name) { + if ((*it)->GetTestCaseTypeId() != GetTypeId<TestCase>()) { + // Complain about incorrect usage of Google Test facilities + // and terminate the program since we cannot guaranty correct + // test case setup and tear-down in this case. + ReportInvalidTestCaseType(test_case_name, file, line); + posix::Abort(); + } else { + // At this point we are sure that the object we found is of the same + // type we are looking for, so we downcast it to that type + // without further checks. + typed_test_info = CheckedDowncastToActualType< + ParameterizedTestCaseInfo<TestCase> >(*it); + } + break; + } + } + if (typed_test_info == NULL) { + typed_test_info = new ParameterizedTestCaseInfo<TestCase>(test_case_name); + test_case_infos_.push_back(typed_test_info); + } + return typed_test_info; + } + void RegisterTests() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + (*it)->RegisterTests(); + } + } + + private: + typedef ::std::vector<ParameterizedTestCaseInfoBase*> TestCaseInfoContainer; + + TestCaseInfoContainer test_case_infos_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); +}; + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +// This file was GENERATED by command: +// pump.py gtest-param-util-generated.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008 Google Inc. +// All Rights Reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently Google Test supports at most 50 arguments in Values, +// and at most 10 arguments in Combine. Please contact +// googletestframework@googlegroups.com if you need more. 
+// Please note that the number of arguments to Combine is limited +// by the maximum arity of the implementation of tr1::tuple which is +// currently set at 10. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. +template <typename ForwardIterator> +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end); + +template <typename T, size_t N> +internal::ParamGenerator<T> ValuesIn(const T (&array)[N]); + +template <class Container> +internal::ParamGenerator<typename Container::value_type> ValuesIn( + const Container& container); + +namespace internal { + +// Used in the Values() function to provide polymorphic capabilities. +template <typename T1> +class ValueArray1 { + public: + explicit ValueArray1(T1 v1) : v1_(v1) {} + + template <typename T> + operator ParamGenerator<T>() const { return ValuesIn(&v1_, &v1_ + 1); } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray1& other); + + const T1 v1_; +}; + +template <typename T1, typename T2> +class ValueArray2 { + public: + ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray2& other); + + const T1 v1_; + const T2 v2_; +}; + +template <typename T1, typename T2, typename T3> +class ValueArray3 { + public: + ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray3& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; +}; + +template <typename T1, typename T2, typename T3, typename T4> +class ValueArray4 { + public: + ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray4& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5> +class ValueArray5 { + public: + ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray5& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6> +class ValueArray6 { + public: + ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray6& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7> +class ValueArray7 { + public: + ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray7& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8> +class ValueArray8 { + public: + ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray8& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9> +class ValueArray9 { + public: + ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray9& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10> +class ValueArray10 { + public: + ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray10& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11> +class ValueArray11 { + public: + ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray11& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12> +class ValueArray12 { + public: + ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray12& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13> +class ValueArray13 { + public: + ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray13& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14> +class ValueArray14 { + public: + ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray14& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15> +class ValueArray15 { + public: + ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray15& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16> +class ValueArray16 { + public: + ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray16& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17> +class ValueArray17 { + public: + ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray17& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18> +class ValueArray18 { + public: + ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray18& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19> +class ValueArray19 { + public: + ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray19& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20> +class ValueArray20 { + public: + ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray20& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21> +class ValueArray21 { + public: + ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray21& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22> +class ValueArray22 { + public: + ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray22& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23> +class ValueArray23 { + public: + ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray23& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24> +class ValueArray24 { + public: + ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_)}; + return ValuesIn(array); + } + + private: + // No implementation - 
assignment is unsupported. + void operator=(const ValueArray24& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25> +class ValueArray25 { + public: + ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + 
static_cast<T>(v24_), static_cast<T>(v25_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray25& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26> +class ValueArray26 { + public: + ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), 
static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray26& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27> +class ValueArray27 { + public: + ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), 
static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray27& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28> +class ValueArray28 { + public: + ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), 
v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray28& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29> +class ValueArray29 { + public: + ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 
v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray29& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30> +class ValueArray30 { + public: + ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), 
static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray30& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31> +class ValueArray31 { + public: + ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), 
v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray31& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32> +class ValueArray32 { + public: + ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + 
static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray32& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33> +class ValueArray33 { + public: + ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 
v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray33& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34> +class ValueArray34 { + public: + ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), 
static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray34& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35> +class ValueArray35 { + public: + ValueArray35(T1 v1, T2 
v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), + v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray35& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36> +class ValueArray36 { + public: + ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), + v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} + + template <typename T> + operator ParamGenerator<T>() 
const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray36& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename 
T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37> +class ValueArray37 { + public: + ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), + v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), + v36_(v36), v37_(v37) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray37& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38> +class ValueArray38 { + public: + ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + 
v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray38& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39> +class ValueArray39 { + public: + ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), 
v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray39& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40> +class ValueArray40 { + public: + ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), 
+ v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), + v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), + v40_(v40) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray40& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41> +class ValueArray41 { + public: + ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, + T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), 
v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray41& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42> +class ValueArray42 { + public: + ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), 
v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_), + static_cast<T>(v42_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray42& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43> +class ValueArray43 { + public: + ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), 
v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), + v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), + v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_), + static_cast<T>(v42_), static_cast<T>(v43_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray43& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44> +class ValueArray44 { + public: + ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + 
v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), + v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), + v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), + v43_(v43), v44_(v44) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_), + static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray44& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45> +class ValueArray45 { + public: + ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), 
v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), + v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), + v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), + v42_(v42), v43_(v43), v44_(v44), v45_(v45) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_), + static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_), + static_cast<T>(v45_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray45& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46> +class ValueArray46 { + public: + ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3), + 
v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), + v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_), + static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_), + static_cast<T>(v45_), static_cast<T>(v46_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray46& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47> +class ValueArray47 { + public: + ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 
v47) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), + v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46), + v47_(v47) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_), + static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_), + static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray47& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48> +class ValueArray48 { + public: + ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, 
T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), + v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), + v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), + v46_(v46), v47_(v47), v48_(v48) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_), + static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_), + static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_), + static_cast<T>(v48_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray48& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48, typename T49> +class ValueArray49 { + public: + ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 
v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, + T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), + v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_), + static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_), + static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_), + static_cast<T>(v48_), static_cast<T>(v49_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray49& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; + const T49 v49_; +}; + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48, typename T49, typename T50> +class ValueArray50 { + public: + ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, 
T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49, + T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), + v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {} + + template <typename T> + operator ParamGenerator<T>() const { + const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_), + static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_), + static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_), + static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_), + static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_), + static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_), + static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_), + static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_), + static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_), + static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_), + static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_), + static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_), + static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_), + static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_), + static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_), + static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_), + static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray50& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; + const T49 v49_; + const T50 v50_; +}; + +# if GTEST_HAS_COMBINE +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Generates values from the Cartesian product of values produced +// by the argument generators. 
+// +template <typename T1, typename T2> +class CartesianProductGenerator2 + : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2> > { + public: + typedef ::std::tr1::tuple<T1, T2> ParamType; + + CartesianProductGenerator2(const ParamGenerator<T1>& g1, + const ParamGenerator<T2>& g2) + : g1_(g1), g2_(g2) {} + virtual ~CartesianProductGenerator2() {} + + virtual ParamIteratorInterface<ParamType>* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin()); + } + virtual ParamIteratorInterface<ParamType>* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end()); + } + + private: + class Iterator : public ParamIteratorInterface<ParamType> { + public: + Iterator(const ParamGeneratorInterface<ParamType>* base, + const ParamGenerator<T1>& g1, + const typename ParamGenerator<T1>::iterator& current1, + const ParamGenerator<T2>& g2, + const typename ParamGenerator<T2>::iterator& current2) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current2_; + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface<ParamType>* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. 
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType<const Iterator>(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface<ParamType>* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator<T1>::iterator begin1_; + const typename ParamGenerator<T1>::iterator end1_; + typename ParamGenerator<T1>::iterator current1_; + const typename ParamGenerator<T2>::iterator begin2_; + const typename ParamGenerator<T2>::iterator end2_; + typename ParamGenerator<T2>::iterator current2_; + ParamType current_value_; + }; // class CartesianProductGenerator2::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator2& other); + + const ParamGenerator<T1> g1_; + const ParamGenerator<T2> g2_; +}; // class CartesianProductGenerator2 + + +template <typename T1, typename T2, typename T3> +class CartesianProductGenerator3 + : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3> > { + public: + typedef ::std::tr1::tuple<T1, T2, T3> ParamType; + + CartesianProductGenerator3(const ParamGenerator<T1>& g1, + const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3) + : g1_(g1), g2_(g2), g3_(g3) {} + virtual ~CartesianProductGenerator3() {} + + virtual ParamIteratorInterface<ParamType>* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin()); + } + virtual ParamIteratorInterface<ParamType>* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end()); + } + + private: + class Iterator : public ParamIteratorInterface<ParamType> { + public: + Iterator(const ParamGeneratorInterface<ParamType>* base, + const ParamGenerator<T1>& g1, + const typename ParamGenerator<T1>::iterator& current1, + const ParamGenerator<T2>& g2, + const typename ParamGenerator<T2>::iterator& current2, + const ParamGenerator<T3>& g3, + const typename ParamGenerator<T3>::iterator& current3) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current3_; + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface<ParamType>* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType<const Iterator>(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_; + } + + // No implementation - assignment is unsupported. 
+ void operator=(const Iterator& other); + + const ParamGeneratorInterface<ParamType>* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator<T1>::iterator begin1_; + const typename ParamGenerator<T1>::iterator end1_; + typename ParamGenerator<T1>::iterator current1_; + const typename ParamGenerator<T2>::iterator begin2_; + const typename ParamGenerator<T2>::iterator end2_; + typename ParamGenerator<T2>::iterator current2_; + const typename ParamGenerator<T3>::iterator begin3_; + const typename ParamGenerator<T3>::iterator end3_; + typename ParamGenerator<T3>::iterator current3_; + ParamType current_value_; + }; // class CartesianProductGenerator3::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator3& other); + + const ParamGenerator<T1> g1_; + const ParamGenerator<T2> g2_; + const ParamGenerator<T3> g3_; +}; // class CartesianProductGenerator3 + + +template <typename T1, typename T2, typename T3, typename T4> +class CartesianProductGenerator4 + : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4> > { + public: + typedef ::std::tr1::tuple<T1, T2, T3, T4> ParamType; + + CartesianProductGenerator4(const ParamGenerator<T1>& g1, + const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3, + const ParamGenerator<T4>& g4) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} + virtual ~CartesianProductGenerator4() {} + + virtual ParamIteratorInterface<ParamType>* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin()); + } + virtual ParamIteratorInterface<ParamType>* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end()); + } + + private: + class Iterator : public ParamIteratorInterface<ParamType> { + public: + Iterator(const ParamGeneratorInterface<ParamType>* base, + const 
ParamGenerator<T1>& g1, + const typename ParamGenerator<T1>::iterator& current1, + const ParamGenerator<T2>& g2, + const typename ParamGenerator<T2>::iterator& current2, + const ParamGenerator<T3>& g3, + const typename ParamGenerator<T3>::iterator& current3, + const ParamGenerator<T4>& g4, + const typename ParamGenerator<T4>::iterator& current4) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current4_; + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface<ParamType>* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType<const Iterator>(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface<ParamType>* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator<T1>::iterator begin1_; + const typename ParamGenerator<T1>::iterator end1_; + typename ParamGenerator<T1>::iterator current1_; + const typename ParamGenerator<T2>::iterator begin2_; + const typename ParamGenerator<T2>::iterator end2_; + typename ParamGenerator<T2>::iterator current2_; + const typename ParamGenerator<T3>::iterator begin3_; + const typename ParamGenerator<T3>::iterator end3_; + typename ParamGenerator<T3>::iterator current3_; + const typename ParamGenerator<T4>::iterator begin4_; + const typename ParamGenerator<T4>::iterator end4_; + typename ParamGenerator<T4>::iterator current4_; + ParamType current_value_; + }; // class CartesianProductGenerator4::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator4& other); + + const ParamGenerator<T1> g1_; + const ParamGenerator<T2> g2_; + const ParamGenerator<T3> g3_; + const ParamGenerator<T4> g4_; +}; // class CartesianProductGenerator4 + + +template <typename T1, typename T2, typename T3, typename T4, typename T5> +class CartesianProductGenerator5 + : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5> > { + public: + typedef ::std::tr1::tuple<T1, T2, T3, T4, T5> ParamType; + + CartesianProductGenerator5(const ParamGenerator<T1>& g1, + const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3, + const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + virtual ~CartesianProductGenerator5() {} + + virtual ParamIteratorInterface<ParamType>* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); + } + virtual ParamIteratorInterface<ParamType>* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end()); + } + + private: + class Iterator : public ParamIteratorInterface<ParamType> { + public: + 
Iterator(const ParamGeneratorInterface<ParamType>* base, + const ParamGenerator<T1>& g1, + const typename ParamGenerator<T1>::iterator& current1, + const ParamGenerator<T2>& g2, + const typename ParamGenerator<T2>::iterator& current2, + const ParamGenerator<T3>& g3, + const typename ParamGenerator<T3>::iterator& current3, + const ParamGenerator<T4>& g4, + const typename ParamGenerator<T4>::iterator& current4, + const ParamGenerator<T5>& g5, + const typename ParamGenerator<T5>::iterator& current5) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current5_; + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface<ParamType>* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." 
<< std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType<const Iterator>(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface<ParamType>* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator<T1>::iterator begin1_; + const typename ParamGenerator<T1>::iterator end1_; + typename ParamGenerator<T1>::iterator current1_; + const typename ParamGenerator<T2>::iterator begin2_; + const typename ParamGenerator<T2>::iterator end2_; + typename ParamGenerator<T2>::iterator current2_; + const typename ParamGenerator<T3>::iterator begin3_; + const typename ParamGenerator<T3>::iterator end3_; + typename ParamGenerator<T3>::iterator current3_; + const typename ParamGenerator<T4>::iterator begin4_; + const typename ParamGenerator<T4>::iterator end4_; + typename ParamGenerator<T4>::iterator current4_; + const typename ParamGenerator<T5>::iterator begin5_; + const typename ParamGenerator<T5>::iterator end5_; + typename ParamGenerator<T5>::iterator current5_; + ParamType current_value_; + }; // class CartesianProductGenerator5::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator5& other); + + const ParamGenerator<T1> g1_; + const ParamGenerator<T2> g2_; + const ParamGenerator<T3> g3_; + const ParamGenerator<T4> g4_; + const ParamGenerator<T5> g5_; +}; // class CartesianProductGenerator5 + + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6> +class CartesianProductGenerator6 + : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, + T6> > { + public: + typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> ParamType; + + CartesianProductGenerator6(const ParamGenerator<T1>& g1, + const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3, + const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5, + const ParamGenerator<T6>& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + virtual ~CartesianProductGenerator6() {} + + virtual ParamIteratorInterface<ParamType>* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); + 
} + virtual ParamIteratorInterface<ParamType>* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); + } + + private: + class Iterator : public ParamIteratorInterface<ParamType> { + public: + Iterator(const ParamGeneratorInterface<ParamType>* base, + const ParamGenerator<T1>& g1, + const typename ParamGenerator<T1>::iterator& current1, + const ParamGenerator<T2>& g2, + const typename ParamGenerator<T2>::iterator& current2, + const ParamGenerator<T3>& g3, + const typename ParamGenerator<T3>::iterator& current3, + const ParamGenerator<T4>& g4, + const typename ParamGenerator<T4>::iterator& current4, + const ParamGenerator<T5>& g5, + const typename ParamGenerator<T5>::iterator& current5, + const ParamGenerator<T6>& g6, + const typename ParamGenerator<T6>::iterator& current6) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current6_; + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface<ParamType>* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType<const Iterator>(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface<ParamType>* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator<T1>::iterator begin1_; + const typename ParamGenerator<T1>::iterator end1_; + typename ParamGenerator<T1>::iterator current1_; + const typename ParamGenerator<T2>::iterator begin2_; + const typename ParamGenerator<T2>::iterator end2_; + typename ParamGenerator<T2>::iterator current2_; + const typename ParamGenerator<T3>::iterator begin3_; + const typename ParamGenerator<T3>::iterator end3_; + typename ParamGenerator<T3>::iterator current3_; + const typename ParamGenerator<T4>::iterator begin4_; + const typename ParamGenerator<T4>::iterator end4_; + typename ParamGenerator<T4>::iterator current4_; + const typename ParamGenerator<T5>::iterator begin5_; + const typename ParamGenerator<T5>::iterator end5_; + typename ParamGenerator<T5>::iterator current5_; + const typename ParamGenerator<T6>::iterator begin6_; + const typename ParamGenerator<T6>::iterator end6_; + typename ParamGenerator<T6>::iterator current6_; + ParamType current_value_; + }; // class CartesianProductGenerator6::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator6& other); + + const ParamGenerator<T1> g1_; + const ParamGenerator<T2> g2_; + const ParamGenerator<T3> g3_; + const ParamGenerator<T4> g4_; + const ParamGenerator<T5> g5_; + const ParamGenerator<T6> g6_; +}; // class CartesianProductGenerator6 + + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7> +class CartesianProductGenerator7 + : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, + T7> > { + public: + typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> ParamType; + + CartesianProductGenerator7(const ParamGenerator<T1>& g1, + const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3, + const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5, + const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + virtual ~CartesianProductGenerator7() {} + + virtual ParamIteratorInterface<ParamType>* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin()); + } + virtual ParamIteratorInterface<ParamType>* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); + } + + private: + class Iterator : public ParamIteratorInterface<ParamType> { + public: + Iterator(const ParamGeneratorInterface<ParamType>* base, + const ParamGenerator<T1>& g1, + const typename ParamGenerator<T1>::iterator& current1, + const ParamGenerator<T2>& g2, + const typename ParamGenerator<T2>::iterator& current2, + const ParamGenerator<T3>& g3, + const typename ParamGenerator<T3>::iterator& current3, + const ParamGenerator<T4>& g4, + const typename ParamGenerator<T4>::iterator& current4, + const ParamGenerator<T5>& g5, + const typename ParamGenerator<T5>::iterator& current5, + const 
ParamGenerator<T6>& g6, + const typename ParamGenerator<T6>::iterator& current6, + const ParamGenerator<T7>& g7, + const typename ParamGenerator<T7>::iterator& current7) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current7_; + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface<ParamType>* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." 
<< std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType<const Iterator>(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface<ParamType>* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. 
+ // current[i]_ is the actual traversing iterator. + const typename ParamGenerator<T1>::iterator begin1_; + const typename ParamGenerator<T1>::iterator end1_; + typename ParamGenerator<T1>::iterator current1_; + const typename ParamGenerator<T2>::iterator begin2_; + const typename ParamGenerator<T2>::iterator end2_; + typename ParamGenerator<T2>::iterator current2_; + const typename ParamGenerator<T3>::iterator begin3_; + const typename ParamGenerator<T3>::iterator end3_; + typename ParamGenerator<T3>::iterator current3_; + const typename ParamGenerator<T4>::iterator begin4_; + const typename ParamGenerator<T4>::iterator end4_; + typename ParamGenerator<T4>::iterator current4_; + const typename ParamGenerator<T5>::iterator begin5_; + const typename ParamGenerator<T5>::iterator end5_; + typename ParamGenerator<T5>::iterator current5_; + const typename ParamGenerator<T6>::iterator begin6_; + const typename ParamGenerator<T6>::iterator end6_; + typename ParamGenerator<T6>::iterator current6_; + const typename ParamGenerator<T7>::iterator begin7_; + const typename ParamGenerator<T7>::iterator end7_; + typename ParamGenerator<T7>::iterator current7_; + ParamType current_value_; + }; // class CartesianProductGenerator7::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator7& other); + + const ParamGenerator<T1> g1_; + const ParamGenerator<T2> g2_; + const ParamGenerator<T3> g3_; + const ParamGenerator<T4> g4_; + const ParamGenerator<T5> g5_; + const ParamGenerator<T6> g6_; + const ParamGenerator<T7> g7_; +}; // class CartesianProductGenerator7 + + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8> +class CartesianProductGenerator8 + : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, + T7, T8> > { + public: + typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> ParamType; + + CartesianProductGenerator8(const ParamGenerator<T1>& g1, + const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3, + const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5, + const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7, + const ParamGenerator<T8>& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + virtual ~CartesianProductGenerator8() {} + + virtual ParamIteratorInterface<ParamType>* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin()); + } + virtual ParamIteratorInterface<ParamType>* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end()); + } + + private: + class Iterator : public ParamIteratorInterface<ParamType> { + public: + Iterator(const ParamGeneratorInterface<ParamType>* base, + const ParamGenerator<T1>& g1, + const typename ParamGenerator<T1>::iterator& current1, + const ParamGenerator<T2>& g2, + const typename ParamGenerator<T2>::iterator& current2, + const ParamGenerator<T3>& g3, + const typename ParamGenerator<T3>::iterator& current3, + const ParamGenerator<T4>& g4, + const typename 
ParamGenerator<T4>::iterator& current4, + const ParamGenerator<T5>& g5, + const typename ParamGenerator<T5>::iterator& current5, + const ParamGenerator<T6>& g6, + const typename ParamGenerator<T6>::iterator& current6, + const ParamGenerator<T7>& g7, + const typename ParamGenerator<T7>::iterator& current7, + const ParamGenerator<T8>& g8, + const typename ParamGenerator<T8>::iterator& current8) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current8_; + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface<ParamType>* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType<const Iterator>(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface<ParamType>* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator<T1>::iterator begin1_; + const typename ParamGenerator<T1>::iterator end1_; + typename ParamGenerator<T1>::iterator current1_; + const typename ParamGenerator<T2>::iterator begin2_; + const typename ParamGenerator<T2>::iterator end2_; + typename ParamGenerator<T2>::iterator current2_; + const typename ParamGenerator<T3>::iterator begin3_; + const typename ParamGenerator<T3>::iterator end3_; + typename ParamGenerator<T3>::iterator current3_; + const typename ParamGenerator<T4>::iterator begin4_; + const typename ParamGenerator<T4>::iterator end4_; + typename ParamGenerator<T4>::iterator current4_; + const typename ParamGenerator<T5>::iterator begin5_; + const typename ParamGenerator<T5>::iterator end5_; + typename ParamGenerator<T5>::iterator current5_; + const typename ParamGenerator<T6>::iterator begin6_; + const typename ParamGenerator<T6>::iterator end6_; + typename ParamGenerator<T6>::iterator current6_; + const typename ParamGenerator<T7>::iterator begin7_; + const typename ParamGenerator<T7>::iterator end7_; + typename ParamGenerator<T7>::iterator current7_; + const typename ParamGenerator<T8>::iterator begin8_; + const typename ParamGenerator<T8>::iterator end8_; + typename ParamGenerator<T8>::iterator current8_; + ParamType current_value_; + }; // class CartesianProductGenerator8::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator8& other); + + const ParamGenerator<T1> g1_; + const ParamGenerator<T2> g2_; + const ParamGenerator<T3> g3_; + const ParamGenerator<T4> g4_; + const ParamGenerator<T5> g5_; + const ParamGenerator<T6> g6_; + const ParamGenerator<T7> g7_; + const ParamGenerator<T8> g8_; +}; // class CartesianProductGenerator8 + + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9> +class CartesianProductGenerator9 + : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, + T7, T8, T9> > { + public: + typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> ParamType; + + CartesianProductGenerator9(const ParamGenerator<T1>& g1, + const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3, + const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5, + const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7, + const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + virtual ~CartesianProductGenerator9() {} + + virtual ParamIteratorInterface<ParamType>* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); + } + virtual ParamIteratorInterface<ParamType>* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end()); + } + + private: + class Iterator : public ParamIteratorInterface<ParamType> { + public: + Iterator(const ParamGeneratorInterface<ParamType>* base, + const ParamGenerator<T1>& g1, + const typename ParamGenerator<T1>::iterator& current1, + const ParamGenerator<T2>& g2, + const typename ParamGenerator<T2>::iterator& current2, + const ParamGenerator<T3>& 
g3, + const typename ParamGenerator<T3>::iterator& current3, + const ParamGenerator<T4>& g4, + const typename ParamGenerator<T4>::iterator& current4, + const ParamGenerator<T5>& g5, + const typename ParamGenerator<T5>::iterator& current5, + const ParamGenerator<T6>& g6, + const typename ParamGenerator<T6>::iterator& current6, + const ParamGenerator<T7>& g7, + const typename ParamGenerator<T7>::iterator& current7, + const ParamGenerator<T8>& g8, + const typename ParamGenerator<T8>::iterator& current8, + const ParamGenerator<T9>& g9, + const typename ParamGenerator<T9>::iterator& current9) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current9_; + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface<ParamType>* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType<const Iterator>(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_; + } + + // No implementation - assignment is unsupported. 
+ void operator=(const Iterator& other); + + const ParamGeneratorInterface<ParamType>* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator<T1>::iterator begin1_; + const typename ParamGenerator<T1>::iterator end1_; + typename ParamGenerator<T1>::iterator current1_; + const typename ParamGenerator<T2>::iterator begin2_; + const typename ParamGenerator<T2>::iterator end2_; + typename ParamGenerator<T2>::iterator current2_; + const typename ParamGenerator<T3>::iterator begin3_; + const typename ParamGenerator<T3>::iterator end3_; + typename ParamGenerator<T3>::iterator current3_; + const typename ParamGenerator<T4>::iterator begin4_; + const typename ParamGenerator<T4>::iterator end4_; + typename ParamGenerator<T4>::iterator current4_; + const typename ParamGenerator<T5>::iterator begin5_; + const typename ParamGenerator<T5>::iterator end5_; + typename ParamGenerator<T5>::iterator current5_; + const typename ParamGenerator<T6>::iterator begin6_; + const typename ParamGenerator<T6>::iterator end6_; + typename ParamGenerator<T6>::iterator current6_; + const typename ParamGenerator<T7>::iterator begin7_; + const typename ParamGenerator<T7>::iterator end7_; + typename ParamGenerator<T7>::iterator current7_; + const typename ParamGenerator<T8>::iterator begin8_; + const typename ParamGenerator<T8>::iterator end8_; + typename ParamGenerator<T8>::iterator current8_; + const typename ParamGenerator<T9>::iterator begin9_; + const typename ParamGenerator<T9>::iterator end9_; + typename ParamGenerator<T9>::iterator current9_; + ParamType current_value_; + }; // class CartesianProductGenerator9::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator9& other); + + const ParamGenerator<T1> g1_; + const ParamGenerator<T2> g2_; + const ParamGenerator<T3> g3_; + const ParamGenerator<T4> g4_; + const ParamGenerator<T5> g5_; + const ParamGenerator<T6> g6_; + const ParamGenerator<T7> g7_; + const ParamGenerator<T8> g8_; + const ParamGenerator<T9> g9_; +}; // class CartesianProductGenerator9 + + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10> +class CartesianProductGenerator10 + : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, + T7, T8, T9, T10> > { + public: + typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> ParamType; + + CartesianProductGenerator10(const ParamGenerator<T1>& g1, + const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3, + const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5, + const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7, + const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9, + const ParamGenerator<T10>& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + virtual ~CartesianProductGenerator10() {} + + virtual ParamIteratorInterface<ParamType>* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); + } + virtual ParamIteratorInterface<ParamType>* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end(), g10_, g10_.end()); + } + + private: + class Iterator : public ParamIteratorInterface<ParamType> { + public: + Iterator(const ParamGeneratorInterface<ParamType>* base, + const ParamGenerator<T1>& g1, + const typename 
ParamGenerator<T1>::iterator& current1, + const ParamGenerator<T2>& g2, + const typename ParamGenerator<T2>::iterator& current2, + const ParamGenerator<T3>& g3, + const typename ParamGenerator<T3>::iterator& current3, + const ParamGenerator<T4>& g4, + const typename ParamGenerator<T4>::iterator& current4, + const ParamGenerator<T5>& g5, + const typename ParamGenerator<T5>::iterator& current5, + const ParamGenerator<T6>& g6, + const typename ParamGenerator<T6>::iterator& current6, + const ParamGenerator<T7>& g7, + const typename ParamGenerator<T7>::iterator& current7, + const ParamGenerator<T8>& g8, + const typename ParamGenerator<T8>::iterator& current8, + const ParamGenerator<T9>& g9, + const typename ParamGenerator<T9>::iterator& current9, + const ParamGenerator<T10>& g10, + const typename ParamGenerator<T10>::iterator& current10) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9), + begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current10_; + if (current10_ == end10_) { + current10_ = begin10_; + ++current9_; + } + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface<ParamType>* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType<const Iterator>(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_ && + current10_ == typed_other->current10_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_), + begin10_(other.begin10_), + end10_(other.end10_), + current10_(other.current10_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_, *current10_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. 
+ return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_ || + current10_ == end10_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface<ParamType>* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator<T1>::iterator begin1_; + const typename ParamGenerator<T1>::iterator end1_; + typename ParamGenerator<T1>::iterator current1_; + const typename ParamGenerator<T2>::iterator begin2_; + const typename ParamGenerator<T2>::iterator end2_; + typename ParamGenerator<T2>::iterator current2_; + const typename ParamGenerator<T3>::iterator begin3_; + const typename ParamGenerator<T3>::iterator end3_; + typename ParamGenerator<T3>::iterator current3_; + const typename ParamGenerator<T4>::iterator begin4_; + const typename ParamGenerator<T4>::iterator end4_; + typename ParamGenerator<T4>::iterator current4_; + const typename ParamGenerator<T5>::iterator begin5_; + const typename ParamGenerator<T5>::iterator end5_; + typename ParamGenerator<T5>::iterator current5_; + const typename ParamGenerator<T6>::iterator begin6_; + const typename ParamGenerator<T6>::iterator end6_; + typename ParamGenerator<T6>::iterator current6_; + const typename ParamGenerator<T7>::iterator begin7_; + const typename ParamGenerator<T7>::iterator end7_; + typename ParamGenerator<T7>::iterator current7_; + const typename ParamGenerator<T8>::iterator begin8_; + const typename ParamGenerator<T8>::iterator end8_; + typename ParamGenerator<T8>::iterator current8_; + const typename ParamGenerator<T9>::iterator begin9_; + const typename ParamGenerator<T9>::iterator end9_; + typename ParamGenerator<T9>::iterator current9_; + const typename 
ParamGenerator<T10>::iterator begin10_; + const typename ParamGenerator<T10>::iterator end10_; + typename ParamGenerator<T10>::iterator current10_; + ParamType current_value_; + }; // class CartesianProductGenerator10::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator10& other); + + const ParamGenerator<T1> g1_; + const ParamGenerator<T2> g2_; + const ParamGenerator<T3> g3_; + const ParamGenerator<T4> g4_; + const ParamGenerator<T5> g5_; + const ParamGenerator<T6> g6_; + const ParamGenerator<T7> g7_; + const ParamGenerator<T8> g8_; + const ParamGenerator<T9> g9_; + const ParamGenerator<T10> g10_; +}; // class CartesianProductGenerator10 + + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Helper classes providing Combine() with polymorphic features. They allow +// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is +// convertible to U. +// +template <class Generator1, class Generator2> +class CartesianProductHolder2 { + public: +CartesianProductHolder2(const Generator1& g1, const Generator2& g2) + : g1_(g1), g2_(g2) {} + template <typename T1, typename T2> + operator ParamGenerator< ::std::tr1::tuple<T1, T2> >() const { + return ParamGenerator< ::std::tr1::tuple<T1, T2> >( + new CartesianProductGenerator2<T1, T2>( + static_cast<ParamGenerator<T1> >(g1_), + static_cast<ParamGenerator<T2> >(g2_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder2& other); + + const Generator1 g1_; + const Generator2 g2_; +}; // class CartesianProductHolder2 + +template <class Generator1, class Generator2, class Generator3> +class CartesianProductHolder3 { + public: +CartesianProductHolder3(const Generator1& g1, const Generator2& g2, + const Generator3& g3) + : g1_(g1), g2_(g2), g3_(g3) {} + template <typename T1, typename T2, typename T3> + operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >() const { + return ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >( + new CartesianProductGenerator3<T1, T2, T3>( + static_cast<ParamGenerator<T1> >(g1_), + static_cast<ParamGenerator<T2> >(g2_), + static_cast<ParamGenerator<T3> >(g3_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder3& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; +}; // class CartesianProductHolder3 + +template <class Generator1, class Generator2, class Generator3, + class Generator4> +class CartesianProductHolder4 { + public: +CartesianProductHolder4(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} + template <typename T1, typename T2, typename T3, typename T4> + operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >() const { + return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >( + new CartesianProductGenerator4<T1, T2, T3, T4>( + static_cast<ParamGenerator<T1> >(g1_), + static_cast<ParamGenerator<T2> >(g2_), + static_cast<ParamGenerator<T3> >(g3_), + static_cast<ParamGenerator<T4> >(g4_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder4& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; +}; // class CartesianProductHolder4 + +template <class Generator1, class Generator2, class Generator3, + class Generator4, class Generator5> +class CartesianProductHolder5 { + public: +CartesianProductHolder5(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + template <typename T1, typename T2, typename T3, typename T4, typename T5> + operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >() const { + return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >( + new CartesianProductGenerator5<T1, T2, T3, T4, T5>( + static_cast<ParamGenerator<T1> >(g1_), + static_cast<ParamGenerator<T2> >(g2_), + static_cast<ParamGenerator<T3> >(g3_), + static_cast<ParamGenerator<T4> >(g4_), + static_cast<ParamGenerator<T5> >(g5_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder5& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; +}; // class CartesianProductHolder5 + +template <class Generator1, class Generator2, class Generator3, + class Generator4, class Generator5, class Generator6> +class CartesianProductHolder6 { + public: +CartesianProductHolder6(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6> + operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >() const { + return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >( + new CartesianProductGenerator6<T1, T2, T3, T4, T5, T6>( + static_cast<ParamGenerator<T1> >(g1_), + static_cast<ParamGenerator<T2> >(g2_), + static_cast<ParamGenerator<T3> >(g3_), + static_cast<ParamGenerator<T4> >(g4_), + static_cast<ParamGenerator<T5> >(g5_), + static_cast<ParamGenerator<T6> >(g6_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder6& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; +}; // class CartesianProductHolder6 + +template <class Generator1, class Generator2, class Generator3, + class Generator4, class Generator5, class Generator6, class Generator7> +class CartesianProductHolder7 { + public: +CartesianProductHolder7(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7> + operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, + T7> >() const { + return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> >( + new CartesianProductGenerator7<T1, T2, T3, T4, T5, T6, T7>( + static_cast<ParamGenerator<T1> >(g1_), + static_cast<ParamGenerator<T2> >(g2_), + static_cast<ParamGenerator<T3> >(g3_), + static_cast<ParamGenerator<T4> >(g4_), + static_cast<ParamGenerator<T5> >(g5_), + static_cast<ParamGenerator<T6> >(g6_), + static_cast<ParamGenerator<T7> >(g7_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder7& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; +}; // class CartesianProductHolder7 + +template <class Generator1, class Generator2, class Generator3, + class Generator4, class Generator5, class Generator6, class Generator7, + class Generator8> +class CartesianProductHolder8 { + public: +CartesianProductHolder8(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8> + operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, + T8> >() const { + return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> >( + new CartesianProductGenerator8<T1, T2, T3, T4, T5, T6, T7, T8>( + static_cast<ParamGenerator<T1> >(g1_), + static_cast<ParamGenerator<T2> >(g2_), + static_cast<ParamGenerator<T3> >(g3_), + static_cast<ParamGenerator<T4> >(g4_), + static_cast<ParamGenerator<T5> >(g5_), + static_cast<ParamGenerator<T6> >(g6_), + static_cast<ParamGenerator<T7> >(g7_), + static_cast<ParamGenerator<T8> >(g8_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder8& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; +}; // class CartesianProductHolder8 + +template <class Generator1, class Generator2, class Generator3, + class Generator4, class Generator5, class Generator6, class Generator7, + class Generator8, class Generator9> +class CartesianProductHolder9 { + public: +CartesianProductHolder9(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9> + operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, + T9> >() const { + return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, + T9> >( + new CartesianProductGenerator9<T1, T2, T3, T4, T5, T6, T7, T8, T9>( + static_cast<ParamGenerator<T1> >(g1_), + static_cast<ParamGenerator<T2> >(g2_), + static_cast<ParamGenerator<T3> >(g3_), + static_cast<ParamGenerator<T4> >(g4_), + static_cast<ParamGenerator<T5> >(g5_), + static_cast<ParamGenerator<T6> >(g6_), + static_cast<ParamGenerator<T7> >(g7_), + static_cast<ParamGenerator<T8> >(g8_), + static_cast<ParamGenerator<T9> >(g9_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder9& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; + const Generator9 g9_; +}; // class CartesianProductHolder9 + +template <class Generator1, class Generator2, class Generator3, + class Generator4, class Generator5, class Generator6, class Generator7, + class Generator8, class Generator9, class Generator10> +class CartesianProductHolder10 { + public: +CartesianProductHolder10(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9, const Generator10& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10> + operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, + T9, T10> >() const { + return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, + T9, T10> >( + new CartesianProductGenerator10<T1, T2, T3, T4, T5, T6, T7, T8, T9, + T10>( + static_cast<ParamGenerator<T1> >(g1_), + static_cast<ParamGenerator<T2> >(g2_), + static_cast<ParamGenerator<T3> >(g3_), + static_cast<ParamGenerator<T4> >(g4_), + static_cast<ParamGenerator<T5> >(g5_), + static_cast<ParamGenerator<T6> >(g6_), + static_cast<ParamGenerator<T7> >(g7_), + static_cast<ParamGenerator<T8> >(g8_), + static_cast<ParamGenerator<T9> >(g9_), + static_cast<ParamGenerator<T10> >(g10_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder10& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; + const Generator9 g9_; + const Generator10 g10_; +}; // class CartesianProductHolder10 + +# endif // GTEST_HAS_COMBINE + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. When a parameterized test case is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test case FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam<int> { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. +// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. 
That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. +// +template <typename T, typename IncrementT> +internal::ParamGenerator<T> Range(T start, T end, IncrementT step) { + return internal::ParamGenerator<T>( + new internal::RangeGenerator<T, IncrementT>(start, end, step)); +} + +template <typename T> +internal::ParamGenerator<T> Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. +// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). 
+// +// Examples: +// +// This instantiates tests from test case StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// +// This instantiates tests from test case StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_CASE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list<char> GetParameterChars() { +// ::std::list<char> list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list<char> l = GetParameterChars(); +// INSTANTIATE_TEST_CASE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template <typename ForwardIterator> +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename ::testing::internal::IteratorTraits<ForwardIterator> + ::value_type ParamType; + return internal::ParamGenerator<ParamType>( + new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end)); +} + +template <typename T, size_t N> +internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template <class Container> +internal::ParamGenerator<typename Container::value_type> ValuesIn( + const Container& container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. +// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. 
+// +// For example, this instantiates tests from test case BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); +// +// This instantiates tests from test case BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// Currently, Values() supports from 1 to 50 parameters. +// +template <typename T1> +internal::ValueArray1<T1> Values(T1 v1) { + return internal::ValueArray1<T1>(v1); +} + +template <typename T1, typename T2> +internal::ValueArray2<T1, T2> Values(T1 v1, T2 v2) { + return internal::ValueArray2<T1, T2>(v1, v2); +} + +template <typename T1, typename T2, typename T3> +internal::ValueArray3<T1, T2, T3> Values(T1 v1, T2 v2, T3 v3) { + return internal::ValueArray3<T1, T2, T3>(v1, v2, v3); +} + +template <typename T1, typename T2, typename T3, typename T4> +internal::ValueArray4<T1, T2, T3, T4> Values(T1 v1, T2 v2, T3 v3, T4 v4) { + return internal::ValueArray4<T1, T2, T3, T4>(v1, v2, v3, v4); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5> +internal::ValueArray5<T1, T2, T3, T4, T5> Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5) { + return internal::ValueArray5<T1, T2, T3, T4, T5>(v1, v2, v3, v4, v5); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6> +internal::ValueArray6<T1, T2, T3, T4, T5, T6> Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6) { + return internal::ValueArray6<T1, T2, T3, T4, T5, T6>(v1, v2, v3, v4, v5, v6); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7> +internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7> Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7) { + return internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7>(v1, v2, v3, v4, v5, + v6, v7); +} + +template <typename T1, 
typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8> +internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8> Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { + return internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8>(v1, v2, v3, v4, + v5, v6, v7, v8); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9> +internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9> Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { + return internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(v1, v2, v3, + v4, v5, v6, v7, v8, v9); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10> +internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { + return internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11> +internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, + T11> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11) { + return internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, + T11>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12> +internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12) { + return internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + 
T12>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13> +internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, + T13> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13) { + return internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14> +internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { + return internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15> +internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { + return internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + 
typename T16> +internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16) { + return internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17> +internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17) { + return internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18> +internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18) { + return internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18>(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + 
typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19> +internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { + return internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19>(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20> +internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20> Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { + return internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20>(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21> +internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21> Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 
v18, T19 v19, T20 v20, T21 v21) { + return internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21>(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22> +internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22> Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22) { + return internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22>(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23> +internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23) { + return internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23>(v1, v2, v3, + v4, v5, v6, v7, v8, v9, 
v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24> +internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24) { + return internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24>(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25> +internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { + return internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25>(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, 
v19, v20, v21, v22, v23, v24, v25); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26> +internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) { + return internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27> +internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, + T27> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) { + return internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, 
T22, T23, T24, T25, + T26, T27>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28> +internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, + T28> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) { + return internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29> +internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 
v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) { + return internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30> +internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { + return internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename 
T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31> +internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { + return internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32> +internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32) { + return internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32>(v1, 
v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33> +internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33) { + return internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33>(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34> +internal::ValueArray34<T1, T2, T3, 
T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34) { + return internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34>(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35> +internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35> Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { + return internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, 
T28, T29, T30, T31, T32, T33, T34, T35>(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36> +internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36> Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { + return internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36>(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, 
typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37> +internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37> Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37) { + return internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37>(v1, v2, v3, + v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36, v37); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38> +internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 
v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38) { + return internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38>(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, + v33, v34, v35, v36, v37, v38); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39> +internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38, T39 v39) { + return internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39>(v1, + v2, 
v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40> +internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, + T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, + T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { + return internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, + v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + 
typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41> +internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, + T41> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { + return internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, + v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, 
typename T42> +internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, + T42> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42) { + return internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41, T42>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, + v42); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43> +internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, + T43> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 
v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43) { + return internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41, T42, T43>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, + v41, v42, v43); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44> +internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, 
T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44) { + return internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41, T42, T43, T44>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, + v40, v41, v42, v43, v44); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45> +internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, + T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { + return internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, 
T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41, T42, T43, T44, T45>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, + v39, v40, v41, v42, v43, v44, v45); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46> +internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45, T46> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { + return internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41, T42, T43, T44, T45, T46>(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, 
v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47> +internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45, T46, T47> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { + return internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41, T42, T43, T44, T45, T46, T47>(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); +} + 
+template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48> +internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45, T46, T47, T48> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, + T48 v48) { + return internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41, T42, T43, T44, T45, T46, T47, T48>(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, + v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename 
T7, typename T8, typename T9, typename T10, + typename T11, typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48, typename T49> +internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45, T46, T47, T48, T49> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, + T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, + T47 v47, T48 v48, T49 v49) { + return internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41, T42, T43, T44, T45, T46, T47, T48, T49>(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, + v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); +} + +template <typename T1, typename T2, typename T3, typename T4, typename T5, + typename T6, typename T7, typename T8, typename T9, typename T10, + typename T11, 
typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, typename T20, + typename T21, typename T22, typename T23, typename T24, typename T25, + typename T26, typename T27, typename T28, typename T29, typename T30, + typename T31, typename T32, typename T33, typename T34, typename T35, + typename T36, typename T37, typename T38, typename T39, typename T40, + typename T41, typename T42, typename T43, typename T44, typename T45, + typename T46, typename T47, typename T48, typename T49, typename T50> +internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, + T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, + T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, + T44, T45, T46, T47, T48, T49, T50> Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, + T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, + T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { + return internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, + T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, + T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, + T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50); +} + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. 
+// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test case FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam<bool> { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator<bool> Bool() { + return Values(false, true); +} + +# if GTEST_HAS_COMBINE +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. +// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Combine can have up to 10 arguments. This number is currently limited +// by the maximum number of elements in the tuple implementation used by Google +// Test. 
+// +// Example: +// +// This will instantiate tests in test case AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam<tuple<const char*, Color> > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam<tuple<bool, bool> > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. +// } +// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +template <typename Generator1, typename Generator2> +internal::CartesianProductHolder2<Generator1, Generator2> Combine( + const Generator1& g1, const Generator2& g2) { + return internal::CartesianProductHolder2<Generator1, Generator2>( + g1, g2); +} + +template <typename Generator1, typename Generator2, typename Generator3> +internal::CartesianProductHolder3<Generator1, Generator2, Generator3> Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3) { + return internal::CartesianProductHolder3<Generator1, Generator2, Generator3>( + g1, g2, g3); +} + +template <typename Generator1, typename Generator2, typename Generator3, + typename Generator4> +internal::CartesianProductHolder4<Generator1, Generator2, Generator3, + Generator4> Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4) { + return 
internal::CartesianProductHolder4<Generator1, Generator2, Generator3, + Generator4>( + g1, g2, g3, g4); +} + +template <typename Generator1, typename Generator2, typename Generator3, + typename Generator4, typename Generator5> +internal::CartesianProductHolder5<Generator1, Generator2, Generator3, + Generator4, Generator5> Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5) { + return internal::CartesianProductHolder5<Generator1, Generator2, Generator3, + Generator4, Generator5>( + g1, g2, g3, g4, g5); +} + +template <typename Generator1, typename Generator2, typename Generator3, + typename Generator4, typename Generator5, typename Generator6> +internal::CartesianProductHolder6<Generator1, Generator2, Generator3, + Generator4, Generator5, Generator6> Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6) { + return internal::CartesianProductHolder6<Generator1, Generator2, Generator3, + Generator4, Generator5, Generator6>( + g1, g2, g3, g4, g5, g6); +} + +template <typename Generator1, typename Generator2, typename Generator3, + typename Generator4, typename Generator5, typename Generator6, + typename Generator7> +internal::CartesianProductHolder7<Generator1, Generator2, Generator3, + Generator4, Generator5, Generator6, Generator7> Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7) { + return internal::CartesianProductHolder7<Generator1, Generator2, Generator3, + Generator4, Generator5, Generator6, Generator7>( + g1, g2, g3, g4, g5, g6, g7); +} + +template <typename Generator1, typename Generator2, typename Generator3, + typename Generator4, typename Generator5, typename Generator6, + typename Generator7, typename Generator8> +internal::CartesianProductHolder8<Generator1, Generator2, 
Generator3, + Generator4, Generator5, Generator6, Generator7, Generator8> Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8) { + return internal::CartesianProductHolder8<Generator1, Generator2, Generator3, + Generator4, Generator5, Generator6, Generator7, Generator8>( + g1, g2, g3, g4, g5, g6, g7, g8); +} + +template <typename Generator1, typename Generator2, typename Generator3, + typename Generator4, typename Generator5, typename Generator6, + typename Generator7, typename Generator8, typename Generator9> +internal::CartesianProductHolder9<Generator1, Generator2, Generator3, + Generator4, Generator5, Generator6, Generator7, Generator8, + Generator9> Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8, const Generator9& g9) { + return internal::CartesianProductHolder9<Generator1, Generator2, Generator3, + Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>( + g1, g2, g3, g4, g5, g6, g7, g8, g9); +} + +template <typename Generator1, typename Generator2, typename Generator3, + typename Generator4, typename Generator5, typename Generator6, + typename Generator7, typename Generator8, typename Generator9, + typename Generator10> +internal::CartesianProductHolder10<Generator1, Generator2, Generator3, + Generator4, Generator5, Generator6, Generator7, Generator8, Generator9, + Generator10> Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8, const Generator9& g9, + const Generator10& g10) { + return internal::CartesianProductHolder10<Generator1, Generator2, Generator3, + Generator4, Generator5, Generator6, Generator7, Generator8, Generator9, + 
Generator10>( + g1, g2, g3, g4, g5, g6, g7, g8, g9, g10); +} +# endif // GTEST_HAS_COMBINE + + + +# define TEST_P(test_case_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + : public test_case_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ + virtual void TestBody(); \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder<test_case_name>(\ + #test_case_name, __FILE__, __LINE__)->AddTestPattern(\ + #test_case_name, \ + #test_name, \ + new ::testing::internal::TestMetaFactory< \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \ + return 0; \ + } \ + static int gtest_registering_dummy_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_case_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \ + ::testing::internal::ParamGenerator<test_case_name::ParamType> \ + gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ + int gtest_##prefix##test_case_name##_dummy_ = \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder<test_case_name>(\ + #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\ + #prefix, \ + &gtest_##prefix##test_case_name##_EvalGenerator_, \ + __FILE__, __LINE__) + +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +// Copyright 2006, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// Google C++ Testing Framework definitions useful in production code. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. 
For example: +// +// class MyClass { +// private: +// void MyMethod(); +// FRIEND_TEST(MyClassTest, MyMethod); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, MyMethod) { +// // Can call MyClass::MyMethod() here. +// } + +#define FRIEND_TEST(test_case_name, test_name)\ +friend class test_case_name##_##test_name##_Test + +#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Author: mheule@google.com (Markus Heule) +// + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ + +#include <iosfwd> +#include <vector> + +namespace testing { + +// A copyable object representing the result of a test part (i.e. an +// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). +// +// Don't inherit from TestPartResult as its destructor is not virtual. +class GTEST_API_ TestPartResult { + public: + // The possible outcomes of a test part (i.e. an assertion or an + // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). + enum Type { + kSuccess, // Succeeded. + kNonFatalFailure, // Failed but the test can continue. + kFatalFailure // Failed and the test should be terminated. + }; + + // C'tor. TestPartResult does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestPartResult object. + TestPartResult(Type a_type, + const char* a_file_name, + int a_line_number, + const char* a_message) + : type_(a_type), + file_name_(a_file_name == NULL ? "" : a_file_name), + line_number_(a_line_number), + summary_(ExtractSummary(a_message)), + message_(a_message) { + } + + // Gets the outcome of the test part. + Type type() const { return type_; } + + // Gets the name of the source file where the test part took place, or + // NULL if it's unknown. + const char* file_name() const { + return file_name_.empty() ? NULL : file_name_.c_str(); + } + + // Gets the line in the source file where the test part took place, + // or -1 if it's unknown. + int line_number() const { return line_number_; } + + // Gets the summary of the failure message. + const char* summary() const { return summary_.c_str(); } + + // Gets the message associated with the test part. + const char* message() const { return message_.c_str(); } + + // Returns true iff the test part passed. + bool passed() const { return type_ == kSuccess; } + + // Returns true iff the test part failed. 
+ bool failed() const { return type_ != kSuccess; } + + // Returns true iff the test part non-fatally failed. + bool nonfatally_failed() const { return type_ == kNonFatalFailure; } + + // Returns true iff the test part fatally failed. + bool fatally_failed() const { return type_ == kFatalFailure; } + + private: + Type type_; + + // Gets the summary of the failure message by omitting the stack + // trace in it. + static std::string ExtractSummary(const char* message); + + // The name of the source file where the test part took place, or + // "" if the source file is unknown. + std::string file_name_; + // The line in the source file where the test part took place, or -1 + // if the line number is unknown. + int line_number_; + std::string summary_; // The test failure summary. + std::string message_; // The test failure message. +}; + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result); + +// An array of TestPartResult objects. +// +// Don't inherit from TestPartResultArray as its destructor is not +// virtual. +class GTEST_API_ TestPartResultArray { + public: + TestPartResultArray() {} + + // Appends the given TestPartResult to the array. + void Append(const TestPartResult& result); + + // Returns the TestPartResult at the given index (0-based). + const TestPartResult& GetTestPartResult(int index) const; + + // Returns the number of TestPartResult objects in the array. + int size() const; + + private: + std::vector<TestPartResult> array_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); +}; + +// This interface knows how to report a test part result. +class TestPartResultReporterInterface { + public: + virtual ~TestPartResultReporterInterface() {} + + virtual void ReportTestPartResult(const TestPartResult& result) = 0; +}; + +namespace internal { + +// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a +// statement generates new fatal failures. 
To do so it registers itself as the +// current test part result reporter. Besides checking if fatal failures were +// reported, it only delegates the reporting to the former result reporter. +// The original result reporter is restored in the destructor. +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +class GTEST_API_ HasNewFatalFailureHelper + : public TestPartResultReporterInterface { + public: + HasNewFatalFailureHelper(); + virtual ~HasNewFatalFailureHelper(); + virtual void ReportTestPartResult(const TestPartResult& result); + bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + private: + bool has_new_fatal_failure_; + TestPartResultReporterInterface* original_reporter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); +}; + +} // namespace internal + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// This header implements typed tests and type-parameterized tests. + +// Typed (aka type-driven) tests repeat the same test for types in a +// list. You must know which types you want to test with when writing +// typed tests. Here's how you do it: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template <typename T> +class FooTest : public testing::Test { + public: + ... + typedef std::list<T> List; + static T shared_; + T value_; +}; + +// Next, associate a list of types with the test case, which will be +// repeated for each type in the list. The typedef is necessary for +// the macro to parse correctly. +typedef testing::Types<char, int, unsigned int> MyTypes; +TYPED_TEST_CASE(FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// TYPED_TEST_CASE(FooTest, int); + +// Then, use TYPED_TEST() instead of TEST_F() to define as many typed +// tests for this test case as you want. +TYPED_TEST(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + // Since we are inside a derived class template, C++ requires use to + // visit the members of FooTest via 'this'. 
+ TypeParam n = this->value_; + + // To visit static members of the fixture, add the TestFixture:: + // prefix. + n += TestFixture::shared_; + + // To refer to typedefs in the fixture, add the "typename + // TestFixture::" prefix. + typename TestFixture::List values; + values.push_back(n); + ... +} + +TYPED_TEST(FooTest, HasPropertyA) { ... } + +#endif // 0 + +// Type-parameterized tests are abstract test patterns parameterized +// by a type. Compared with typed tests, type-parameterized tests +// allow you to define the test pattern without knowing what the type +// parameters are. The defined pattern can be instantiated with +// different types any number of times, in any number of translation +// units. +// +// If you are designing an interface or concept, you can define a +// suite of type-parameterized tests to verify properties that any +// valid implementation of the interface/concept should have. Then, +// each implementation can easily instantiate the test suite to verify +// that it conforms to the requirements, without having to write +// similar tests repeatedly. Here's an example: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template <typename T> +class FooTest : public testing::Test { + ... +}; + +// Next, declare that you will define a type-parameterized test case +// (the _P suffix is for "parameterized" or "pattern", whichever you +// prefer): +TYPED_TEST_CASE_P(FooTest); + +// Then, use TYPED_TEST_P() to define as many type-parameterized tests +// for this type-parameterized test case as you want. +TYPED_TEST_P(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + TypeParam n = 0; + ... +} + +TYPED_TEST_P(FooTest, HasPropertyA) { ... } + +// Now the tricky part: you need to register all test patterns before +// you can instantiate them. 
The first argument of the macro is the +// test case name; the rest are the names of the tests in this test +// case. +REGISTER_TYPED_TEST_CASE_P(FooTest, + DoesBlah, HasPropertyA); + +// Finally, you are free to instantiate the pattern with the types you +// want. If you put the above code in a header file, you can #include +// it in multiple C++ source files and instantiate it multiple times. +// +// To distinguish different instances of the pattern, the first +// argument to the INSTANTIATE_* macro is a prefix that will be added +// to the actual test case name. Remember to pick unique prefixes for +// different instances. +typedef testing::Types<char, int, unsigned int> MyTypes; +INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); + +#endif // 0 + + +// Implements typed tests. + +#if GTEST_HAS_TYPED_TEST + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the typedef for the type parameters of the +// given test case. +# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ + +// The 'Types' template argument below must have spaces around it +// since some compilers may choke on '>>' when passing a template +// instance (e.g. 
Types<int>) +# define TYPED_TEST_CASE(CaseName, Types) \ + typedef ::testing::internal::TypeList< Types >::type \ + GTEST_TYPE_PARAMS_(CaseName) + +# define TYPED_TEST(CaseName, TestName) \ + template <typename gtest_TypeParam_> \ + class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ + : public CaseName<gtest_TypeParam_> { \ + private: \ + typedef CaseName<gtest_TypeParam_> TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTest< \ + CaseName, \ + ::testing::internal::TemplateSel< \ + GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ + GTEST_TYPE_PARAMS_(CaseName)>::Register(\ + "", #CaseName, #TestName, 0); \ + template <typename gtest_TypeParam_> \ + void GTEST_TEST_CLASS_NAME_(CaseName, TestName)<gtest_TypeParam_>::TestBody() + +#endif // GTEST_HAS_TYPED_TEST + +// Implements type-parameterized tests. + +#if GTEST_HAS_TYPED_TEST_P + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the namespace name that the type-parameterized tests for +// the given type-parameterized test case are defined in. The exact +// name of the namespace is subject to change without notice. +# define GTEST_CASE_NAMESPACE_(TestCaseName) \ + gtest_case_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the variable used to remember the names of +// the defined tests in the given test case. +# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ + gtest_typed_test_case_p_state_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. +// +// Expands to the name of the variable used to remember the names of +// the registered tests in the given test case. 
+# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \ + gtest_registered_test_names_##TestCaseName##_ + +// The variables defined in the type-parameterized test macros are +// static as typically these macros are used in a .h file that can be +// #included in multiple translation units linked together. +# define TYPED_TEST_CASE_P(CaseName) \ + static ::testing::internal::TypedTestCasePState \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) + +# define TYPED_TEST_P(CaseName, TestName) \ + namespace GTEST_CASE_NAMESPACE_(CaseName) { \ + template <typename gtest_TypeParam_> \ + class TestName : public CaseName<gtest_TypeParam_> { \ + private: \ + typedef CaseName<gtest_TypeParam_> TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ + __FILE__, __LINE__, #CaseName, #TestName); \ + } \ + template <typename gtest_TypeParam_> \ + void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody() + +# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \ + namespace GTEST_CASE_NAMESPACE_(CaseName) { \ + typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ + } \ + static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ + __FILE__, __LINE__, #__VA_ARGS__) + +// The 'Types' template argument below must have spaces around it +// since some compilers may choke on '>>' when passing a template +// instance (e.g. 
Types<int>) +# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ + bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestCase<CaseName, \ + GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \ + ::testing::internal::TypeList< Types >::type>::Register(\ + #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) + +#endif // GTEST_HAS_TYPED_TEST_P + +#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// Depending on the platform, different string classes are available. +// On Linux, in addition to ::std::string, Google also makes use of +// class ::string, which has the same interface as ::std::string, but +// has a different implementation. +// +// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that +// ::string is available AND is a distinct type to ::std::string, or +// define it to 0 to indicate otherwise. +// +// If the user's ::std::string and ::string are the same class due to +// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0. +// +// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined +// heuristically. + +namespace testing { + +// Declares the flags. + +// This flag temporary enables the disabled tests. +GTEST_DECLARE_bool_(also_run_disabled_tests); + +// This flag brings the debugger on an assertion failure. +GTEST_DECLARE_bool_(break_on_failure); + +// This flag controls whether Google Test catches all test-thrown exceptions +// and logs them as failures. +GTEST_DECLARE_bool_(catch_exceptions); + +// This flag enables using colors in terminal output. Available values are +// "yes" to enable colors, "no" (disable colors), or "auto" (the default) +// to let Google Test decide. +GTEST_DECLARE_string_(color); + +// This flag sets up the filter to select by name using a glob pattern +// the tests to run. If the filter is not given all tests are executed. +GTEST_DECLARE_string_(filter); + +// This flag causes the Google Test to list tests. 
None of the tests listed +// are actually run if the flag is provided. +GTEST_DECLARE_bool_(list_tests); + +// This flag controls whether Google Test emits a detailed XML report to a file +// in addition to its normal textual output. +GTEST_DECLARE_string_(output); + +// This flags control whether Google Test prints the elapsed time for each +// test. +GTEST_DECLARE_bool_(print_time); + +// This flag specifies the random number seed. +GTEST_DECLARE_int32_(random_seed); + +// This flag sets how many times the tests are repeated. The default value +// is 1. If the value is -1 the tests are repeating forever. +GTEST_DECLARE_int32_(repeat); + +// This flag controls whether Google Test includes Google Test internal +// stack frames in failure stack traces. +GTEST_DECLARE_bool_(show_internal_stack_frames); + +// When this flag is specified, tests' order is randomized on every iteration. +GTEST_DECLARE_bool_(shuffle); + +// This flag specifies the maximum number of stack frames to be +// printed in a failure message. +GTEST_DECLARE_int32_(stack_trace_depth); + +// When this flag is specified, a failed assertion will throw an +// exception if exceptions are enabled, or exit the program with a +// non-zero code otherwise. +GTEST_DECLARE_bool_(throw_on_failure); + +// When this flag is set with a "host:port" string, on supported +// platforms test results are streamed to the specified port on +// the specified host machine. +GTEST_DECLARE_string_(stream_result_to); + +// The upper limit for valid stack trace depths. 
+const int kMaxStackTraceDepth = 100; + +namespace internal { + +class AssertHelper; +class DefaultGlobalTestPartResultReporter; +class ExecDeathTest; +class NoExecDeathTest; +class FinalSuccessChecker; +class GTestFlagSaver; +class StreamingListenerTest; +class TestResultAccessor; +class TestEventListenersAccessor; +class TestEventRepeater; +class UnitTestRecordPropertyTestHelper; +class WindowsDeathTest; +class UnitTestImpl* GetUnitTestImpl(); +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string& message); + +} // namespace internal + +// The friend relationship of some of these classes is cyclic. +// If we don't forward declare them the compiler might confuse the classes +// in friendship clauses with same named classes on the scope. +class Test; +class TestCase; +class TestInfo; +class UnitTest; + +// A class for indicating whether an assertion was successful. When +// the assertion wasn't successful, the AssertionResult object +// remembers a non-empty message that describes how it failed. +// +// To create an instance of this class, use one of the factory functions +// (AssertionSuccess() and AssertionFailure()). +// +// This class is useful for two purposes: +// 1. Defining predicate functions to be used with Boolean test assertions +// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts +// 2. Defining predicate-format functions to be +// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). 
+// +// For example, if you define IsEven predicate: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) +// will print the message +// +// Value of: IsEven(Fib(5)) +// Actual: false (5 is odd) +// Expected: true +// +// instead of a more opaque +// +// Value of: IsEven(Fib(5)) +// Actual: false +// Expected: true +// +// in case IsEven is a simple Boolean predicate. +// +// If you expect your predicate to be reused and want to support informative +// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up +// about half as often as positive ones in our tests), supply messages for +// both success and failure cases: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess() << n << " is even"; +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print +// +// Value of: IsEven(Fib(6)) +// Actual: true (8 is even) +// Expected: false +// +// NB: Predicates that support negative Boolean assertions have reduced +// performance in positive ones so be careful not to use them in tests +// that have lots (tens of thousands) of positive Boolean assertions. +// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. 
+// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult& other); + // Used in the EXPECT_TRUE/FALSE(bool_expression). + explicit AssertionResult(bool success) : success_(success) {} + + // Returns true iff the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. + AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char* message() const { + return message_.get() != NULL ? message_->c_str() : ""; + } + // TODO(vladl@google.com): Remove this after making sure no clients use it. + // Deprecated; please use message() instead. + const char* failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template <typename T> AssertionResult& operator<<(const T& value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult& operator<<( + ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. 
+ void AppendMessage(const Message& a_message) { + if (message_.get() == NULL) + message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. + // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + internal::scoped_ptr< ::std::string> message_; + + GTEST_DISALLOW_ASSIGN_(AssertionResult); +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. +GTEST_API_ AssertionResult AssertionFailure(const Message& msg); + +// The abstract class that all tests inherit from. +// +// In Google Test, a unit test program contains one or many TestCases, and +// each TestCase contains one or many Tests. +// +// When you define a test using the TEST macro, you don't need to +// explicitly derive from Test - the TEST macro automatically does +// this for you. +// +// The only time you derive from Test is when defining a test fixture +// to be used a TEST_F. For example: +// +// class FooTest : public testing::Test { +// protected: +// virtual void SetUp() { ... } +// virtual void TearDown() { ... } +// ... +// }; +// +// TEST_F(FooTest, Bar) { ... } +// TEST_F(FooTest, Baz) { ... } +// +// Test is not copyable. +class GTEST_API_ Test { + public: + friend class TestInfo; + + // Defines types for pointers to functions that set up and tear down + // a test case. + typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc; + typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc; + + // The d'tor is virtual as we intend to inherit from Test. 
+ virtual ~Test(); + + // Sets up the stuff shared by all tests in this test case. + // + // Google Test will call Foo::SetUpTestCase() before running the first + // test in test case Foo. Hence a sub-class can define its own + // SetUpTestCase() method to shadow the one defined in the super + // class. + static void SetUpTestCase() {} + + // Tears down the stuff shared by all tests in this test case. + // + // Google Test will call Foo::TearDownTestCase() after running the last + // test in test case Foo. Hence a sub-class can define its own + // TearDownTestCase() method to shadow the one defined in the super + // class. + static void TearDownTestCase() {} + + // Returns true iff the current test has a fatal failure. + static bool HasFatalFailure(); + + // Returns true iff the current test has a non-fatal failure. + static bool HasNonfatalFailure(); + + // Returns true iff the current test has a (either fatal or + // non-fatal) failure. + static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); } + + // Logs a property for the current test, test case, or for the entire + // invocation of the test program when used outside of the context of a + // test case. Only the last value for a given key is remembered. These + // are public static so they can be called from utility functions that are + // not members of the test fixture. Calls to RecordProperty made during + // lifespan of the test (from the moment its constructor starts to the + // moment its destructor finishes) will be output in XML as attributes of + // the <testcase> element. Properties recorded from fixture's + // SetUpTestCase or TearDownTestCase are logged as attributes of the + // corresponding <testsuite> element. Calls to RecordProperty made in the + // global context (before or after invocation of RUN_ALL_TESTS and from + // SetUp/TearDown method of Environment objects registered with Google + // Test) will be output as attributes of the <testsuites> element. 
+ static void RecordProperty(const std::string& key, const std::string& value); + static void RecordProperty(const std::string& key, int value); + + protected: + // Creates a Test object. + Test(); + + // Sets up the test fixture. + virtual void SetUp(); + + // Tears down the test fixture. + virtual void TearDown(); + + private: + // Returns true iff the current test has the same fixture class as + // the first test in the current test case. + static bool HasSameFixtureClass(); + + // Runs the test after the test fixture has been set up. + // + // A sub-class must implement this to define the test logic. + // + // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM. + // Instead, use the TEST or TEST_F macro. + virtual void TestBody() = 0; + + // Sets up, executes, and tears down the test. + void Run(); + + // Deletes self. We deliberately pick an unusual name for this + // internal method to avoid clashing with names used in user TESTs. + void DeleteSelf_() { delete this; } + + // Uses a GTestFlagSaver to save and restore all Google Test flags. + const internal::GTestFlagSaver* const gtest_flag_saver_; + + // Often a user mis-spells SetUp() as Setup() and spends a long time + // wondering why it is never called by Google Test. The declaration of + // the following method is solely for catching such an error at + // compile time: + // + // - The return type is deliberately chosen to be not void, so it + // will be a conflict if a user declares void Setup() in his test + // fixture. + // + // - This method is private, so it will be another compiler error + // if a user calls it from his test fixture. + // + // DO NOT OVERRIDE THIS FUNCTION. + // + // If you see an error about overriding the following function or + // about it being private, you have mis-spelled SetUp() as Setup(). + struct Setup_should_be_spelled_SetUp {}; + virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } + + // We disallow copying Tests. 
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); +}; + +typedef internal::TimeInMillis TimeInMillis; + +// A copyable object representing a user specified test property which can be +// output as a key/value string pair. +// +// Don't inherit from TestProperty as its destructor is not virtual. +class TestProperty { + public: + // C'tor. TestProperty does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestProperty object. + TestProperty(const std::string& a_key, const std::string& a_value) : + key_(a_key), value_(a_value) { + } + + // Gets the user supplied key. + const char* key() const { + return key_.c_str(); + } + + // Gets the user supplied value. + const char* value() const { + return value_.c_str(); + } + + // Sets a new value, overriding the one supplied in the constructor. + void SetValue(const std::string& new_value) { + value_ = new_value; + } + + private: + // The key supplied by the user. + std::string key_; + // The value supplied by the user. + std::string value_; +}; + +// The result of a single Test. This includes a list of +// TestPartResults, a list of TestProperties, a count of how many +// death tests there are in the Test, and how much time it took to run +// the Test. +// +// TestResult is not copyable. +class GTEST_API_ TestResult { + public: + // Creates an empty TestResult. + TestResult(); + + // D'tor. Do not inherit from TestResult. + ~TestResult(); + + // Gets the number of all test parts. This is the sum of the number + // of successful test parts and the number of failed test parts. + int total_part_count() const; + + // Returns the number of the test properties. + int test_property_count() const; + + // Returns true iff the test passed (i.e. no test part failed). + bool Passed() const { return !Failed(); } + + // Returns true iff the test failed. + bool Failed() const; + + // Returns true iff the test fatally failed. 
+ bool HasFatalFailure() const; + + // Returns true iff the test has a non-fatal failure. + bool HasNonfatalFailure() const; + + // Returns the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns the i-th test part result among all the results. i can range + // from 0 to test_property_count() - 1. If i is not in that range, aborts + // the program. + const TestPartResult& GetTestPartResult(int i) const; + + // Returns the i-th test property. i can range from 0 to + // test_property_count() - 1. If i is not in that range, aborts the + // program. + const TestProperty& GetTestProperty(int i) const; + + private: + friend class TestInfo; + friend class TestCase; + friend class UnitTest; + friend class internal::DefaultGlobalTestPartResultReporter; + friend class internal::ExecDeathTest; + friend class internal::TestResultAccessor; + friend class internal::UnitTestImpl; + friend class internal::WindowsDeathTest; + + // Gets the vector of TestPartResults. + const std::vector<TestPartResult>& test_part_results() const { + return test_part_results_; + } + + // Gets the vector of TestProperties. + const std::vector<TestProperty>& test_properties() const { + return test_properties_; + } + + // Sets the elapsed time. + void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; } + + // Adds a test property to the list. The property is validated and may add + // a non-fatal failure if invalid (e.g., if it conflicts with reserved + // key names). If a property is already recorded for the same key, the + // value will be updated, rather than storing multiple values for the same + // key. xml_element specifies the element for which the property is being + // recorded and is used for validation. + void RecordProperty(const std::string& xml_element, + const TestProperty& test_property); + + // Adds a failure if the key is a reserved attribute of Google Test + // testcase tags. Returns true if the property is valid. 
+ // TODO(russr): Validate attribute names are legal and human readable. + static bool ValidateTestProperty(const std::string& xml_element, + const TestProperty& test_property); + + // Adds a test part result to the list. + void AddTestPartResult(const TestPartResult& test_part_result); + + // Returns the death test count. + int death_test_count() const { return death_test_count_; } + + // Increments the death test count, returning the new count. + int increment_death_test_count() { return ++death_test_count_; } + + // Clears the test part results. + void ClearTestPartResults(); + + // Clears the object. + void Clear(); + + // Protects mutable state of the property vector and of owned + // properties, whose values may be updated. + internal::Mutex test_properites_mutex_; + + // The vector of TestPartResults + std::vector<TestPartResult> test_part_results_; + // The vector of TestProperties + std::vector<TestProperty> test_properties_; + // Running count of death tests. + int death_test_count_; + // The elapsed time, in milliseconds. + TimeInMillis elapsed_time_; + + // We disallow copying TestResult. + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult); +}; // class TestResult + +// A TestInfo object stores the following information about a test: +// +// Test case name +// Test name +// Whether the test should be run +// A function pointer that creates the test object when invoked +// Test result +// +// The constructor of TestInfo registers itself with the UnitTest +// singleton such that the RUN_ALL_TESTS() macro knows which tests to +// run. +class GTEST_API_ TestInfo { + public: + // Destructs a TestInfo object. This function is not virtual, so + // don't inherit from TestInfo. + ~TestInfo(); + + // Returns the test case name. + const char* test_case_name() const { return test_case_name_.c_str(); } + + // Returns the test name. 
+ const char* name() const { return name_.c_str(); } + + // Returns the name of the parameter type, or NULL if this is not a typed + // or a type-parameterized test. + const char* type_param() const { + if (type_param_.get() != NULL) + return type_param_->c_str(); + return NULL; + } + + // Returns the text representation of the value parameter, or NULL if this + // is not a value-parameterized test. + const char* value_param() const { + if (value_param_.get() != NULL) + return value_param_->c_str(); + return NULL; + } + + // Returns true if this test should run, that is if the test is not + // disabled (or it is disabled but the also_run_disabled_tests flag has + // been specified) and its full name matches the user-specified filter. + // + // Google Test allows the user to filter the tests by their full names. + // The full name of a test Bar in test case Foo is defined as + // "Foo.Bar". Only the tests that match the filter will run. + // + // A filter is a colon-separated list of glob (not regex) patterns, + // optionally followed by a '-' and a colon-separated list of + // negative patterns (tests to exclude). A test is run if it + // matches one of the positive patterns and does not match any of + // the negative patterns. + // + // For example, *A*:Foo.* is a filter that matches any string that + // contains the character 'A' or starts with "Foo.". + bool should_run() const { return should_run_; } + + // Returns true iff this test will appear in the XML report. + bool is_reportable() const { + // For now, the XML report includes all tests matching the filter. + // In the future, we may trim tests that are excluded because of + // sharding. + return matches_filter_; + } + + // Returns the result of the test. 
+ const TestResult* result() const { return &result_; } + + private: +#if GTEST_HAS_DEATH_TEST + friend class internal::DefaultDeathTestFactory; +#endif // GTEST_HAS_DEATH_TEST + friend class Test; + friend class TestCase; + friend class internal::UnitTestImpl; + friend class internal::StreamingListenerTest; + friend TestInfo* internal::MakeAndRegisterTestInfo( + const char* test_case_name, + const char* name, + const char* type_param, + const char* value_param, + internal::TypeId fixture_class_id, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc, + internal::TestFactoryBase* factory); + + // Constructs a TestInfo object. The newly constructed instance assumes + // ownership of the factory object. + TestInfo(const std::string& test_case_name, + const std::string& name, + const char* a_type_param, // NULL if not a type-parameterized test + const char* a_value_param, // NULL if not a value-parameterized test + internal::TypeId fixture_class_id, + internal::TestFactoryBase* factory); + + // Increments the number of death tests encountered in this test so + // far. + int increment_death_test_count() { + return result_.increment_death_test_count(); + } + + // Creates the test object, runs it, records its result, and then + // deletes it. + void Run(); + + static void ClearTestResult(TestInfo* test_info) { + test_info->result_.Clear(); + } + + // These fields are immutable properties of the test. + const std::string test_case_name_; // Test case name + const std::string name_; // Test name + // Name of the parameter type, or NULL if this is not a typed or a + // type-parameterized test. + const internal::scoped_ptr<const ::std::string> type_param_; + // Text representation of the value parameter, or NULL if this is not a + // value-parameterized test. 
+ const internal::scoped_ptr<const ::std::string> value_param_; + const internal::TypeId fixture_class_id_; // ID of the test fixture class + bool should_run_; // True iff this test should run + bool is_disabled_; // True iff this test is disabled + bool matches_filter_; // True if this test matches the + // user-specified filter. + internal::TestFactoryBase* const factory_; // The factory that creates + // the test object + + // This field is mutable and needs to be reset before running the + // test for the second time. + TestResult result_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo); +}; + +// A test case, which consists of a vector of TestInfos. +// +// TestCase is not copyable. +class GTEST_API_ TestCase { + public: + // Creates a TestCase with the given name. + // + // TestCase does NOT have a default constructor. Always use this + // constructor to create a TestCase object. + // + // Arguments: + // + // name: name of the test case + // a_type_param: the name of the test's type parameter, or NULL if + // this is not a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + TestCase(const char* name, const char* a_type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc); + + // Destructor of TestCase. + virtual ~TestCase(); + + // Gets the name of the TestCase. + const char* name() const { return name_.c_str(); } + + // Returns the name of the parameter type, or NULL if this is not a + // type-parameterized test case. + const char* type_param() const { + if (type_param_.get() != NULL) + return type_param_->c_str(); + return NULL; + } + + // Returns true if any test in this test case should run. + bool should_run() const { return should_run_; } + + // Gets the number of successful tests in this test case. + int successful_test_count() const; + + // Gets the number of failed tests in this test case. 
+ int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests in this test case. + int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Get the number of tests in this test case that should run. + int test_to_run_count() const; + + // Gets the number of all tests in this test case. + int total_test_count() const; + + // Returns true iff the test case passed. + bool Passed() const { return !Failed(); } + + // Returns true iff the test case failed. + bool Failed() const { return failed_test_count() > 0; } + + // Returns the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns the i-th test among all the tests. i can range from 0 to + // total_test_count() - 1. If i is not in that range, returns NULL. + const TestInfo* GetTestInfo(int i) const; + + // Returns the TestResult that holds test properties recorded during + // execution of SetUpTestCase and TearDownTestCase. + const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; } + + private: + friend class Test; + friend class internal::UnitTestImpl; + + // Gets the (mutable) vector of TestInfos in this TestCase. + std::vector<TestInfo*>& test_info_list() { return test_info_list_; } + + // Gets the (immutable) vector of TestInfos in this TestCase. + const std::vector<TestInfo*>& test_info_list() const { + return test_info_list_; + } + + // Returns the i-th test among all the tests. i can range from 0 to + // total_test_count() - 1. If i is not in that range, returns NULL. + TestInfo* GetMutableTestInfo(int i); + + // Sets the should_run member. + void set_should_run(bool should) { should_run_ = should; } + + // Adds a TestInfo to this test case. Will delete the TestInfo upon + // destruction of the TestCase object. 
+ void AddTestInfo(TestInfo * test_info); + + // Clears the results of all tests in this test case. + void ClearResult(); + + // Clears the results of all tests in the given test case. + static void ClearTestCaseResult(TestCase* test_case) { + test_case->ClearResult(); + } + + // Runs every test in this TestCase. + void Run(); + + // Runs SetUpTestCase() for this TestCase. This wrapper is needed + // for catching exceptions thrown from SetUpTestCase(). + void RunSetUpTestCase() { (*set_up_tc_)(); } + + // Runs TearDownTestCase() for this TestCase. This wrapper is + // needed for catching exceptions thrown from TearDownTestCase(). + void RunTearDownTestCase() { (*tear_down_tc_)(); } + + // Returns true iff test passed. + static bool TestPassed(const TestInfo* test_info) { + return test_info->should_run() && test_info->result()->Passed(); + } + + // Returns true iff test failed. + static bool TestFailed(const TestInfo* test_info) { + return test_info->should_run() && test_info->result()->Failed(); + } + + // Returns true iff the test is disabled and will be reported in the XML + // report. + static bool TestReportableDisabled(const TestInfo* test_info) { + return test_info->is_reportable() && test_info->is_disabled_; + } + + // Returns true iff test is disabled. + static bool TestDisabled(const TestInfo* test_info) { + return test_info->is_disabled_; + } + + // Returns true iff this test will appear in the XML report. + static bool TestReportable(const TestInfo* test_info) { + return test_info->is_reportable(); + } + + // Returns true if the given test should run. + static bool ShouldRunTest(const TestInfo* test_info) { + return test_info->should_run(); + } + + // Shuffles the tests in this test case. + void ShuffleTests(internal::Random* random); + + // Restores the test order to before the first shuffle. + void UnshuffleTests(); + + // Name of the test case. 
+ std::string name_; + // Name of the parameter type, or NULL if this is not a typed or a + // type-parameterized test. + const internal::scoped_ptr<const ::std::string> type_param_; + // The vector of TestInfos in their original order. It owns the + // elements in the vector. + std::vector<TestInfo*> test_info_list_; + // Provides a level of indirection for the test list to allow easy + // shuffling and restoring the test order. The i-th element in this + // vector is the index of the i-th test in the shuffled test list. + std::vector<int> test_indices_; + // Pointer to the function that sets up the test case. + Test::SetUpTestCaseFunc set_up_tc_; + // Pointer to the function that tears down the test case. + Test::TearDownTestCaseFunc tear_down_tc_; + // True iff any test in this test case should run. + bool should_run_; + // Elapsed time, in milliseconds. + TimeInMillis elapsed_time_; + // Holds test properties recorded during execution of SetUpTestCase and + // TearDownTestCase. + TestResult ad_hoc_test_result_; + + // We disallow copying TestCases. + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase); +}; + +// An Environment object is capable of setting up and tearing down an +// environment. The user should subclass this to define his own +// environment(s). +// +// An Environment object does the set-up and tear-down in virtual +// methods SetUp() and TearDown() instead of the constructor and the +// destructor, as: +// +// 1. You cannot safely throw from a destructor. This is a problem +// as in some cases Google Test is used where exceptions are enabled, and +// we may want to implement ASSERT_* using exceptions where they are +// available. +// 2. You cannot use ASSERT_* directly in a constructor or +// destructor. +class Environment { + public: + // The d'tor is virtual as we need to subclass Environment. + virtual ~Environment() {} + + // Override this to define how to set up the environment. 
+ virtual void SetUp() {} + + // Override this to define how to tear down the environment. + virtual void TearDown() {} + private: + // If you see an error about overriding the following function or + // about it being private, you have mis-spelled SetUp() as Setup(). + struct Setup_should_be_spelled_SetUp {}; + virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } +}; + +// The interface for tracing execution of tests. The methods are organized in +// the order the corresponding events are fired. +class TestEventListener { + public: + virtual ~TestEventListener() {} + + // Fired before any test activity starts. + virtual void OnTestProgramStart(const UnitTest& unit_test) = 0; + + // Fired before each iteration of tests starts. There may be more than + // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration + // index, starting from 0. + virtual void OnTestIterationStart(const UnitTest& unit_test, + int iteration) = 0; + + // Fired before environment set-up for each iteration of tests starts. + virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0; + + // Fired after environment set-up for each iteration of tests ends. + virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0; + + // Fired before the test case starts. + virtual void OnTestCaseStart(const TestCase& test_case) = 0; + + // Fired before the test starts. + virtual void OnTestStart(const TestInfo& test_info) = 0; + + // Fired after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0; + + // Fired after the test ends. + virtual void OnTestEnd(const TestInfo& test_info) = 0; + + // Fired after the test case ends. + virtual void OnTestCaseEnd(const TestCase& test_case) = 0; + + // Fired before environment tear-down for each iteration of tests starts. 
+ virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0; + + // Fired after environment tear-down for each iteration of tests ends. + virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0; + + // Fired after each iteration of tests finishes. + virtual void OnTestIterationEnd(const UnitTest& unit_test, + int iteration) = 0; + + // Fired after all test activities have ended. + virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0; +}; + +// The convenience class for users who need to override just one or two +// methods and are not concerned that a possible change to a signature of +// the methods they override will not be caught during the build. For +// comments about each method please see the definition of TestEventListener +// above. +class EmptyTestEventListener : public TestEventListener { + public: + virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) {} + virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {} + virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} + virtual void OnTestStart(const TestInfo& /*test_info*/) {} + virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {} + virtual void OnTestEnd(const TestInfo& /*test_info*/) {} + virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} + virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {} + virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/, + int /*iteration*/) {} + virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} +}; + +// TestEventListeners lets users add listeners to track events in Google Test. 
+class GTEST_API_ TestEventListeners { + public: + TestEventListeners(); + ~TestEventListeners(); + + // Appends an event listener to the end of the list. Google Test assumes + // the ownership of the listener (i.e. it will delete the listener when + // the test program finishes). + void Append(TestEventListener* listener); + + // Removes the given event listener from the list and returns it. It then + // becomes the caller's responsibility to delete the listener. Returns + // NULL if the listener is not found in the list. + TestEventListener* Release(TestEventListener* listener); + + // Returns the standard listener responsible for the default console + // output. Can be removed from the listeners list to shut down default + // console output. Note that removing this object from the listener list + // with Release transfers its ownership to the caller and makes this + // function return NULL the next time. + TestEventListener* default_result_printer() const { + return default_result_printer_; + } + + // Returns the standard listener responsible for the default XML output + // controlled by the --gtest_output=xml flag. Can be removed from the + // listeners list by users who want to shut down the default XML output + // controlled by this flag and substitute it with custom one. Note that + // removing this object from the listener list with Release transfers its + // ownership to the caller and makes this function return NULL the next + // time. + TestEventListener* default_xml_generator() const { + return default_xml_generator_; + } + + private: + friend class TestCase; + friend class TestInfo; + friend class internal::DefaultGlobalTestPartResultReporter; + friend class internal::NoExecDeathTest; + friend class internal::TestEventListenersAccessor; + friend class internal::UnitTestImpl; + + // Returns repeater that broadcasts the TestEventListener events to all + // subscribers. 
+ TestEventListener* repeater(); + + // Sets the default_result_printer attribute to the provided listener. + // The listener is also added to the listener list and previous + // default_result_printer is removed from it and deleted. The listener can + // also be NULL in which case it will not be added to the list. Does + // nothing if the previous and the current listener objects are the same. + void SetDefaultResultPrinter(TestEventListener* listener); + + // Sets the default_xml_generator attribute to the provided listener. The + // listener is also added to the listener list and previous + // default_xml_generator is removed from it and deleted. The listener can + // also be NULL in which case it will not be added to the list. Does + // nothing if the previous and the current listener objects are the same. + void SetDefaultXmlGenerator(TestEventListener* listener); + + // Controls whether events will be forwarded by the repeater to the + // listeners in the list. + bool EventForwardingEnabled() const; + void SuppressEventForwarding(); + + // The actual list of listeners. + internal::TestEventRepeater* repeater_; + // Listener responsible for the standard result output. + TestEventListener* default_result_printer_; + // Listener responsible for the creation of the XML output file. + TestEventListener* default_xml_generator_; + + // We disallow copying TestEventListeners. + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners); +}; + +// A UnitTest consists of a vector of TestCases. +// +// This is a singleton class. The only instance of UnitTest is +// created when UnitTest::GetInstance() is first called. This +// instance is never deleted. +// +// UnitTest is not copyable. +// +// This class is thread-safe as long as the methods are called +// according to their specification. +class GTEST_API_ UnitTest { + public: + // Gets the singleton UnitTest object. The first time this method + // is called, a UnitTest object is constructed and returned. 
+ // Consecutive calls will return the same object. + static UnitTest* GetInstance(); + + // Runs all tests in this UnitTest object and prints the result. + // Returns 0 if successful, or 1 otherwise. + // + // This method can only be called from the main thread. + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + int Run() GTEST_MUST_USE_RESULT_; + + // Returns the working directory when the first TEST() or TEST_F() + // was executed. The UnitTest object owns the string. + const char* original_working_dir() const; + + // Returns the TestCase object for the test that's currently running, + // or NULL if no test is running. + const TestCase* current_test_case() const + GTEST_LOCK_EXCLUDED_(mutex_); + + // Returns the TestInfo object for the test that's currently running, + // or NULL if no test is running. + const TestInfo* current_test_info() const + GTEST_LOCK_EXCLUDED_(mutex_); + + // Returns the random seed used at the start of the current test run. + int random_seed() const; + +#if GTEST_HAS_PARAM_TEST + // Returns the ParameterizedTestCaseRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + internal::ParameterizedTestCaseRegistry& parameterized_test_registry() + GTEST_LOCK_EXCLUDED_(mutex_); +#endif // GTEST_HAS_PARAM_TEST + + // Gets the number of successful test cases. + int successful_test_case_count() const; + + // Gets the number of failed test cases. + int failed_test_case_count() const; + + // Gets the number of all test cases. + int total_test_case_count() const; + + // Gets the number of all test cases that contain at least one test + // that should run. + int test_case_to_run_count() const; + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of failed tests. 
+ int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests. + int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Gets the number of all tests. + int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const; + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const; + + // Returns true iff the unit test passed (i.e. all test cases passed). + bool Passed() const; + + // Returns true iff the unit test failed (i.e. some test case failed + // or something outside of all tests failed). + bool Failed() const; + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + const TestCase* GetTestCase(int i) const; + + // Returns the TestResult containing information on test failures and + // properties logged outside of individual test cases. + const TestResult& ad_hoc_test_result() const; + + // Returns the list of event listeners that can be used to track events + // inside Google Test. + TestEventListeners& listeners(); + + private: + // Registers and returns a global test environment. When a test + // program is run, all global test environments will be set-up in + // the order they were registered. After all tests in the program + // have finished, all global test environments will be torn-down in + // the *reverse* order they were registered. + // + // The UnitTest object takes ownership of the given environment. + // + // This method can only be called from the main thread. 
+ Environment* AddEnvironment(Environment* env); + + // Adds a TestPartResult to the current TestResult object. All + // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) + // eventually call this to report their results. The user code + // should use the assertion macros instead of calling this directly. + void AddTestPartResult(TestPartResult::Type result_type, + const char* file_name, + int line_number, + const std::string& message, + const std::string& os_stack_trace) + GTEST_LOCK_EXCLUDED_(mutex_); + + // Adds a TestProperty to the current TestResult object when invoked from + // inside a test, to current TestCase's ad_hoc_test_result_ when invoked + // from SetUpTestCase or TearDownTestCase, or to the global property set + // when invoked elsewhere. If the result already contains a property with + // the same key, the value will be updated. + void RecordProperty(const std::string& key, const std::string& value); + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + TestCase* GetMutableTestCase(int i); + + // Accessors for the implementation object. + internal::UnitTestImpl* impl() { return impl_; } + const internal::UnitTestImpl* impl() const { return impl_; } + + // These classes and functions are friends as they need to access private + // members of UnitTest. + friend class Test; + friend class internal::AssertHelper; + friend class internal::ScopedTrace; + friend class internal::StreamingListenerTest; + friend class internal::UnitTestRecordPropertyTestHelper; + friend Environment* AddGlobalTestEnvironment(Environment* env); + friend internal::UnitTestImpl* internal::GetUnitTestImpl(); + friend void internal::ReportFailureInUnknownLocation( + TestPartResult::Type result_type, + const std::string& message); + + // Creates an empty UnitTest. 
+ UnitTest(); + + // D'tor + virtual ~UnitTest(); + + // Pushes a trace defined by SCOPED_TRACE() on to the per-thread + // Google Test trace stack. + void PushGTestTrace(const internal::TraceInfo& trace) + GTEST_LOCK_EXCLUDED_(mutex_); + + // Pops a trace from the per-thread Google Test trace stack. + void PopGTestTrace() + GTEST_LOCK_EXCLUDED_(mutex_); + + // Protects mutable state in *impl_. This is mutable as some const + // methods need to lock it too. + mutable internal::Mutex mutex_; + + // Opaque implementation object. This field is never changed once + // the object is constructed. We don't mark it as const here, as + // doing so will cause a warning in the constructor of UnitTest. + // Mutable state in *impl_ is protected by mutex_. + internal::UnitTestImpl* impl_; + + // We disallow copying UnitTest. + GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest); +}; + +// A convenient wrapper for adding an environment for the test +// program. +// +// You should call this before RUN_ALL_TESTS() is called, probably in +// main(). If you use gtest_main, you need to call this before main() +// starts for it to take effect. For example, you can define a global +// variable like this: +// +// testing::Environment* const foo_env = +// testing::AddGlobalTestEnvironment(new FooEnvironment); +// +// However, we strongly recommend you to write your own main() and +// call AddGlobalTestEnvironment() there, as relying on initialization +// of global variables makes the code harder to read and may cause +// problems when you register multiple environments from different +// translation units and the environments have dependencies among them +// (remember that the compiler doesn't guarantee the order in which +// global variables from different translation units are initialized). +inline Environment* AddGlobalTestEnvironment(Environment* env) { + return UnitTest::GetInstance()->AddEnvironment(env); +} + +// Initializes Google Test. 
This must be called before calling +// RUN_ALL_TESTS(). In particular, it parses a command line for the +// flags that Google Test recognizes. Whenever a Google Test flag is +// seen, it is removed from argv, and *argc is decremented. +// +// No value is returned. Instead, the Google Test flag variables are +// updated. +// +// Calling the function for the second time has no user-visible effect. +GTEST_API_ void InitGoogleTest(int* argc, char** argv); + +// This overloaded version can be used in Windows programs compiled in +// UNICODE mode. +GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv); + +namespace internal { + +// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a +// value of type ToPrint that is an operand of a comparison assertion +// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in +// the comparison, and is used to help determine the best way to +// format the value. In particular, when the value is a C string +// (char pointer) and the other operand is an STL string object, we +// want to format the C string as a string, since we know it is +// compared by value with the string object. If the value is a char +// pointer but the other operand is not an STL string object, we don't +// know whether the pointer is supposed to point to a NUL-terminated +// string, and thus want to print it as a pointer to be safe. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// The default case. +template <typename ToPrint, typename OtherOperand> +class FormatForComparison { + public: + static ::std::string Format(const ToPrint& value) { + return ::testing::PrintToString(value); + } +}; + +// Array. 
+template <typename ToPrint, size_t N, typename OtherOperand> +class FormatForComparison<ToPrint[N], OtherOperand> { + public: + static ::std::string Format(const ToPrint* value) { + return FormatForComparison<const ToPrint*, OtherOperand>::Format(value); + } +}; + +// By default, print C string as pointers to be safe, as we don't know +// whether they actually point to a NUL-terminated string. + +#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ + template <typename OtherOperand> \ + class FormatForComparison<CharType*, OtherOperand> { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(static_cast<const void*>(value)); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ + +// If a C string is compared with an STL string object, we know it's meant +// to point to a NUL-terminated string, and thus can print it as a string. 
+ +#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ + template <> \ + class FormatForComparison<CharType*, OtherStringType> { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(value); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); + +#if GTEST_HAS_GLOBAL_STRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string); +#endif + +#if GTEST_HAS_GLOBAL_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring); +#endif + +#if GTEST_HAS_STD_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); +#endif + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ + +// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) +// operand to be used in a failure message. The type (but not value) +// of the other operand may affect the format. This allows us to +// print a char* as a raw pointer when it is compared against another +// char* or void*, and print it as a C string when it is compared +// against an std::string object, for example. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +template <typename T1, typename T2> +std::string FormatForComparisonFailureMessage( + const T1& value, const T2& /* other_operand */) { + return FormatForComparison<T1, T2>::Format(value); +} + +// The helper function for {ASSERT|EXPECT}_EQ. +template <typename T1, typename T2> +AssertionResult CmpHelperEQ(const char* expected_expression, + const char* actual_expression, + const T1& expected, + const T2& actual) { +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4389) // Temporarily disables warning on + // signed/unsigned mismatch. 
+#endif + + if (expected == actual) { + return AssertionSuccess(); + } + +#ifdef _MSC_VER +# pragma warning(pop) // Restores the warning state. +#endif + + return EqFailure(expected_expression, + actual_expression, + FormatForComparisonFailureMessage(expected, actual), + FormatForComparisonFailureMessage(actual, expected), + false); +} + +// With this overloaded version, we allow anonymous enums to be used +// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums +// can be implicitly cast to BiggestInt. +GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression, + const char* actual_expression, + BiggestInt expected, + BiggestInt actual); + +// The helper class for {ASSERT|EXPECT}_EQ. The template argument +// lhs_is_null_literal is true iff the first argument to ASSERT_EQ() +// is a null pointer literal. The following default implementation is +// for lhs_is_null_literal being false. +template <bool lhs_is_null_literal> +class EqHelper { + public: + // This templatized version is for the general case. + template <typename T1, typename T2> + static AssertionResult Compare(const char* expected_expression, + const char* actual_expression, + const T1& expected, + const T2& actual) { + return CmpHelperEQ(expected_expression, actual_expression, expected, + actual); + } + + // With this overloaded version, we allow anonymous enums to be used + // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous + // enums can be implicitly cast to BiggestInt. + // + // Even though its body looks the same as the above version, we + // cannot merge the two, as it will make anonymous enums unhappy. 
+ static AssertionResult Compare(const char* expected_expression, + const char* actual_expression, + BiggestInt expected, + BiggestInt actual) { + return CmpHelperEQ(expected_expression, actual_expression, expected, + actual); + } +}; + +// This specialization is used when the first argument to ASSERT_EQ() +// is a null pointer literal, like NULL, false, or 0. +template <> +class EqHelper<true> { + public: + // We define two overloaded versions of Compare(). The first + // version will be picked when the second argument to ASSERT_EQ() is + // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or + // EXPECT_EQ(false, a_bool). + template <typename T1, typename T2> + static AssertionResult Compare( + const char* expected_expression, + const char* actual_expression, + const T1& expected, + const T2& actual, + // The following line prevents this overload from being considered if T2 + // is not a pointer type. We need this because ASSERT_EQ(NULL, my_ptr) + // expands to Compare("", "", NULL, my_ptr), which requires a conversion + // to match the Secret* in the other overload, which would otherwise make + // this template match better. + typename EnableIf<!is_pointer<T2>::value>::type* = 0) { + return CmpHelperEQ(expected_expression, actual_expression, expected, + actual); + } + + // This version will be picked when the second argument to ASSERT_EQ() is a + // pointer, e.g. ASSERT_EQ(NULL, a_pointer). + template <typename T> + static AssertionResult Compare( + const char* expected_expression, + const char* actual_expression, + // We used to have a second template parameter instead of Secret*. That + // template parameter would deduce to 'long', making this a better match + // than the first overload even without the first overload's EnableIf. + // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to + // non-pointer argument" (even a deduced integral argument), so the old + // implementation caused warnings in user code. 
+ Secret* /* expected (NULL) */, + T* actual) { + // We already know that 'expected' is a null pointer. + return CmpHelperEQ(expected_expression, actual_expression, + static_cast<T*>(NULL), actual); + } +}; + +// A macro for implementing the helper functions needed to implement +// ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste +// of similar code. +// +// For each templatized helper function, we also define an overloaded +// version for BiggestInt in order to reduce code bloat and allow +// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled +// with gcc 4. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ +template <typename T1, typename T2>\ +AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + const T1& val1, const T2& val2) {\ + if (val1 op val2) {\ + return AssertionSuccess();\ + } else {\ + return AssertionFailure() \ + << "Expected: (" << expr1 << ") " #op " (" << expr2\ + << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ + << " vs " << FormatForComparisonFailureMessage(val2, val1);\ + }\ +}\ +GTEST_API_ AssertionResult CmpHelper##op_name(\ + const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2) + +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// Implements the helper function for {ASSERT|EXPECT}_NE +GTEST_IMPL_CMP_HELPER_(NE, !=); +// Implements the helper function for {ASSERT|EXPECT}_LE +GTEST_IMPL_CMP_HELPER_(LE, <=); +// Implements the helper function for {ASSERT|EXPECT}_LT +GTEST_IMPL_CMP_HELPER_(LT, <); +// Implements the helper function for {ASSERT|EXPECT}_GE +GTEST_IMPL_CMP_HELPER_(GE, >=); +// Implements the helper function for {ASSERT|EXPECT}_GT +GTEST_IMPL_CMP_HELPER_(GT, >); + +#undef GTEST_IMPL_CMP_HELPER_ + +// The helper function for {ASSERT|EXPECT}_STREQ. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, + const char* actual_expression, + const char* expected, + const char* actual); + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, + const char* actual_expression, + const char* expected, + const char* actual); + +// The helper function for {ASSERT|EXPECT}_STRNE. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); + +// The helper function for {ASSERT|EXPECT}_STRCASENE. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); + + +// Helper function for *_STREQ on wide strings. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, + const char* actual_expression, + const wchar_t* expected, + const wchar_t* actual); + +// Helper function for *_STRNE on wide strings. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2); + +} // namespace internal + +// IsSubstring() and IsNotSubstring() are intended to be used as the +// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by +// themselves. They check whether needle is a substring of haystack +// (NULL is considered a substring of itself only), and return an +// appropriate error message when they fail. +// +// The {needle,haystack}_expr arguments are the stringified +// expressions that generated the two real arguments. 
+GTEST_API_ AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack); +GTEST_API_ AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack); +GTEST_API_ AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack); +GTEST_API_ AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack); +GTEST_API_ AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack); +GTEST_API_ AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack); + +#if GTEST_HAS_STD_WSTRING +GTEST_API_ AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack); +#endif // GTEST_HAS_STD_WSTRING + +namespace internal { + +// Helper template function for comparing floating-points. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
+template <typename RawType> +AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression, + const char* actual_expression, + RawType expected, + RawType actual) { + const FloatingPoint<RawType> lhs(expected), rhs(actual); + + if (lhs.AlmostEquals(rhs)) { + return AssertionSuccess(); + } + + ::std::stringstream expected_ss; + expected_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2) + << expected; + + ::std::stringstream actual_ss; + actual_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2) + << actual; + + return EqFailure(expected_expression, + actual_expression, + StringStreamToString(&expected_ss), + StringStreamToString(&actual_ss), + false); +} + +// Helper function for implementing ASSERT_NEAR. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, + const char* expr2, + const char* abs_error_expr, + double val1, + double val2, + double abs_error); + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// A class that enables one to stream messages to assertion macros +class GTEST_API_ AssertHelper { + public: + // Constructor. + AssertHelper(TestPartResult::Type type, + const char* file, + int line, + const char* message); + ~AssertHelper(); + + // Message assignment is a semantic trick to enable assertion + // streaming; see the GTEST_MESSAGE_ macro below. + void operator=(const Message& message) const; + + private: + // We put our data in a struct so that the size of the AssertHelper class can + // be as small as possible. This is important because gcc is incapable of + // re-using stack space even for temporary variables, so every EXPECT_EQ + // reserves stack space for another AssertHelper. 
+ struct AssertHelperData { + AssertHelperData(TestPartResult::Type t, + const char* srcfile, + int line_num, + const char* msg) + : type(t), file(srcfile), line(line_num), message(msg) { } + + TestPartResult::Type const type; + const char* const file; + int const line; + std::string const message; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData); + }; + + AssertHelperData* const data_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper); +}; + +} // namespace internal + +#if GTEST_HAS_PARAM_TEST +// The pure interface class that all value-parameterized tests inherit from. +// A value-parameterized class must inherit from both ::testing::Test and +// ::testing::WithParamInterface. In most cases that just means inheriting +// from ::testing::TestWithParam, but more complicated test hierarchies +// may need to inherit from Test and WithParamInterface at different levels. +// +// This interface has support for accessing the test parameter value via +// the GetParam() method. +// +// Use it with one of the parameter generator defining functions, like Range(), +// Values(), ValuesIn(), Bool(), and Combine(). +// +// class FooTest : public ::testing::TestWithParam<int> { +// protected: +// FooTest() { +// // Can use GetParam() here. +// } +// virtual ~FooTest() { +// // Can use GetParam() here. +// } +// virtual void SetUp() { +// // Can use GetParam() here. +// } +// virtual void TearDown { +// // Can use GetParam() here. +// } +// }; +// TEST_P(FooTest, DoesBar) { +// // Can use GetParam() method here. +// Foo foo; +// ASSERT_TRUE(foo.DoesBar(GetParam())); +// } +// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10)); + +template <typename T> +class WithParamInterface { + public: + typedef T ParamType; + virtual ~WithParamInterface() {} + + // The current parameter value. Is also available in the test fixture's + // constructor. 
This member function is non-static, even though it only + // references static data, to reduce the opportunity for incorrect uses + // like writing 'WithParamInterface<bool>::GetParam()' for a test that + // uses a fixture whose parameter type is int. + const ParamType& GetParam() const { + GTEST_CHECK_(parameter_ != NULL) + << "GetParam() can only be called inside a value-parameterized test " + << "-- did you intend to write TEST_P instead of TEST_F?"; + return *parameter_; + } + + private: + // Sets parameter value. The caller is responsible for making sure the value + // remains alive and unchanged throughout the current test. + static void SetParam(const ParamType* parameter) { + parameter_ = parameter; + } + + // Static value used for accessing parameter during a test lifetime. + static const ParamType* parameter_; + + // TestClass must be a subclass of WithParamInterface<T> and Test. + template <class TestClass> friend class internal::ParameterizedTestFactory; +}; + +template <typename T> +const T* WithParamInterface<T>::parameter_ = NULL; + +// Most value-parameterized classes can ignore the existence of +// WithParamInterface, and can just inherit from ::testing::TestWithParam. + +template <typename T> +class TestWithParam : public Test, public WithParamInterface<T> { +}; + +#endif // GTEST_HAS_PARAM_TEST + +// Macros for indicating success/failure in test code. + +// ADD_FAILURE unconditionally adds a failure to the current test. +// SUCCEED generates a success - it doesn't automatically make the +// current test successful, as a test is only successful when it has +// no failure. +// +// EXPECT_* verifies that a certain condition is satisfied. If not, +// it behaves like ADD_FAILURE. In particular: +// +// EXPECT_TRUE verifies that a Boolean condition is true. +// EXPECT_FALSE verifies that a Boolean condition is false. +// +// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except +// that they will also abort the current function on failure. 
People +// usually want the fail-fast behavior of FAIL and ASSERT_*, but those +// writing data-driven tests often find themselves using ADD_FAILURE +// and EXPECT_* more. + +// Generates a nonfatal failure with a generic message. +#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed") + +// Generates a nonfatal failure at the given source file location with +// a generic message. +#define ADD_FAILURE_AT(file, line) \ + GTEST_MESSAGE_AT_(file, line, "Failed", \ + ::testing::TestPartResult::kNonFatalFailure) + +// Generates a fatal failure with a generic message. +#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed") + +// Define this macro to 1 to omit the definition of FAIL(), which is a +// generic name and clashes with some other libraries. +#if !GTEST_DONT_DEFINE_FAIL +# define FAIL() GTEST_FAIL() +#endif + +// Generates a success with a generic message. +#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded") + +// Define this macro to 1 to omit the definition of SUCCEED(), which +// is a generic name and clashes with some other libraries. +#if !GTEST_DONT_DEFINE_SUCCEED +# define SUCCEED() GTEST_SUCCEED() +#endif + +// Macros for testing exceptions. +// +// * {ASSERT|EXPECT}_THROW(statement, expected_exception): +// Tests that the statement throws the expected exception. +// * {ASSERT|EXPECT}_NO_THROW(statement): +// Tests that the statement doesn't throw any exception. +// * {ASSERT|EXPECT}_ANY_THROW(statement): +// Tests that the statement throws an exception. 
+ +#define EXPECT_THROW(statement, expected_exception) \ + GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_) +#define EXPECT_NO_THROW(statement) \ + GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_) +#define EXPECT_ANY_THROW(statement) \ + GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_) +#define ASSERT_THROW(statement, expected_exception) \ + GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_) +#define ASSERT_NO_THROW(statement) \ + GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_) +#define ASSERT_ANY_THROW(statement) \ + GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_) + +// Boolean assertions. Condition can be either a Boolean expression or an +// AssertionResult. For more information on how to use AssertionResult with +// these macros see comments on that class. +#define EXPECT_TRUE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ + GTEST_NONFATAL_FAILURE_) +#define EXPECT_FALSE(condition) \ + GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ + GTEST_NONFATAL_FAILURE_) +#define ASSERT_TRUE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ + GTEST_FATAL_FAILURE_) +#define ASSERT_FALSE(condition) \ + GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ + GTEST_FATAL_FAILURE_) + +// Includes the auto-generated header that implements a family of +// generic predicate assertion macros. +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command +// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! +// +// Implements a family of generic predicate assertion macros. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +// Makes sure this header is not included before gtest.h. +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. +#endif // GTEST_INCLUDE_GTEST_GTEST_H_ + +// This header implements a family of generic predicate assertion +// macros: +// +// ASSERT_PRED_FORMAT1(pred_format, v1) +// ASSERT_PRED_FORMAT2(pred_format, v1, v2) +// ... 
+// +// where pred_format is a function or functor that takes n (in the +// case of ASSERT_PRED_FORMATn) values and their source expression +// text, and returns a testing::AssertionResult. See the definition +// of ASSERT_EQ in gtest.h for an example. +// +// If you don't care about formatting, you can use the more +// restrictive version: +// +// ASSERT_PRED1(pred, v1) +// ASSERT_PRED2(pred, v1, v2) +// ... +// +// where pred is an n-ary function or functor that returns bool, +// and the values v1, v2, ..., must support the << operator for +// streaming to std::ostream. +// +// We also define the EXPECT_* variations. +// +// For now we only support predicates whose arity is at most 5. +// Please email googletestframework@googlegroups.com if you need +// support for higher arities. + +// GTEST_ASSERT_ is the basic statement to which all of the assertions +// in this file reduce. Don't use this in your code. + +#define GTEST_ASSERT_(expression, on_failure) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar = (expression)) \ + ; \ + else \ + on_failure(gtest_ar.failure_message()) + + +// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +template <typename Pred, + typename T1> +AssertionResult AssertPred1Helper(const char* pred_text, + const char* e1, + Pred pred, + const T1& v1) { + if (pred(v1)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. +// Don't use this in your code. +#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, v1), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. 
+#define GTEST_PRED1_(pred, v1, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ + #v1, \ + pred, \ + v1), on_failure) + +// Unary predicate assertion macros. +#define EXPECT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED1(pred, v1) \ + GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED1(pred, v1) \ + GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +template <typename Pred, + typename T1, + typename T2> +AssertionResult AssertPred2Helper(const char* pred_text, + const char* e1, + const char* e2, + Pred pred, + const T1& v1, + const T2& v2) { + if (pred(v1, v2)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. +// Don't use this in your code. +#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +#define GTEST_PRED2_(pred, v1, v2, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ + #v1, \ + #v2, \ + pred, \ + v1, \ + v2), on_failure) + +// Binary predicate assertion macros. 
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +template <typename Pred, + typename T1, + typename T2, + typename T3> +AssertionResult AssertPred3Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3) { + if (pred(v1, v2, v3)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. +// Don't use this in your code. +#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + pred, \ + v1, \ + v2, \ + v3), on_failure) + +// Ternary predicate assertion macros. 
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +template <typename Pred, + typename T1, + typename T2, + typename T3, + typename T4> +AssertionResult AssertPred4Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + const char* e4, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3, + const T4& v4) { + if (pred(v1, v2, v3, v4)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ", " + << e4 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3 + << "\n" << e4 << " evaluates to " << v4; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. +// Don't use this in your code. +#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + #v4, \ + pred, \ + v1, \ + v2, \ + v3, \ + v4), on_failure) + +// 4-ary predicate assertion macros. 
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +template <typename Pred, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5> +AssertionResult AssertPred5Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + const char* e4, + const char* e5, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3, + const T4& v4, + const T5& v5) { + if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ", " + << e4 << ", " + << e5 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3 + << "\n" << e4 << " evaluates to " << v4 + << "\n" << e5 << " evaluates to " << v5; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. +// Don't use this in your code. +#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + #v4, \ + #v5, \ + pred, \ + v1, \ + v2, \ + v3, \ + v4, \ + v5), on_failure) + +// 5-ary predicate assertion macros. 
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) + + + +#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +// Macros for testing equalities and inequalities. +// +// * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual +// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 +// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 +// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 +// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 +// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 +// +// When they are not, Google Test prints both the tested expressions and +// their actual values. The values must be compatible built-in types, +// or you will get a compiler error. By "compatible" we mean that the +// values can be compared by the respective operator. +// +// Note: +// +// 1. It is possible to make a user-defined type work with +// {ASSERT|EXPECT}_??(), but that requires overloading the +// comparison operators and is thus discouraged by the Google C++ +// Usage Guide. Therefore, you are advised to use the +// {ASSERT|EXPECT}_TRUE() macro to assert that two objects are +// equal. +// +// 2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on +// pointers (in particular, C strings). Therefore, if you use it +// with two C strings, you are testing how their locations in memory +// are related, not how their content is related. To compare two C +// strings by content, use {ASSERT|EXPECT}_STR*(). +// +// 3. 
{ASSERT|EXPECT}_EQ(expected, actual) is preferred to +// {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you +// what the actual value is when it fails, and similarly for the +// other comparisons. +// +// 4. Do not depend on the order in which {ASSERT|EXPECT}_??() +// evaluate their arguments, which is undefined. +// +// 5. These macros evaluate their arguments exactly once. +// +// Examples: +// +// EXPECT_NE(5, Foo()); +// EXPECT_EQ(NULL, a_pointer); +// ASSERT_LT(i, array_size); +// ASSERT_GT(records.size(), 0) << "There is no record left."; + +#define EXPECT_EQ(expected, actual) \ + EXPECT_PRED_FORMAT2(::testing::internal:: \ + EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \ + expected, actual) +#define EXPECT_NE(expected, actual) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual) +#define EXPECT_LE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) +#define EXPECT_LT(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2) +#define EXPECT_GE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2) +#define EXPECT_GT(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) + +#define GTEST_ASSERT_EQ(expected, actual) \ + ASSERT_PRED_FORMAT2(::testing::internal:: \ + EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \ + expected, actual) +#define GTEST_ASSERT_NE(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) +#define GTEST_ASSERT_LE(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) +#define GTEST_ASSERT_LT(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2) +#define GTEST_ASSERT_GE(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2) +#define GTEST_ASSERT_GT(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) + +// Define macro GTEST_DONT_DEFINE_ASSERT_XY 
to 1 to omit the definition of +// ASSERT_XY(), which clashes with some users' own code. + +#if !GTEST_DONT_DEFINE_ASSERT_EQ +# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_NE +# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_LE +# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_LT +# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_GE +# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_GT +# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) +#endif + +// C-string Comparisons. All tests treat NULL and any non-NULL string +// as different. Two NULLs are equal. +// +// * {ASSERT|EXPECT}_STREQ(s1, s2): Tests that s1 == s2 +// * {ASSERT|EXPECT}_STRNE(s1, s2): Tests that s1 != s2 +// * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case +// * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case +// +// For wide or narrow string objects, you can use the +// {ASSERT|EXPECT}_??() macros. +// +// Don't depend on the order in which the arguments are evaluated, +// which is undefined. +// +// These macros evaluate their arguments exactly once. 
+ +#define EXPECT_STREQ(expected, actual) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define EXPECT_STRNE(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) +#define EXPECT_STRCASEEQ(expected, actual) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) +#define EXPECT_STRCASENE(s1, s2)\ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) + +#define ASSERT_STREQ(expected, actual) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define ASSERT_STRNE(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) +#define ASSERT_STRCASEEQ(expected, actual) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) +#define ASSERT_STRCASENE(s1, s2)\ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) + +// Macros for comparing floating-point numbers. +// +// * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual): +// Tests that two float values are almost equal. +// * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual): +// Tests that two double values are almost equal. +// * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error): +// Tests that v1 and v2 are within the given distance to each other. +// +// Google Test uses ULP-based comparison to automatically pick a default +// error bound that is appropriate for the operands. See the +// FloatingPoint template class in gtest-internal.h if you are +// interested in the implementation details. 
+ +#define EXPECT_FLOAT_EQ(expected, actual)\ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \ + expected, actual) + +#define EXPECT_DOUBLE_EQ(expected, actual)\ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \ + expected, actual) + +#define ASSERT_FLOAT_EQ(expected, actual)\ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \ + expected, actual) + +#define ASSERT_DOUBLE_EQ(expected, actual)\ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \ + expected, actual) + +#define EXPECT_NEAR(val1, val2, abs_error)\ + EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ + val1, val2, abs_error) + +#define ASSERT_NEAR(val1, val2, abs_error)\ + ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ + val1, val2, abs_error) + +// These predicate format functions work on floating-point values, and +// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g. +// +// EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0); + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2, + float val1, float val2); +GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, + double val1, double val2); + + +#if GTEST_OS_WINDOWS + +// Macros that test for HRESULT failure and success, these are only useful +// on Windows, and rely on Windows SDK macros and APIs to compile. +// +// * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr) +// +// When expr unexpectedly fails or succeeds, Google Test prints the +// expected result and the actual result with both a human-readable +// string representation of the error, if available, as well as the +// hex result code. 
+# define EXPECT_HRESULT_SUCCEEDED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) + +# define ASSERT_HRESULT_SUCCEEDED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) + +# define EXPECT_HRESULT_FAILED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) + +# define ASSERT_HRESULT_FAILED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) + +#endif // GTEST_OS_WINDOWS + +// Macros that execute statement and check that it doesn't generate new fatal +// failures in the current thread. +// +// * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement); +// +// Examples: +// +// EXPECT_NO_FATAL_FAILURE(Process()); +// ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed"; +// +#define ASSERT_NO_FATAL_FAILURE(statement) \ + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) +#define EXPECT_NO_FATAL_FAILURE(statement) \ + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) + +// Causes a trace (including the source file path, the current line +// number, and the given message) to be included in every test failure +// message generated by code in the current scope. The effect is +// undone when the control leaves the current scope. +// +// The message argument can be anything streamable to std::ostream. +// +// In the implementation, we include the current line number as part +// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s +// to appear in the same block - as long as they are on different +// lines. +#define SCOPED_TRACE(message) \ + ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ + __FILE__, __LINE__, ::testing::Message() << (message)) + +// Compile-time assertion for type equality. +// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are +// the same type. The value it returns is not interesting. 
+// +// Instead of making StaticAssertTypeEq a class template, we make it a +// function template that invokes a helper class template. This +// prevents a user from misusing StaticAssertTypeEq<T1, T2> by +// defining objects of that type. +// +// CAVEAT: +// +// When used inside a method of a class template, +// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is +// instantiated. For example, given: +// +// template <typename T> class Foo { +// public: +// void Bar() { testing::StaticAssertTypeEq<int, T>(); } +// }; +// +// the code: +// +// void Test1() { Foo<bool> foo; } +// +// will NOT generate a compiler error, as Foo<bool>::Bar() is never +// actually instantiated. Instead, you need: +// +// void Test2() { Foo<bool> foo; foo.Bar(); } +// +// to cause a compiler error. +template <typename T1, typename T2> +bool StaticAssertTypeEq() { + (void)internal::StaticAssertTypeEqHelper<T1, T2>(); + return true; +} + +// Defines a test. +// +// The first parameter is the name of the test case, and the second +// parameter is the name of the test within the test case. +// +// The convention is to end the test case name with "Test". For +// example, a test case for the Foo class can be named FooTest. +// +// The user should put his test code between braces after using this +// macro. Example: +// +// TEST(FooTest, InitializesCorrectly) { +// Foo foo; +// EXPECT_TRUE(foo.StatusIsOK()); +// } + +// Note that we call GetTestTypeId() instead of GetTypeId< +// ::testing::Test>() here to get the type ID of testing::Test. This +// is to work around a suspected linker bug when using Google Test as +// a framework on Mac OS X. The bug causes GetTypeId< +// ::testing::Test>() to return different values depending on whether +// the call is from the Google Test framework itself or from user test +// code. GetTestTypeId() is guaranteed to always return the same +// value, as it always calls GetTypeId<>() from the Google Test +// framework. 
+#define GTEST_TEST(test_case_name, test_name)\ + GTEST_TEST_(test_case_name, test_name, \ + ::testing::Test, ::testing::internal::GetTestTypeId()) + +// Define this macro to 1 to omit the definition of TEST(), which +// is a generic name and clashes with some other libraries. +#if !GTEST_DONT_DEFINE_TEST +# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name) +#endif + +// Defines a test that uses a test fixture. +// +// The first parameter is the name of the test fixture class, which +// also doubles as the test case name. The second parameter is the +// name of the test within the test case. +// +// A test fixture class must be declared earlier. The user should put +// his test code between braces after using this macro. Example: +// +// class FooTest : public testing::Test { +// protected: +// virtual void SetUp() { b_.AddElement(3); } +// +// Foo a_; +// Foo b_; +// }; +// +// TEST_F(FooTest, InitializesCorrectly) { +// EXPECT_TRUE(a_.StatusIsOK()); +// } +// +// TEST_F(FooTest, ReturnsElementCountCorrectly) { +// EXPECT_EQ(0, a_.size()); +// EXPECT_EQ(1, b_.size()); +// } + +#define TEST_F(test_fixture, test_name)\ + GTEST_TEST_(test_fixture, test_name, test_fixture, \ + ::testing::internal::GetTypeId<test_fixture>()) + +} // namespace testing + +// Use this function in main() to run all tests. It returns 0 if all +// tests are successful, or 1 otherwise. +// +// RUN_ALL_TESTS() should be invoked after the command line has been +// parsed by InitGoogleTest(). +// +// This function was formerly a macro; thus, it is in the global +// namespace and has an all-caps name. +int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_; + +inline int RUN_ALL_TESTS() { + return ::testing::UnitTest::GetInstance()->Run(); +} + +#endif // GTEST_INCLUDE_GTEST_GTEST_H_